Made dictionaries thread-safe

This commit is contained in:
deiflaender 2020-12-09 13:14:04 +01:00
parent 3f69030b03
commit 44613ee117
10 changed files with 214 additions and 83 deletions

View File

@ -27,4 +27,6 @@ public class Document {
private List<RedactionLogEntry> redactionLogEntities = new ArrayList<>();
private SectionGrid sectionGrid = new SectionGrid();
private long dictionaryVersion;
private long rulesVersion;
}

View File

@ -18,7 +18,6 @@ import com.iqser.red.service.redaction.v1.resources.RedactionResource;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.redaction.service.DroolsExecutionService;
import com.iqser.red.service.redaction.v1.server.redaction.service.EntityRedactionService;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
@ -40,7 +39,6 @@ public class RedactionController implements RedactionResource {
private final EntityRedactionService entityRedactionService;
private final PdfFlattenService pdfFlattenService;
private final DroolsExecutionService droolsExecutionService;
private final DictionaryService dictionaryService;
@Override
@ -57,11 +55,11 @@ public class RedactionController implements RedactionResource {
if (redactionRequest.isFlatRedaction()) {
PDDocument flatDocument = pdfFlattenService.flattenPDF(pdDocument);
return convert(flatDocument, classifiedDoc.getPages()
.size(), classifiedDoc.getRedactionLogEntities(), classifiedDoc.getSectionGrid());
.size(), classifiedDoc.getRedactionLogEntities(), classifiedDoc.getSectionGrid(), classifiedDoc.getDictionaryVersion(), classifiedDoc.getRulesVersion());
}
return convert(pdDocument, classifiedDoc.getPages()
.size(), classifiedDoc.getRedactionLogEntities(), classifiedDoc.getSectionGrid());
.size(), classifiedDoc.getRedactionLogEntities(), classifiedDoc.getSectionGrid(), classifiedDoc.getDictionaryVersion(), classifiedDoc.getRulesVersion());
} catch (IOException e) {
throw new RedactionException(e);
@ -142,21 +140,20 @@ public class RedactionController implements RedactionResource {
private RedactionResult convert(PDDocument document, int numberOfPages) throws IOException {
return convert(document, numberOfPages, null, null);
return convert(document, numberOfPages, null, null, 0, 0);
}
private RedactionResult convert(PDDocument document, int numberOfPages,
List<RedactionLogEntry> redactionLogEntities,
SectionGrid sectionGrid) throws IOException {
SectionGrid sectionGrid, long dictionaryVersion, long rulesVersion) throws IOException {
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
document.save(byteArrayOutputStream);
return RedactionResult.builder()
.document(byteArrayOutputStream.toByteArray())
.numberOfPages(numberOfPages)
.redactionLog(new RedactionLog(redactionLogEntities, dictionaryService.getDictionaryVersion(), droolsExecutionService
.getRulesVersion()))
.redactionLog(new RedactionLog(redactionLogEntities,dictionaryVersion, rulesVersion))
.sectionGrid(sectionGrid)
.build();
}

View File

@ -0,0 +1,88 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import lombok.Data;
import lombok.Getter;
/**
 * Holds a list of {@link DictionaryModel}s together with an index of those models by type
 * and the dictionary version they belong to.
 *
 * <p>Lombok's {@code @Data} generates the accessors that are not written out here, e.g.
 * {@code getLocalAccessMap()} and {@code setVersion(long)}.
 */
@Data
public class Dictionary {

    /** Prefix that turns a base type name into the name of its recommendation dictionary. */
    public static final String RECOMMENDATION_PREFIX = "recommendation_";

    @Getter
    private List<DictionaryModel> dictionaryModels;

    /** Index of {@link #dictionaryModels} keyed by {@link DictionaryModel#getType()}. */
    private Map<String, DictionaryModel> localAccessMap = new HashMap<>();

    @Getter
    private long version;


    /**
     * Creates a dictionary from the given models and indexes every model by its type.
     *
     * @param dictionaryModels  the models backing this dictionary; must not be {@code null}
     * @param dictionaryVersion the version to report through {@code getVersion()}
     */
    public Dictionary(List<DictionaryModel> dictionaryModels, long dictionaryVersion) {

        this.dictionaryModels = dictionaryModels;
        this.dictionaryModels.forEach(dm -> localAccessMap.put(dm.getType(), dm));
        this.version = dictionaryVersion;
    }


    /**
     * @param type the dictionary type to look up
     * @return the model's recommendation flag, or {@code false} if no model of that type exists
     */
    public boolean isRecommendation(String type) {

        DictionaryModel model = localAccessMap.get(type);
        return model != null && model.isRecommendation();
    }


    /**
     * @return {@code true} if at least one model has a non-empty set of local entries
     */
    public boolean hasLocalEntries() {

        return dictionaryModels.stream().anyMatch(dm -> !dm.getLocalEntries().isEmpty());
    }


    /**
     * @return the types of all indexed models; this is the key-set view of the internal index
     */
    public Set<String> getTypes() {

        return localAccessMap.keySet();
    }


    /**
     * Checks whether {@code value} is known for {@code type}, looking at the entries and the
     * local entries of both the base dictionary and its recommendation dictionary.
     *
     * <p>A type that already starts with {@link #RECOMMENDATION_PREFIX} is reduced to its base
     * type first. Without that step a prefixed type would be looked up as
     * {@code recommendation_recommendation_<type>} and the base dictionary would never be
     * consulted.
     *
     * @param type  a base type or a recommendation type
     * @param value the value to search for
     * @return {@code true} if any of the four entry sets contains {@code value}
     */
    public boolean containsValue(String type, String value) {

        String baseType = type != null && type.startsWith(RECOMMENDATION_PREFIX)
                ? type.substring(RECOMMENDATION_PREFIX.length())
                : type;

        return modelContains(baseType, value) || modelContains(RECOMMENDATION_PREFIX + baseType, value);
    }


    /**
     * @param type the dictionary type to look up
     * @return the model's hint flag, or {@code false} if no model of that type exists
     */
    public boolean isHint(String type) {

        DictionaryModel model = localAccessMap.get(type);
        return model != null && model.isHint();
    }


    /**
     * @param type the dictionary type to look up
     * @return the model's case-insensitive flag, or {@code false} if no model of that type exists
     */
    public boolean isCaseInsensitiveDictionary(String type) {

        DictionaryModel dictionaryModel = localAccessMap.get(type);
        return dictionaryModel != null && dictionaryModel.isCaseInsensitive();
    }


    /** Returns whether the model of exactly this type lists the value in its entries or local entries. */
    private boolean modelContains(String type, String value) {

        DictionaryModel model = localAccessMap.get(type);
        return model != null && (model.getEntries().contains(value) || model.getLocalEntries().contains(value));
    }

}

View File

@ -1,6 +1,7 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.io.Serializable;
import java.util.Set;
import lombok.AllArgsConstructor;
@ -8,7 +9,7 @@ import lombok.Data;
@Data
@AllArgsConstructor
public class DictionaryModel {
public class DictionaryModel implements Serializable {
private String type;
private int rank;

View File

@ -1,8 +1,9 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import static com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService.RECOMMENDATION_PREFIX;
import static com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary.RECOMMENDATION_PREFIX;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
@ -12,7 +13,6 @@ import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
import lombok.Builder;
@ -24,7 +24,12 @@ import lombok.extern.slf4j.Slf4j;
@Builder
public class Section {
private DictionaryService dictionaryService;
private boolean isLocal;
private Set<String> dictionaryTypes;
@Builder.Default
private Map<String, Set<String>> localDictionaryAdds = new HashMap<>();
private Set<Entity> entities;
@ -66,7 +71,7 @@ public class Section {
public void redact(String type, int ruleNumber, String reason, String legalBasis) {
boolean hasRecommendactionDictionary = dictionaryService.hasRecommendationDictionary(type);
boolean hasRecommendactionDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type);
entities.forEach(entity -> {
if (entity.getType().equals(type) || hasRecommendactionDictionary && entity.getType()
@ -82,7 +87,7 @@ public class Section {
public void redactNot(String type, int ruleNumber, String reason) {
boolean hasRecommendactionDictionary = dictionaryService.hasRecommendationDictionary(type);
boolean hasRecommendactionDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type);
entities.forEach(entity -> {
if (entity.getType().equals(type) || hasRecommendactionDictionary && entity.getType()
@ -156,8 +161,8 @@ public class Section {
// HashSet keeps the older value, but we want the new only.
entities.removeAll(found);
entities.addAll(found);
if (redactEverywhere) {
dictionaryService.addToLocalDictionary(asType, value.trim());
if (redactEverywhere && !isLocal()) {
localDictionaryAdds.computeIfAbsent(asType, (x) -> new HashSet<>()).add(value.trim());
}
}
}
@ -190,8 +195,8 @@ public class Section {
// HashSet keeps the older value, but we want the new only.
entities.removeAll(found);
entities.addAll(found);
if (redactEverywhere) {
dictionaryService.addToLocalDictionary(asType, line.trim());
if (redactEverywhere && !isLocal()) {
localDictionaryAdds.computeIfAbsent(asType, (x) -> new HashSet<>()).add(line.trim());
}
}
}
@ -300,7 +305,7 @@ public class Section {
entities = removeEntitiesContainedInLarger(entities);
if (addAsRecommendations) {
if (addAsRecommendations && !isLocal()) {
String cleanedWord = word.replaceAll(",", " ").replaceAll(" ", " ").trim() + " ";
Pattern pattern = Patterns.AUTHOR_TABLE_SPITTER;
Matcher matcher = pattern.matcher(cleanedWord);
@ -308,13 +313,11 @@ public class Section {
while (matcher.find()) {
String match = matcher.group().trim();
if (match.length() >= 3) {
if(!dictionaryService.getDictionary(type).getEntries().contains(match) && !dictionaryService.getDictionary(RECOMMENDATION_PREFIX + type).getEntries().contains(match)) {
dictionaryService.addToLocalDictionary(RECOMMENDATION_PREFIX + type, match);
}
localDictionaryAdds.computeIfAbsent(RECOMMENDATION_PREFIX + type, (x) -> new HashSet<>())
.add(match);
String lastname = match.split(" ")[0];
if(!dictionaryService.getDictionary(type).getEntries().contains(lastname) && !dictionaryService.getDictionary(RECOMMENDATION_PREFIX + type).getEntries().contains(lastname)) {
dictionaryService.addToLocalDictionary(RECOMMENDATION_PREFIX + type, lastname);
}
localDictionaryAdds.computeIfAbsent(RECOMMENDATION_PREFIX + type, (x) -> new HashSet<>())
.add(lastname);
}
}
}
@ -322,3 +325,9 @@ public class Section {
}
}

View File

@ -0,0 +1,13 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * Simple holder that keeps a {@link Section} together with a {@link SearchableText}.
 *
 * <p>Lombok generates the accessors ({@code @Data}) and a constructor taking both fields
 * in declaration order ({@code @AllArgsConstructor}).
 */
@Data
@AllArgsConstructor
public class SectionSearchableTextPair {
// The section half of the pair.
private Section section;
// The searchable text half of the pair.
private SearchableText searchableText;
}

View File

@ -11,12 +11,14 @@ import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.configuration.v1.api.model.Colors;
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import feign.FeignException;
@ -29,7 +31,6 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class DictionaryService {
public static final String RECOMMENDATION_PREFIX = "recommendation_";
private final DictionaryClient dictionaryClient;
@ -55,23 +56,6 @@ public class DictionaryService {
private Map<String, DictionaryModel> localAccessMap = new HashMap<>();
public boolean hasLocalEntries() {
return this.dictionary.stream().anyMatch(dm -> !dm.getLocalEntries().isEmpty());
}
public void addToLocalDictionary(String type, String value) {
localAccessMap.get(type).getLocalEntries().add(value);
}
public void clearLocalEntries() {
this.dictionary.forEach(dm -> dm.getLocalEntries().clear());
}
public void updateDictionary() {
@ -112,13 +96,13 @@ public class DictionaryService {
}
public void updateExternalDictionary(){
dictionary.forEach(dm -> {
public void updateExternalDictionary(Dictionary dictionary){
dictionary.getDictionaryModels().forEach(dm -> {
if(dm.isRecommendation() && !dm.getLocalEntries().isEmpty()){
dictionaryClient.addEntries(dm.getType(), new ArrayList<>(dm.getLocalEntries()), false);
long externalVersion = dictionaryClient.getVersion();
if(externalVersion == dictionaryVersion + 1){
dictionaryVersion = externalVersion;
if(externalVersion == dictionary.getVersion() + 1){
dictionary.setVersion(externalVersion);
}
}
});
@ -185,19 +169,14 @@ public class DictionaryService {
}
public boolean hasRecommendationDictionary(String type) {
public Dictionary getDeepCopyDictionary(){
List<DictionaryModel> copy = new ArrayList<>();
DictionaryModel model = localAccessMap.get(RECOMMENDATION_PREFIX + type);
if (model != null) {
return true;
}
return false;
}
dictionary.forEach(dm -> {
copy.add(SerializationUtils.clone(dm));
});
public DictionaryModel getDictionary(String type) {
return localAccessMap.get(type);
return new Dictionary(copy, dictionaryVersion);
}
}

View File

@ -21,11 +21,13 @@ import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
@ -43,21 +45,23 @@ public class EntityRedactionService {
dictionaryService.updateDictionary();
droolsExecutionService.updateRules();
dictionaryService.clearLocalEntries();
long rulesVersion = droolsExecutionService.getRulesVersion();
Set<Entity> documentEntities = new HashSet<>(findEntities(classifiedDoc, manualRedactions, false, null));
Dictionary dictionary = dictionaryService.getDeepCopyDictionary();
if (dictionaryService.hasLocalEntries()) {
Set<Entity> documentEntities = new HashSet<>(findEntities(classifiedDoc, manualRedactions, dictionary, false, null));
if (dictionary.hasLocalEntries()) {
Map<Integer, Set<Entity>> hintsPerSectionNumber = new HashMap<>();
documentEntities.stream().forEach(entity -> {
if (dictionaryService.isHint(entity.getType())) {
if (dictionary.isHint(entity.getType())) {
hintsPerSectionNumber.computeIfAbsent(entity.getSectionNumber(), (x) -> new HashSet<>())
.add(entity);
}
});
Set<Entity> foundByLocal = findEntities(classifiedDoc, manualRedactions, true, hintsPerSectionNumber);
Set<Entity> foundByLocal = findEntities(classifiedDoc, manualRedactions, dictionary, true, hintsPerSectionNumber);
// HashSet keeps the older value, but we want the new only.
documentEntities.removeAll(foundByLocal);
documentEntities.addAll(foundByLocal);
@ -81,15 +85,20 @@ public class EntityRedactionService {
}
}
dictionaryService.updateExternalDictionary();
dictionaryService.updateExternalDictionary(dictionary);
classifiedDoc.setDictionaryVersion(dictionary.getVersion());
classifiedDoc.setRulesVersion(rulesVersion);
}
private Set<Entity> findEntities(Document classifiedDoc, ManualRedactions manualRedactions, boolean localEntries,
private Set<Entity> findEntities(Document classifiedDoc, ManualRedactions manualRedactions,
Dictionary dictionary, boolean local,
Map<Integer, Set<Entity>> hintsPerSectionNumber) {
Set<Entity> documentEntities = new HashSet<>();
int sectionNumber = 1;
List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
for (Paragraph paragraph : classifiedDoc.getParagraphs()) {
SearchableText searchableText = paragraph.getSearchableText();
@ -122,10 +131,11 @@ public class EntityRedactionService {
searchableRow.addAll(textBlock.getSequences());
}
}
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, localEntries);
Set<Entity> rowEntities = findEntities(searchableRow, table.getHeadline(), sectionNumber, dictionary.getDictionaryModels(), local);
Section analysedRowSection = droolsExecutionService.executeRules(Section.builder()
.dictionaryService(dictionaryService)
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(local)
.dictionaryTypes(dictionary.getTypes())
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber) ? Stream
.concat(rowEntities.stream(), hintsPerSectionNumber.get(sectionNumber).stream())
.collect(Collectors.toSet()) : rowEntities)
@ -134,18 +144,19 @@ public class EntityRedactionService {
.headline(table.getHeadline())
.sectionNumber(sectionNumber)
.tabularData(tabularData)
.build());
.build(), searchableRow));
documentEntities.addAll(clearAndFindPositions(analysedRowSection.getEntities(), searchableRow));
sectionNumber++;
}
sectionNumber++;
}
addSectionToManualRedactions(paragraph.getTextBlocks(), manualRedactions, paragraph.getHeadline(), sectionNumber);
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber, localEntries);
Section analysedSection = droolsExecutionService.executeRules(Section.builder()
.dictionaryService(dictionaryService)
Set<Entity> entities = findEntities(searchableText, paragraph.getHeadline(), sectionNumber, dictionary.getDictionaryModels(), local);
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(local)
.dictionaryTypes(dictionary.getTypes())
.entities(hintsPerSectionNumber != null && hintsPerSectionNumber.containsKey(sectionNumber) ? Stream
.concat(entities.stream(), hintsPerSectionNumber.get(sectionNumber).stream())
.collect(Collectors.toSet()) : entities)
@ -153,22 +164,43 @@ public class EntityRedactionService {
.searchText(searchableText.toString())
.headline(paragraph.getHeadline())
.sectionNumber(sectionNumber)
.build());
.build(), searchableText));
documentEntities.addAll(clearAndFindPositions(analysedSection.getEntities(), searchableText));
sectionNumber++;
}
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
Section analysedRowSection = droolsExecutionService.executeRules(sectionSearchableTextPair.getSection());
documentEntities.addAll(clearAndFindPositions(analysedRowSection.getEntities(), sectionSearchableTextPair.getSearchableText(), dictionary));
analysedRowSection.getLocalDictionaryAdds().keySet().forEach(key -> {
if (dictionary.isRecommendation(key)){
analysedRowSection.getLocalDictionaryAdds().get(key).forEach(value -> {
if (!dictionary.containsValue(key, value)){
dictionary.getLocalAccessMap().get(key).getLocalEntries().add(value);
}
});
} else {
analysedRowSection.getLocalDictionaryAdds().get(key).forEach( value -> {
dictionary.getLocalAccessMap().get(key).getLocalEntries().add(value);
});
}
});
});
return documentEntities;
}
private Set<Entity> clearAndFindPositions(Set<Entity> entities, SearchableText text) {
private Set<Entity> clearAndFindPositions(Set<Entity> entities, SearchableText text, Dictionary dictionary) {
removeEntitiesContainedInLarger(entities);
for (Entity entity : entities) {
if(entity.getPositionSequences().isEmpty()) {
entity.setPositionSequences(text.getSequences(entity.getWord(), dictionaryService.isCaseInsensitiveDictionary(entity
if (entity.getPositionSequences().isEmpty()) {
entity.setPositionSequences(text.getSequences(entity.getWord(), dictionary.isCaseInsensitiveDictionary(entity
.getType()), entity.getTargetSequences()));
}
}
@ -177,7 +209,8 @@ public class EntityRedactionService {
}
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber, boolean local) {
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
List<DictionaryModel> dictionary, boolean local) {
Set<Entity> found = new HashSet<>();
String searchableString = searchableText.toString();
@ -186,7 +219,7 @@ public class EntityRedactionService {
}
String lowercaseInputString = searchableString.toLowerCase();
for (DictionaryModel model : dictionaryService.getDictionary()) {
for (DictionaryModel model : dictionary) {
if (model.isCaseInsensitive()) {
found.addAll(find(lowercaseInputString, model.getValues(local), model.getType(), headline, sectionNumber));
} else {
@ -231,7 +264,8 @@ public class EntityRedactionService {
for (Entity word : entities) {
for (Entity inner : entities) {
if (inner.getWord().length() < word.getWord()
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word.getSectionNumber() == inner.getSectionNumber()) {
.length() && inner.getStart() >= word.getStart() && inner.getEnd() <= word.getEnd() && word != inner && word
.getSectionNumber() == inner.getSectionNumber()) {
wordsToRemove.add(inner);
}
}

View File

@ -382,7 +382,7 @@ public class RedactionIntegrationTest {
System.out.println("redactionTest");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/Fludioxonil/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21.pdf");
RedactionRequest request = RedactionRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))

View File

@ -1,2 +1,10 @@
Long-term
Brown liquid
Brown solid
Hand-held
Manual-Hand held
Manual-Hand held
Weight:
Sprague
Weight and length
Aeration: Gentle