Pull request #126: First steps for incremental analysis

Merge in RED/redaction-service from incrementAnalysis to master

* commit '511092b9e76d58af33dfff90c1133b92e850d47c':
  First steps for incremental analysis
This commit is contained in:
Dominique Eiflaender 2021-03-09 09:12:13 +01:00
commit 9db74628a4
17 changed files with 798 additions and 63 deletions

View File

@ -0,0 +1,15 @@
package com.iqser.red.service.redaction.v1.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Result returned by the {@code /reanalyze} endpoint.
 *
 * <p>Pure data holder; accessors, builder and constructors are generated by Lombok.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class ReanalyzeResult {
// Redaction log of the document after the re-analysis has been applied.
private RedactionLog redactionLog;
}

View File

@ -0,0 +1,22 @@
package com.iqser.red.service.redaction.v1.model;
import java.time.OffsetDateTime;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Request body of the {@code /reanalyze} endpoint.
 *
 * <p>Pure data holder; accessors, builder and constructors are generated by Lombok.
 * NOTE(review): the class name looks like a typo of "ReanalyzeRequest" — renaming would change the public API.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class RenalyzeRequest {
// Raw bytes of the document; the reanalyze service loads them as a PDF.
private byte[] document;
// Rule set whose dictionary and rules are used for the re-analysis.
private String ruleSetId;
// Manual redactions to take into account; the reanalyze service treats null as "none".
private ManualRedactions manualRedactions;
// Section texts (with their areas) of the document.
private Text text;
// Existing redaction log; its dictionary version is the baseline for the increment and its entries are updated in place.
private RedactionLog redactionLog;
// NOTE(review): not read by the reanalyze service in this change — presumably the time of the last analysis; confirm.
private OffsetDateTime lastProcessed;
}

View File

@ -26,4 +26,8 @@ public class SectionArea {
private String header;
/**
 * Tells whether the given rectangle lies completely inside this area.
 *
 * <p>Both must be on the same page; touching edges count as contained.
 *
 * @param other the rectangle to test
 * @return {@code true} if {@code other} is on the same page and within this area's bounds
 */
public boolean contains(Rectangle other) {
    // Different pages can never contain each other.
    if (page != other.getPage()) {
        return false;
    }
    // Each bound is checked as a negated "within" test so that NaN coordinates
    // still yield false, exactly like a chained && expression would.
    if (!(this.topLeft.getX() <= other.getTopLeft().getX())) {
        return false;
    }
    if (!(this.topLeft.getX() + this.getWidth() >= other.getTopLeft().getX() + other.getWidth())) {
        return false;
    }
    if (!(this.getTopLeft().getY() <= other.getTopLeft().getY())) {
        return false;
    }
    return this.getTopLeft().getY() + this.getHeight() >= other.getTopLeft().getY() + other.getHeight();
}
}

View File

@ -4,8 +4,11 @@ import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.iqser.red.service.redaction.v1.model.RedactionResult;
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
import org.springframework.http.MediaType;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PostMapping;
@ -21,6 +24,9 @@ public interface RedactionResource {
@PostMapping(value = "/analyze", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
AnalyzeResult analyze(@RequestBody AnalyzeRequest analyzeRequest);
/**
 * Re-analysis endpoint, served as {@code POST /reanalyze}; consumes and produces JSON.
 *
 * @param renalyzeRequest the re-analysis request, bound from the request body
 * @return the result of the re-analysis
 */
@PostMapping(value = "/reanalyze", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest);
@PostMapping(value = "/annotate", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE)
AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest);

View File

@ -20,7 +20,7 @@
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>configuration-service-api-v1</artifactId>
<version>2.0.0</version>
<version>2.2.9</version>
</dependency>
<dependency>
<groupId>org.drools</groupId>

View File

@ -4,10 +4,12 @@ import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
import com.iqser.red.service.redaction.v1.model.RedactionLog;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.iqser.red.service.redaction.v1.model.RedactionResult;
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
import com.iqser.red.service.redaction.v1.model.SectionGrid;
import com.iqser.red.service.redaction.v1.model.Text;
import com.iqser.red.service.redaction.v1.resources.RedactionResource;
@ -18,13 +20,16 @@ import com.iqser.red.service.redaction.v1.server.redaction.service.AnnotationSer
import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService;
import com.iqser.red.service.redaction.v1.server.redaction.service.DroolsExecutionService;
import com.iqser.red.service.redaction.v1.server.redaction.service.EntityRedactionService;
import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService;
import com.iqser.red.service.redaction.v1.server.redaction.service.RedactionLogCreatorService;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.visualization.service.PdfVisualisationService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RequestBody;
@ -47,6 +52,7 @@ public class RedactionController implements RedactionResource {
private final DroolsExecutionService droolsExecutionService;
private final DictionaryService dictionaryService;
private final AnnotationService annotationService;
private final ReanalyzeService reanalyzeService;
@Override
@ -68,7 +74,7 @@ public class RedactionController implements RedactionResource {
return AnalyzeResult.builder()
.sectionGrid(classifiedDoc.getSectionGrid())
.redactionLog(new RedactionLog(classifiedDoc.getRedactionLogEntities(), classifiedDoc.getDictionaryVersion(), classifiedDoc
.getRulesVersion(), analyzeRequest.getRuleSetId()))
.getRulesVersion(), analyzeRequest.getRuleSetId()))
.numberOfPages(classifiedDoc.getPages().size())
.text(new Text(classifiedDoc.getSectionText()))
.build();
@ -80,6 +86,12 @@ public class RedactionController implements RedactionResource {
}
/**
 * Handles the re-analysis endpoint by delegating to the {@code ReanalyzeService}.
 *
 * @param renalyzeRequest the re-analysis request, bound from the request body
 * @return the result produced by the service
 */
@Override
public ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest) {
    return reanalyzeService.reanalyze(renalyzeRequest);
}
public AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest) {
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(annotateRequest.getDocument()))) {

View File

@ -0,0 +1,83 @@
package com.iqser.red.service.redaction.v1.server.parsing;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.pdfbox.text.TextPosition;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import lombok.Getter;
import lombok.Setter;
/**
 * Area based text stripper that, besides the regular region text, collects the
 * written glyphs as {@link TextPositionSequence}s.
 *
 * <p>Every chunk handed to {@link #writeString(String, List)} is cut into sequences
 * at spaces / non-breaking spaces and wherever the x coordinate jumps backwards.
 */
public class PDFAreaTextStripper extends PDFTextStripperByArea {

    @Getter
    private List<TextPositionSequence> textPositionSequences = new ArrayList<>();

    @Setter
    private int pageNumber;


    public PDFAreaTextStripper() throws IOException {
    }


    /**
     * Splits the glyphs of one written chunk into sequences and then forwards the
     * text to the parent implementation.
     *
     * @param text          the text about to be written
     * @param textPositions the glyph positions that make up {@code text}
     * @throws IOException if the parent fails to write the text
     */
    @Override
    public void writeString(String text, List<TextPosition> textPositions) throws IOException {

        int wordStart = 0;
        final int lastIndex = textPositions.size() - 1;

        for (int index = 0; index <= lastIndex; index++) {
            TextPosition current = textPositions.get(index);

            if (index == 0) {
                // A leading blank never belongs to a sequence; nothing else can split at the first glyph.
                if (isBlankGlyph(current)) {
                    wordStart++;
                }
                continue;
            }

            // Glyphs occasionally jump backwards on the x axis (seen e.g. in Metolachlor2.pdf);
            // such a jump starts a new sequence.
            if (current.getX() < textPositions.get(index - 1).getX()) {
                addSequenceUnlessBlank(textPositions.subList(wordStart, index));
                wordStart = index;
            }

            // A blank that is not the very last glyph terminates the running sequence.
            if (isBlankGlyph(current) && index <= textPositions.size() - 2) {
                addSequenceUnlessBlank(textPositions.subList(wordStart, index));
                wordStart = index + 1;
            }
        }

        // Remaining glyphs form the final sequence; a trailing blank is cut off first.
        List<TextPosition> remainder = textPositions.subList(wordStart, textPositions.size());
        if (!remainder.isEmpty() && isBlankGlyph(remainder.get(remainder.size() - 1))) {
            remainder = remainder.subList(0, remainder.size() - 1);
        }
        addSequenceUnlessBlank(remainder);

        super.writeString(text);
    }


    /**
     * Drops the collected sequences by installing a fresh list, so a list obtained
     * earlier through the getter is left untouched.
     */
    public void clearPositions() {
        textPositionSequences = new ArrayList<>();
    }


    /** Returns whether the glyph is a plain space or a non-breaking space. */
    private static boolean isBlankGlyph(TextPosition position) {
        String unicode = position.getUnicode();
        return unicode.equals(" ") || unicode.equals("\u00A0");
    }


    /** Records the glyphs as a sequence unless there are none or they are a single blank. */
    private void addSequenceUnlessBlank(List<TextPosition> glyphs) {
        if (glyphs.isEmpty()) {
            return;
        }
        if (glyphs.size() == 1 && isBlankGlyph(glyphs.get(0))) {
            return;
        }
        textPositionSequences.add(new TextPositionSequence(glyphs, pageNumber));
    }
}

View File

@ -0,0 +1,15 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.Set;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * Dictionary values that changed after a given dictionary version, together with
 * the dictionary version they were computed against.
 */
@Data
@AllArgsConstructor
public class DictionaryIncrement {
// Values of the dictionary entries whose version is newer than the requested baseline.
private Set<String> values;
// Dictionary version reported by the dictionary service when the increment was built.
private long dictionaryVersion;
}

View File

@ -3,6 +3,9 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.io.Serializable;
import java.util.Set;
import java.util.stream.Collectors;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -17,11 +20,12 @@ public class DictionaryModel implements Serializable {
private boolean caseInsensitive;
private boolean hint;
private boolean recommendation;
private Set<String> entries;
private Set<DictionaryEntry> entries;
private Set<String> localEntries;
public Set<String> getValues(boolean local){
return local ? localEntries : entries;
return local ? localEntries : entries.stream().filter(e -> !e.isDeleted()).map(e-> e.getValue()).collect(Collectors
.toSet());
}
}

View File

@ -0,0 +1,34 @@
package com.iqser.red.service.redaction.v1.server.redaction.model;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Section of a document that is rebuilt for re-analysis: its number, headline,
 * text blocks and, for table sections, the tabular data and cell start offsets.
 */
@Data
@NoArgsConstructor
public class ReanalysisSection {

    private int sectionNumber;
    private String headline;
    private List<TextBlock> textBlocks;
    private Map<String, CellValue> tabularData = new HashMap<>();
    private List<Integer> cellStarts;


    /**
     * Builds a new searchable text from the sequences of all text blocks.
     *
     * <p>A fresh instance is created on every call.
     *
     * @return searchable text containing the sequences of every non-null text block
     */
    public SearchableText getSearchableText() {

        SearchableText combined = new SearchableText();
        for (TextBlock block : textBlocks) {
            // Null elements are skipped.
            if (block != null) {
                combined.addAll(block.getSequences());
            }
        }
        return combined;
    }
}

View File

@ -1,42 +1,45 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.configuration.v1.api.model.Colors;
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryRepresentation;
import feign.FeignException;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.springframework.stereotype.Service;
import java.awt.Color;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.configuration.v1.api.model.Colors;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResult;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryRepresentation;
import feign.FeignException;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class DictionaryService {
private final DictionaryClient dictionaryClient;
private Map<String, DictionaryRepresentation> dictionariesByRuleSets = new HashMap<>();
public void updateDictionary(String ruleSetId) {
public long updateDictionary(String ruleSetId) {
long version = dictionaryClient.getVersion(ruleSetId);
@ -45,6 +48,26 @@ public class DictionaryService {
if (foundDictionary == null || version > foundDictionary.getDictionaryVersion()) {
updateDictionaryEntry(ruleSetId, version);
}
return version;
}
/**
 * Collects the values of all dictionary entries that are newer than the given version.
 *
 * <p>The cached dictionary of the rule set is refreshed first. Entries are selected
 * by version only; the deleted flag is not evaluated here.
 *
 * @param ruleSetId   rule set whose dictionary is inspected
 * @param fromVersion baseline version; only entries with a greater version are returned
 * @return the changed values together with the current dictionary version
 */
public DictionaryIncrement getDictionaryIncrements(String ruleSetId, long fromVersion) {

    long currentVersion = updateDictionary(ruleSetId);

    Set<String> changedValues = new HashSet<>();
    for (DictionaryModel model : dictionariesByRuleSets.get(ruleSetId).getDictionary()) {
        for (DictionaryEntry candidate : model.getEntries()) {
            if (candidate.getVersion() > fromVersion) {
                changedValues.add(candidate.getValue());
            }
        }
    }

    return new DictionaryIncrement(changedValues, currentVersion);
}
@ -63,7 +86,6 @@ public class DictionaryService {
.sorted(Comparator.comparingInt(DictionaryModel::getRank).reversed())
.collect(Collectors.toList());
dictionary.forEach(dm -> dictionaryRepresentation.getLocalAccessMap().put(dm.getType(), dm));
Colors colors = dictionaryClient.getColors(ruleSetId);
@ -86,6 +108,7 @@ public class DictionaryService {
public void updateExternalDictionary(Dictionary dictionary, String ruleSetId) {
dictionary.getDictionaryModels().forEach(dm -> {
if (dm.isRecommendation() && !dm.getLocalEntries().isEmpty()) {
dictionaryClient.addEntries(dm.getType(), ruleSetId, new ArrayList<>(dm.getLocalEntries()), false);
@ -98,17 +121,15 @@ public class DictionaryService {
}
// Loads the entries of the given dictionary type from the dictionary client.
// NOTE(review): this span is a rendered diff hunk — the removed (Set<String>) and the
// added (Set<DictionaryEntry>) version of the method are interleaved below.
private Set<String> convertEntries(TypeResult t) {
private Set<DictionaryEntry> convertEntries(TypeResult t) {
Set<DictionaryEntry> entries = new HashSet<>(dictionaryClient.getDictionaryForType(t.getType(), t.getRuleSetId())
.getEntries());
if (t.isCaseInsensitive()) {
return dictionaryClient.getDictionaryForType(t.getType(), t.getRuleSetId())
.getEntries()
.stream()
.map(String::toLowerCase)
.collect(Collectors.toSet());
} else {
return new HashSet<>(dictionaryClient.getDictionaryForType(t.getType(), t.getRuleSetId()).getEntries());
// NOTE(review): String#toLowerCase returns a new String and the lambda discards it, so this
// statement leaves every entry unchanged — values of case-insensitive types are no longer
// lower-cased as the removed stream version did. The value has to be written back (or the
// entries rebuilt) for this to have an effect.
entries.forEach(entry -> entry.getValue().toLowerCase(Locale.ROOT));
}
return entries;
}
@ -148,6 +169,7 @@ public class DictionaryService {
return false;
}
public boolean isRecommendation(String type, String ruleSetId) {
DictionaryModel model = dictionariesByRuleSets.get(ruleSetId).getLocalAccessMap().get(type);
@ -159,6 +181,7 @@ public class DictionaryService {
public Dictionary getDeepCopyDictionary(String ruleSetId) {
List<DictionaryModel> copy = new ArrayList<>();
var representation = dictionariesByRuleSets.get(ruleSetId);
@ -170,15 +193,22 @@ public class DictionaryService {
return new Dictionary(copy, representation.getDictionaryVersion());
}
// Returns a color of the cached dictionary representation of the given rule set.
// NOTE(review): despite its name this method returns getRequestAddColor(), i.e. the same value as
// getRequestAddColor(String) — looks like a copy/paste slip; confirm whether the representation
// exposes a dedicated "request remove" color that should be returned instead.
public float[] getRequestRemoveColor(String ruleSetId) {
return dictionariesByRuleSets.get(ruleSetId).getRequestAddColor();
}
/**
 * Looks up the "not redacted" color of the cached dictionary representation.
 *
 * @param ruleSetId rule set whose representation is queried
 * @return the color components stored for the rule set
 */
public float[] getNotRedactedColor(String ruleSetId) {
    DictionaryRepresentation cached = dictionariesByRuleSets.get(ruleSetId);
    return cached.getNotRedactedColor();
}
/**
 * Looks up the "request add" color of the cached dictionary representation.
 *
 * @param ruleSetId rule set whose representation is queried
 * @return the color components stored for the rule set
 */
public float[] getRequestAddColor(String ruleSetId) {
    DictionaryRepresentation cached = dictionariesByRuleSets.get(ruleSetId);
    return cached.getRequestAddColor();
}
}

View File

@ -349,7 +349,7 @@ public class EntityRedactionService {
}
private Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
public Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
Dictionary dictionary, boolean local) {
Set<Entity> found = new HashSet<>();

View File

@ -0,0 +1,281 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import java.awt.geom.Rectangle2D;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.kie.api.runtime.KieContainer;
import org.springframework.stereotype.Service;
import org.springframework.web.bind.annotation.RequestBody;
import com.iqser.red.service.redaction.v1.model.Comment;
import com.iqser.red.service.redaction.v1.model.IdRemoval;
import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.model.SectionText;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement;
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.ReanalysisSection;
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import lombok.RequiredArgsConstructor;
/**
 * Re-runs the redaction analysis only for those sections of a document that are
 * affected by dictionary changes or manual redactions, and merges the outcome
 * into the redaction log supplied with the request.
 */
@Service
@RequiredArgsConstructor
public class ReanalyzeService {

    /** Name under which each section area is registered with the text stripper. */
    private static final String REGION_NAME = "1";

    private final DictionaryService dictionaryService;
    private final DroolsExecutionService droolsExecutionService;
    private final SurroundingWordsService surroundingWordsService;
    private final EntityRedactionService entityRedactionService;
    private final RedactionLogCreatorService redactionLogCreatorService;


    /**
     * Re-analyzes the sections that need it and updates the request's redaction log in place.
     *
     * <p>A section is re-analyzed when it holds a manual log entry, a log entry referenced by a
     * manual removal / force-redact, or text matching a dictionary value newer than the log's
     * dictionary version. If no section qualifies and there are no manual additions, only the
     * dictionary version of the log is advanced and the document is not loaded at all.
     *
     * @param renalyzeRequest document, section texts, previous redaction log and manual redactions
     * @return result wrapping the updated redaction log (the same instance as in the request)
     * @throws RedactionException if loading the document or the re-analysis fails
     */
    public ReanalyzeResult reanalyze(RenalyzeRequest renalyzeRequest) {

        DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(),
                renalyzeRequest.getRedactionLog().getDictionaryVersion());

        ManualRedactions manualRedactions = renalyzeRequest.getManualRedactions();
        Set<String> manualForceAndRemoveIds = getForceAndRemoveIds(manualRedactions);
        Map<String, List<Comment>> comments = null;
        Set<ManualRedactionEntry> manualAdds = null;
        if (manualRedactions != null) {
            // TODO comments will be removed from redactionLog, so we ignore this first.
            comments = manualRedactions.getComments();
            manualAdds = manualRedactions.getEntriesToAdd();
        }

        Set<Integer> sectionsToReanalyse = findSectionsToReanalyse(renalyzeRequest, dictionaryIncrement, manualForceAndRemoveIds);
        assignSectionsToManualAdds(renalyzeRequest, manualAdds);

        if (sectionsToReanalyse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) {
            // Nothing changed for this document: only record the dictionary version it is now valid for.
            renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
            return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build();
        }

        try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(renalyzeRequest.getDocument()))) {

            List<ReanalysisSection> reanalysisSections = new ArrayList<>();
            for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) {
                if (sectionsToReanalyse.contains(sectionText.getSectionNumber())) {
                    reanalysisSections.add(buildReanalysisSection(pdDocument, sectionText));
                }
            }

            KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId());
            Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId());

            // First pass: dictionary search for every section.
            List<SectionSearchableTextPair> sectionSearchableTextPairs = new ArrayList<>();
            for (ReanalysisSection reanalysisSection : reanalysisSections) {

                Set<Entity> found = entityRedactionService.findEntities(reanalysisSection.getSearchableText(),
                        reanalysisSection.getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false);

                if (reanalysisSection.getCellStarts() != null) {
                    surroundingWordsService.addSurroundingText(found, reanalysisSection.getSearchableText(), dictionary,
                            reanalysisSection.getCellStarts());
                } else {
                    surroundingWordsService.addSurroundingText(found, reanalysisSection.getSearchableText(), dictionary);
                }

                sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
                        .isLocal(false)
                        .dictionaryTypes(dictionary.getTypes())
                        .entities(found)
                        .text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks())
                        .searchText(reanalysisSection.getSearchableText().toString())
                        .headline(reanalysisSection.getHeadline())
                        .sectionNumber(reanalysisSection.getSectionNumber())
                        .tabularData(reanalysisSection.getTabularData())
                        .searchableText(reanalysisSection.getSearchableText())
                        .dictionary(dictionary)
                        .build(), reanalysisSection.getSearchableText()));
            }

            // Second pass: rule execution, only after all sections have been searched.
            Set<Entity> entities = new HashSet<>();
            for (SectionSearchableTextPair sectionSearchableTextPair : sectionSearchableTextPairs) {
                Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection());
                entities.addAll(analysedRowSection.getEntities());
            }

            Map<Integer, List<Entity>> entitiesPerPage = splitEntitiesByPage(entities);

            List<RedactionLogEntry> newRedactionLogEntries = new ArrayList<>();
            for (int page = 1; page <= pdDocument.getNumberOfPages(); page++) {
                if (entitiesPerPage.get(page) != null) {
                    newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, manualRedactions, page,
                            renalyzeRequest.getRuleSetId()));
                }
                newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page,
                        renalyzeRequest.getRuleSetId()));
            }

            // Entries of re-analyzed sections and entries with section number 0 are replaced by the fresh ones.
            renalyzeRequest.getRedactionLog()
                    .getRedactionLogEntry()
                    .removeIf(entry -> sectionsToReanalyse.contains(entry.getSectionNumber()) || entry.getSectionNumber() == 0);

            renalyzeRequest.getRedactionLog().getRedactionLogEntry().addAll(newRedactionLogEntries);
            renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());

            return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build();

        } catch (Exception e) {
            throw new RedactionException(e);
        }
    }


    /** Determines the numbers of all sections touched by manual changes or by new dictionary values. */
    private Set<Integer> findSectionsToReanalyse(RenalyzeRequest renalyzeRequest, DictionaryIncrement dictionaryIncrement,
                                                 Set<String> manualForceAndRemoveIds) {

        Set<Integer> sections = new HashSet<>();

        for (RedactionLogEntry entry : renalyzeRequest.getRedactionLog().getRedactionLogEntry()) {
            if (entry.isManual() || manualForceAndRemoveIds.contains(entry.getId())) {
                sections.add(entry.getSectionNumber());
            }
        }

        for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) {
            Set<Entity> matches = EntitySearchUtils.find(sectionText.getText(), dictionaryIncrement.getValues(), "find",
                    sectionText.getHeadline(), sectionText.getSectionNumber(), false);
            if (!matches.isEmpty()) {
                sections.add(sectionText.getSectionNumber());
            }
        }

        return sections;
    }


    /** Stamps every manual addition with the headline and number of the section area that contains one of its positions. */
    private void assignSectionsToManualAdds(RenalyzeRequest renalyzeRequest, Set<ManualRedactionEntry> manualAdds) {

        if (manualAdds == null) {
            return;
        }

        for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) {
            for (SectionArea sectionArea : sectionText.getSectionAreas()) {
                for (ManualRedactionEntry manualAdd : manualAdds) {
                    for (Rectangle manualPosition : manualAdd.getPositions()) {
                        if (sectionArea.contains(manualPosition)) {
                            manualAdd.setSection(sectionText.getHeadline());
                            manualAdd.setSectionNumber(sectionText.getSectionNumber());
                        }
                    }
                }
            }
        }
    }


    /** Re-extracts the text of one section from the PDF, area by area, in ascending page order. */
    private ReanalysisSection buildReanalysisSection(PDDocument pdDocument, SectionText sectionText) throws IOException {

        ReanalysisSection reanalysisSection = new ReanalysisSection();
        reanalysisSection.setHeadline(sectionText.getHeadline());
        reanalysisSection.setSectionNumber(sectionText.getSectionNumber());

        // TreeMap on purpose: the text blocks must be assembled page by page in ascending
        // order, and the iteration order of a HashMap is unspecified.
        Map<Integer, List<SectionArea>> sectionAreasPerPage = new TreeMap<>();
        for (SectionArea sectionArea : sectionText.getSectionAreas()) {
            sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>()).add(sectionArea);
        }

        List<TextBlock> textBlocks = new ArrayList<>();
        Map<String, CellValue> tabularData = new HashMap<>();
        List<Integer> cellStarts = new ArrayList<>();

        for (Map.Entry<Integer, List<SectionArea>> areasOnPage : sectionAreasPerPage.entrySet()) {

            Integer page = areasOnPage.getKey();
            PDPage pdPage = pdDocument.getPage(page - 1);

            PDFAreaTextStripper textStripper = new PDFAreaTextStripper();
            textStripper.setPageNumber(page);

            // NOTE(review): the cell offset restarts at 0 on every page — confirm this is intended
            // for table sections that span several pages.
            int cellStart = 0;
            for (SectionArea sectionArea : areasOnPage.getValue()) {

                // The same region name is reused, so each area replaces the previous one.
                textStripper.addRegion(REGION_NAME, toRegion(pdPage, sectionArea));
                textStripper.extractRegions(pdPage);
                List<TextPositionSequence> positions = textStripper.getTextPositionSequences();

                TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(),
                        sectionArea.getTopLeft().getX() + sectionArea.getWidth(),
                        sectionArea.getTopLeft().getY(),
                        sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0);

                if (sectionText.isTable()) {
                    Cell cell = new Cell();
                    cell.addTextBlock(textBlock);
                    tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart));
                    cellStarts.add(cellStart);
                    cellStart = cellStart + cell.toString().trim().length() + 1;
                }

                textBlocks.add(textBlock);
                // Installs a fresh list in the stripper; the text block keeps the one it was given.
                textStripper.clearPositions();
            }
        }

        reanalysisSection.setTextBlocks(textBlocks);
        reanalysisSection.setTabularData(tabularData);
        if (sectionText.isTable()) {
            reanalysisSection.setCellStarts(cellStarts);
        }
        return reanalysisSection;
    }


    /** Converts a section area into the extraction rectangle expected by the text stripper. */
    private Rectangle2D toRegion(PDPage pdPage, SectionArea sectionArea) {

        if (pdPage.getRotation() == 90) {
            // Rotated page: x and y (and width and height) swap roles.
            return new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft().getX(),
                    sectionArea.getHeight(), sectionArea.getWidth() + 0.001f);
        }

        PDRectangle cropBox = pdPage.getCropBox();
        return new Rectangle2D.Float(sectionArea.getTopLeft().getX(),
                -sectionArea.getTopLeft().getY() + cropBox.getUpperRightY() - sectionArea.getHeight(),
                sectionArea.getWidth(), sectionArea.getHeight() + 0.001f);
    }


    /** Splits every entity into one copy per page, each carrying only the position sequences of that page. */
    private Map<Integer, List<Entity>> splitEntitiesByPage(Set<Entity> entities) {

        Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
        for (Entity entity : entities) {

            Map<Integer, List<EntityPositionSequence>> sequenceOnPage = new HashMap<>();
            for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) {
                sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>())
                        .add(entityPositionSequence);
            }

            for (Map.Entry<Integer, List<EntityPositionSequence>> entry : sequenceOnPage.entrySet()) {
                entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
                        .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(),
                                entry.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(),
                                entity.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(),
                                entity.getStart(), entity.getEnd()));
            }
        }
        return entitiesPerPage;
    }


    /** Returns the ids referenced by manual removals and force-redacts; empty if there are no manual redactions. */
    private Set<String> getForceAndRemoveIds(ManualRedactions manualRedactions) {

        if (manualRedactions == null) {
            return new HashSet<>();
        }

        return Stream.concat(manualRedactions.getIdsToRemove().stream().map(IdRemoval::getId),
                manualRedactions.getForceRedacts().stream().map(ManualForceRedact::getId)).collect(Collectors.toSet());
    }
}

View File

@ -4,6 +4,7 @@ import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
@ -55,11 +56,11 @@ public class RedactionLogCreatorService {
addSectionGrid(classifiedDoc, page);
if (classifiedDoc.getEntities().get(page) != null) {
addEntries(classifiedDoc, manualRedactions, page, ruleSetId);
classifiedDoc.getRedactionLogEntities().addAll(addEntries(classifiedDoc.getEntities(), manualRedactions, page, ruleSetId));
}
if (manualRedactionPages.contains(page)) {
addManualEntries(classifiedDoc, manualRedactions, page, ruleSetId);
classifiedDoc.getRedactionLogEntities().addAll(addManualAddEntries(manualRedactions.getEntriesToAdd(), manualRedactions.getComments(), page, ruleSetId));
}
if (!classifiedDoc.getPages().get(page - 1).getImageBounds().isEmpty()) {
@ -106,13 +107,15 @@ public class RedactionLogCreatorService {
}
private void addEntries(Document classifiedDoc, ManualRedactions manualRedactions, int page, String ruleSetId) {
public List<RedactionLogEntry> addEntries(Map<Integer, List<Entity>> entities, ManualRedactions manualRedactions, int page, String ruleSetId) {
List<RedactionLogEntry> redactionLogEntities = new ArrayList<>();
// Duplicates can exist due table extraction colums over multiple rows.
Set<String> processedIds = new HashSet<>();
entityLoop:
for (Entity entity : classifiedDoc.getEntities().get(page)) {
for (Entity entity : entities.get(page)) {
List<Comment> comments = null;
@ -201,10 +204,12 @@ public class RedactionLogCreatorService {
// FIXME ids should never be null. Figure out why this happens.
if (redactionLogEntry.getId() != null) {
classifiedDoc.getRedactionLogEntities().add(redactionLogEntry);
redactionLogEntities.add(redactionLogEntry);
}
}
}
return redactionLogEntities;
}
@ -233,14 +238,16 @@ public class RedactionLogCreatorService {
}
private void addManualEntries(Document classifiedDoc, ManualRedactions manualRedactions, int page,
public List<RedactionLogEntry> addManualAddEntries(Set<ManualRedactionEntry> manualAdds, Map<String, List<Comment>> comments, int page,
String ruleSetId) {
if (manualRedactions == null) {
return;
List<RedactionLogEntry> redactionLogEntities = new ArrayList<>();
if (manualAdds == null) {
return redactionLogEntities;
}
for (ManualRedactionEntry manualRedactionEntry : manualRedactions.getEntriesToAdd()) {
for (ManualRedactionEntry manualRedactionEntry : manualAdds) {
String id = manualRedactionEntry.getId();
@ -254,11 +261,13 @@ public class RedactionLogCreatorService {
}
}
redactionLogEntry.setComments(manualRedactions.getComments().get(id));
redactionLogEntry.setComments(comments.get(id));
if (!rectanglesOnPage.isEmpty() && !approvedAndShouldBeInDictionary(manualRedactionEntry)) {
classifiedDoc.getRedactionLogEntities().add(redactionLogEntry);
redactionLogEntities.add(redactionLogEntry);
}
}
return redactionLogEntities;
}

View File

@ -11,10 +11,12 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizati
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
@SuppressWarnings("serial")
@Data
@EqualsAndHashCode(callSuper = true)
@NoArgsConstructor
public class Cell extends Rectangle {
private List<TextBlock> textBlocks = new ArrayList<>();

View File

@ -4,8 +4,12 @@ import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.when;
import static org.springframework.boot.test.context.SpringBootTest.WebEnvironment.RANDOM_PORT;
import java.awt.Color;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
@ -17,6 +21,7 @@ import java.nio.charset.StandardCharsets;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -24,7 +29,15 @@ import java.util.UUID;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.pdfbox.util.Matrix;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.kie.api.KieServices;
@ -32,6 +45,7 @@ import org.kie.api.builder.KieBuilder;
import org.kie.api.builder.KieFileSystem;
import org.kie.api.builder.KieModule;
import org.kie.api.runtime.KieContainer;
import org.mockito.MockitoAnnotations;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.context.TestConfiguration;
@ -42,6 +56,7 @@ import org.springframework.test.context.junit4.SpringRunner;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.configuration.v1.api.model.Colors;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse;
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
@ -56,17 +71,28 @@ import com.iqser.red.service.redaction.v1.model.ManualForceRedact;
import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.iqser.red.service.redaction.v1.model.RedactionResult;
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
import com.iqser.red.service.redaction.v1.model.SectionArea;
import com.iqser.red.service.redaction.v1.model.SectionText;
import com.iqser.red.service.redaction.v1.model.Status;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
import com.iqser.red.service.redaction.v1.server.redaction.model.ReanalysisSection;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
@RunWith(SpringRunner.class)
@SpringBootTest(webEnvironment = RANDOM_PORT)
@ -112,6 +138,7 @@ public class RedactionIntegrationTest {
private final Map<String, Boolean> recommendationTypeMap = new HashMap<>();
private final Map<String, Integer> rankTypeMap = new HashMap<>();
private final Colors colors = new Colors();
private final Map<String, Long> reanlysisVersions = new HashMap<>();
private final static String TEST_RULESET_ID = "123";
@ -376,7 +403,7 @@ public class RedactionIntegrationTest {
return DictionaryResponse.builder()
.hexColor(typeColorMap.get(type))
.entries(dictionary.get(type))
.entries(toDictionaryEntry(dictionary.get(type)))
.isHint(hintTypeMap.get(type))
.isCaseInsensitive(caseInSensitiveMap.get(type))
.isRecommendation(recommendationTypeMap.get(type))
@ -385,6 +412,15 @@ public class RedactionIntegrationTest {
}
/**
 * Converts plain dictionary values into {@link DictionaryEntry} objects.
 * Each value is paired with the version stored for it in {@code reanlysisVersions},
 * or with {@code 0L} when the map holds no key for that value; the third
 * constructor argument is always {@code false}.
 *
 * @param entries the raw dictionary values to wrap
 * @return a new list holding one {@link DictionaryEntry} per value, in iteration order
 */
private List<DictionaryEntry> toDictionaryEntry(List<String> entries) {
    List<DictionaryEntry> converted = new ArrayList<>();
    for (String value : entries) {
        // Values without a registered version fall back to version 0.
        long version = 0L;
        if (reanlysisVersions.containsKey(value)) {
            version = reanlysisVersions.get(value);
        }
        converted.add(new DictionaryEntry(value, version, false));
    }
    return converted;
}
@Test
public void noExceptionShouldBeThrownForAnyFiles() throws IOException {
@ -414,6 +450,22 @@ public class RedactionIntegrationTest {
assertThat(entry.getValue().size()).isEqualTo(1);
});
dictionary.get(AUTHOR).add("Drinking water");
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(1L);
long rstart = System.currentTimeMillis();
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
.redactionLog(result.getRedactionLog())
.document(IOUtils.toByteArray(new FileInputStream(path)))
.manualRedactions(null)
.text(result.getText())
.ruleSetId(TEST_RULESET_ID)
.build());
long rend = System.currentTimeMillis();
System.out.println("reanalysis analysis duration: " + (rend - rstart));
}
long end = System.currentTimeMillis();
@ -455,6 +507,86 @@ public class RedactionIntegrationTest {
AnalyzeResult result = redactionController.analyze(request);
long end = System.currentTimeMillis();
System.out.println("first analysis duration: " + (end - start));
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Test.json")) {
fileOutputStream.write(objectMapper.writeValueAsBytes(result.getText()));
}
int correctFound = 0;
loop:
for (RedactionLogEntry redactionLogEntry : result.getRedactionLog().getRedactionLogEntry()) {
for (SectionText sectionText : result.getText().getSectionTexts()) {
if (redactionLogEntry.getType().equals("image")) {
correctFound++;
continue loop;
}
if (redactionLogEntry.getSectionNumber() == sectionText.getSectionNumber()) {
String value = sectionText.getText()
.substring(redactionLogEntry.getStartOffset(), redactionLogEntry.getEndOffset());
if (redactionLogEntry.getValue().equalsIgnoreCase(value)) {
correctFound++;
} else {
throw new RuntimeException("WTF");
}
}
}
}
assertThat(correctFound).isEqualTo(result.getRedactionLog().getRedactionLogEntry().size());
dictionary.get(AUTHOR).add("properties");
reanlysisVersions.put("properties", 1L);
dictionary.get(AUTHOR).add("physical");
reanlysisVersions.put("physical", 2L);
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(2L);
when(dictionaryClient.getDictionaryForType(AUTHOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(AUTHOR));
start = System.currentTimeMillis();
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
.redactionLog(result.getRedactionLog())
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.manualRedactions(null)
.text(result.getText())
.ruleSetId(TEST_RULESET_ID)
.build());
end = System.currentTimeMillis();
System.out.println("reanalysis analysis duration: " + (end - start));
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.redactionLog(reanalyzeResult.getRedactionLog())
.sectionGrid(result.getSectionGrid())
.build());
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) {
fileOutputStream.write(annotateResponse.getDocument());
}
}
@Test
@Ignore
public void fillRecanTest() throws IOException {
System.out.println("redactionTest");
long start = System.currentTimeMillis();
ClassPathResource pdfFileResource = new ClassPathResource("files/S5.pdf");
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.build();
AnalyzeResult result = redactionController.analyze(request);
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.redactionLog(result.getRedactionLog())
@ -496,9 +628,70 @@ public class RedactionIntegrationTest {
System.out.println("duration: " + (end - start));
System.out.println("numberOfPages: " + result.getNumberOfPages());
SectionArea sectionArea = result.getText().getSectionTexts().get(3).getSectionAreas().get(5);
try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(IOUtils.toByteArray(pdfFileResource.getInputStream())))) {
PDPage docPage = pdDocument.getPage(0);
PDFTextStripperByArea textStripper = new PDFTextStripperByArea();
PDRectangle cropBox = docPage.getCropBox();
PDRectangle mediaBox = docPage.getMediaBox();
// if (textPositions.get(0).getRotation() == 90) {
// posXEnd = textPositions.get(0).getYDirAdj() + 2;
// posYInit = getY1();
// posYEnd = textPositions.get(textPositions.size() - 1).getXDirAdj() - height + 4;
// } else {
// posXEnd = textPositions.get(textPositions.size() - 1)
// .getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidth() + 1;
// posYInit = textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() - 2;
// posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1)
// .getYDirAdj() + 2;
// }
Rectangle2D rect = new Rectangle2D.Float(sectionArea.getTopLeft()
.getY(), sectionArea.getTopLeft()
.getX() , sectionArea.getHeight(), sectionArea
.getWidth() + 0.001f);
textStripper.addRegion("region", rect);
textStripper.extractRegions(docPage);
String textForRegion = textStripper.getTextForRegion("region");
System.out.println(textForRegion);
// fill a rectangle
PDPageContentStream contents = new PDPageContentStream (pdDocument, docPage, PDPageContentStream.AppendMode.APPEND, false, false);
contents.setNonStrokingColor (Color.RED);
contents.addRect (sectionArea.getTopLeft().getX(), sectionArea.getTopLeft().getY(), sectionArea.getWidth(), sectionArea.getHeight());
contents.fill ();
contents.close ();
try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
pdDocument.save(byteArrayOutputStream);
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated2.pdf")) {
fileOutputStream.write(byteArrayOutputStream.toByteArray());
}
}
} catch (Exception e) {
throw new RedactionException(e);
}
}
@Test
public void testTableRedaction() throws IOException {
@ -569,7 +762,7 @@ public class RedactionIntegrationTest {
manualRedactionEntry.setReason("Manual Redaction");
manualRedactionEntry.setPositions(List.of(new Rectangle(new Point(375.61096f, 241.282f), 7.648041f, 43.72262f, 1), new Rectangle(new Point(384.83517f, 241.282f), 7.648041f, 17.043358f, 1)));
manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
// manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
AnalyzeRequest request = AnalyzeRequest.builder()
.ruleSetId(TEST_RULESET_ID)
@ -579,9 +772,25 @@ public class RedactionIntegrationTest {
AnalyzeResult result = redactionController.analyze(request);
manualRedactions.getEntriesToAdd().add(manualRedactionEntry);
manualRedactions.setIdsToRemove(Set.of(IdRemoval.builder()
.id("5b940b2cb401ed9f5be6fc24f6e77bcf")
.status(Status.APPROVED)
.build()));
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
.redactionLog(result.getRedactionLog())
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.manualRedactions(manualRedactions)
.text(result.getText())
.ruleSetId(TEST_RULESET_ID)
.build());
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
.redactionLog(result.getRedactionLog())
.redactionLog(reanalyzeResult.getRedactionLog())
.sectionGrid(result.getSectionGrid())
.build());

View File

@ -1,6 +1,7 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.configuration.v1.api.model.Colors;
import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse;
import com.iqser.red.service.configuration.v1.api.model.RulesResponse;
import com.iqser.red.service.configuration.v1.api.model.TypeResponse;
@ -129,12 +130,12 @@ public class EntityRedactionServiceTest {
.build();
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(Arrays.asList("Casey, H.W.", "OLoughlin, C.K.", "Salamon, C.M.", "Smith, S.H."))
.entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "OLoughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")))
.build();
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA"))
.entries(toDictionaryEntry(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA")))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
@ -162,12 +163,12 @@ public class EntityRedactionServiceTest {
.build();
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(Arrays.asList("Casey, H.W.", "OLoughlin, C.K.", "Salamon, C.M.", "Smith, S.H."))
.entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "OLoughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")))
.build();
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA"))
.entries(toDictionaryEntry(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA")))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
@ -191,11 +192,11 @@ public class EntityRedactionServiceTest {
" Supplement - Identity of the active substance - Reference list.pdf");
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt")))
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt")))
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
@ -228,15 +229,15 @@ public class EntityRedactionServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Row With Ambiguous Redaction.pdf");
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt")))
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt")))
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
.entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt")))
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
@ -297,11 +298,11 @@ public class EntityRedactionServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Applicant Producer Table.pdf");
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt")))
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt")))
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
DictionaryResponse sponsorResponse = DictionaryResponse.builder()
@ -346,7 +347,7 @@ public class EntityRedactionServiceTest {
.build();
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(authorResponse);
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt")))
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))))
.build();
when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
@ -367,13 +368,13 @@ public class EntityRedactionServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation.pdf");
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(Arrays.asList("Bissig R.", "Thanei P."))
.entries(toDictionaryEntry(Arrays.asList("Bissig R.", "Thanei P.")))
.build();
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland"))
.entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
@ -392,13 +393,13 @@ public class EntityRedactionServiceTest {
pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation2.pdf");
dictionaryResponse = DictionaryResponse.builder()
.entries(Arrays.asList("Tribolet, R.", "Muir, G.", "Kühne-Thu, H.", "Close, C."))
.entries(toDictionaryEntry(Arrays.asList("Tribolet, R.", "Muir, G.", "Kühne-Thu, H.", "Close, C.")))
.build();
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
addressResponse = DictionaryResponse.builder()
.entries(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland"))
.entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
@ -419,13 +420,13 @@ public class EntityRedactionServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Empty Tabular Data.pdf");
DictionaryResponse dictionaryResponse = DictionaryResponse.builder()
.entries(Collections.singletonList("Aldershof S."))
.entries(toDictionaryEntry(Collections.singletonList("Aldershof S.")))
.build();
when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet());
when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse);
DictionaryResponse addressResponse = DictionaryResponse.builder()
.entries(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland"))
.entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")))
.build();
when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse);
@ -517,4 +518,12 @@ public class EntityRedactionServiceTest {
}
}
/**
 * Converts plain dictionary values into {@link DictionaryEntry} objects,
 * constructing each one with the fixed arguments {@code 1L} and {@code false}.
 *
 * @param entries the raw dictionary values to wrap
 * @return a new list holding one {@link DictionaryEntry} per value, in iteration order
 */
private List<DictionaryEntry> toDictionaryEntry(List<String> entries) {
    List<DictionaryEntry> converted = new ArrayList<>();
    for (String value : entries) {
        converted.add(new DictionaryEntry(value, 1L, false));
    }
    return converted;
}
}