Pull request #135: RED-1260: Enabled adding rules and manual redaction actions for images
Merge in RED/redaction-service from RED-1260 to master * commit '55ba351362785de41090fb4252c9ee7c4c486991': RED-1260: Enabled adding rules and manual redaction actions for images
This commit is contained in:
commit
a7aa3a723a
@ -4,11 +4,13 @@ import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionGrid;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -24,8 +26,8 @@ public class Document {
|
||||
private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
|
||||
private Map<Integer, List<Entity>> entities = new HashMap<>();
|
||||
private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter();
|
||||
private FloatFrequencyCounter fontSizeCounter= new FloatFrequencyCounter();
|
||||
private StringFrequencyCounter fontCounter= new StringFrequencyCounter();
|
||||
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
|
||||
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
|
||||
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
|
||||
private boolean headlines;
|
||||
|
||||
@ -35,4 +37,7 @@ public class Document {
|
||||
private long rulesVersion;
|
||||
|
||||
private List<SectionText> sectionText = new ArrayList<>();
|
||||
|
||||
private Map<Integer, Set<Image>> images = new HashMap<>();
|
||||
|
||||
}
|
||||
|
||||
@ -3,6 +3,7 @@ package com.iqser.red.service.redaction.v1.server.classification.model;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
@ -12,9 +13,10 @@ import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class Paragraph {
|
||||
public class Paragraph implements Comparable{
|
||||
|
||||
private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
|
||||
private List<PdfImage> images = new ArrayList<>();
|
||||
private String headline;
|
||||
|
||||
|
||||
@ -53,4 +55,11 @@ public class Paragraph {
|
||||
return textBlocks;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Compares this paragraph with another object for ordering.
 * <p>
 * The current implementation ignores {@code o} and always returns {@code 0}, so every paragraph
 * is reported as having the same order as any other object.
 * NOTE(review): sorted collections such as {@code TreeSet} treat a result of 0 as "same element";
 * confirm that no caller relies on this method to keep distinct paragraphs apart or to sort them.
 *
 * @param o the object to compare with (currently unused)
 * @return always {@code 0}
 */
@Override
|
||||
public int compareTo(Object o) {
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
@ -67,8 +67,8 @@ public class RedactionController implements RedactionResource {
|
||||
|
||||
log.info("Document structure analysis successful, starting redaction analysis...");
|
||||
|
||||
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
|
||||
imageClassificationService.classifyImages(classifiedDoc);
|
||||
entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions());
|
||||
redactionLogCreatorService.createRedactionLog(classifiedDoc, pdDocument.getNumberOfPages(), analyzeRequest.getManualRedactions(), analyzeRequest
|
||||
.getRuleSetId());
|
||||
|
||||
|
||||
@ -223,7 +223,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
.getWidth(), (float) imageBounds.getHeight());
|
||||
|
||||
if (rect.getHeight() > 2 && rect.getWidth() > 2) {
|
||||
this.images.add(new PdfImage(pdfImage.getImage(), rect));
|
||||
this.images.add(new PdfImage(pdfImage.getImage(), rect, pageNumber));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
|
||||
@ -0,0 +1,26 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class Image {
|
||||
|
||||
private String type;
|
||||
private Rectangle2D position;
|
||||
private boolean redaction;
|
||||
private String redactionReason;
|
||||
private String legalBasis;
|
||||
private int matchedRule;
|
||||
private int sectionNumber;
|
||||
private String section;
|
||||
private int page;
|
||||
|
||||
}
|
||||
@ -20,5 +20,9 @@ public class PdfImage {
|
||||
@NonNull
|
||||
private Rectangle2D position;
|
||||
private ImageType imageType;
|
||||
private boolean isAppendedToParagraph;
|
||||
|
||||
@NonNull
|
||||
private int page;
|
||||
|
||||
}
|
||||
@ -1,8 +1,10 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.model;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
|
||||
@ -18,6 +20,7 @@ public class ReanalysisSection {
|
||||
private List<TextBlock> textBlocks;
|
||||
private Map<String, CellValue> tabularData = new HashMap<>();
|
||||
private List<Integer> cellStarts;
|
||||
private Set<Image> images = new HashSet<>();
|
||||
|
||||
|
||||
public SearchableText getSearchableText() {
|
||||
|
||||
@ -51,6 +51,9 @@ public class Section {
|
||||
|
||||
private SearchableText searchableText;
|
||||
|
||||
@Builder.Default
|
||||
private Set<Image> images = new HashSet<>();
|
||||
|
||||
|
||||
public boolean rowEquals(String headerName, String value) {
|
||||
|
||||
@ -75,6 +78,12 @@ public class Section {
|
||||
}
|
||||
|
||||
|
||||
public boolean matchesImageType(String type) {
|
||||
|
||||
return images.stream().anyMatch(image -> image.getType().equals(type));
|
||||
}
|
||||
|
||||
|
||||
public boolean headlineContainsWord(String word) {
|
||||
|
||||
return StringUtils.containsIgnoreCase(headline, word);
|
||||
@ -109,6 +118,19 @@ public class Section {
|
||||
}
|
||||
|
||||
|
||||
public void redactImage(String type, int ruleNumber, String reason, String legalBasis) {
|
||||
|
||||
images.forEach(image -> {
|
||||
if (image.getType().equals(type)) {
|
||||
image.setRedaction(true);
|
||||
image.setMatchedRule(ruleNumber);
|
||||
image.setRedactionReason(reason);
|
||||
image.setLegalBasis(legalBasis);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
public void redact(String type, int ruleNumber, String reason, String legalBasis) {
|
||||
|
||||
boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type);
|
||||
@ -125,6 +147,18 @@ public class Section {
|
||||
}
|
||||
|
||||
|
||||
public void redactNotImage(String type, int ruleNumber, String reason) {
|
||||
|
||||
images.forEach(image -> {
|
||||
if (image.getType().equals(type)) {
|
||||
image.setRedaction(false);
|
||||
image.setMatchedRule(ruleNumber);
|
||||
image.setRedactionReason(reason);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
public void redactNot(String type, int ruleNumber, String reason) {
|
||||
|
||||
boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type);
|
||||
@ -140,7 +174,8 @@ public class Section {
|
||||
}
|
||||
|
||||
|
||||
public void expandToHintAnnotationByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group, String asType) {
|
||||
public void expandToHintAnnotationByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group,
|
||||
String asType) {
|
||||
|
||||
Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive);
|
||||
|
||||
|
||||
@ -4,6 +4,7 @@ import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
@ -19,22 +20,25 @@ import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.ManualRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
@ -89,7 +93,8 @@ public class EntityRedactionService {
|
||||
.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>())
|
||||
.add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry
|
||||
.getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity
|
||||
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity.getStart(), entity.getEnd()));
|
||||
.getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity
|
||||
.getStart(), entity.getEnd()));
|
||||
}
|
||||
}
|
||||
|
||||
@ -120,22 +125,24 @@ public class EntityRedactionService {
|
||||
sectionNumber.incrementAndGet();
|
||||
}
|
||||
sectionSearchableTextPairs.add(processText(classifiedDoc, paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph
|
||||
.getHeadline(), manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
|
||||
.getHeadline(), manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, paragraph
|
||||
.getImages()));
|
||||
sectionNumber.incrementAndGet();
|
||||
}
|
||||
|
||||
for (Header header : classifiedDoc.getHeaders()) {
|
||||
sectionSearchableTextPairs.add(processText(classifiedDoc, header.getSearchableText(), header.getTextBlocks(), "Header", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
|
||||
sectionSearchableTextPairs.add(processText(classifiedDoc, header.getSearchableText(), header.getTextBlocks(), "Header", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>()));
|
||||
sectionNumber.incrementAndGet();
|
||||
}
|
||||
|
||||
for (Footer footer : classifiedDoc.getFooters()) {
|
||||
sectionSearchableTextPairs.add(processText(classifiedDoc, footer.getSearchableText(), footer.getTextBlocks(), "Footer", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
|
||||
sectionSearchableTextPairs.add(processText(classifiedDoc, footer.getSearchableText(), footer.getTextBlocks(), "Footer", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>()));
|
||||
sectionNumber.incrementAndGet();
|
||||
}
|
||||
|
||||
for (UnclassifiedText unclassifiedText : classifiedDoc.getUnclassifiedTexts()) {
|
||||
sectionSearchableTextPairs.add(processText(classifiedDoc, unclassifiedText.getSearchableText(), unclassifiedText.getTextBlocks(), "", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber));
|
||||
sectionSearchableTextPairs.add(processText(classifiedDoc, unclassifiedText.getSearchableText(), unclassifiedText
|
||||
.getTextBlocks(), "", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>()));
|
||||
sectionNumber.incrementAndGet();
|
||||
}
|
||||
|
||||
@ -143,6 +150,10 @@ public class EntityRedactionService {
|
||||
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection());
|
||||
documentEntities.addAll(analysedRowSection.getEntities());
|
||||
|
||||
for (Image image : analysedRowSection.getImages()) {
|
||||
classifiedDoc.getImages().computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
|
||||
}
|
||||
|
||||
analysedRowSection.getLocalDictionaryAdds().keySet().forEach(key -> {
|
||||
if (dictionary.isRecommendation(key)) {
|
||||
analysedRowSection.getLocalDictionaryAdds().get(key).forEach(value -> {
|
||||
@ -172,7 +183,8 @@ public class EntityRedactionService {
|
||||
}
|
||||
|
||||
|
||||
private List<SectionSearchableTextPair> processTablePerRow(Document classifiedDoc, Table table, ManualRedactions manualRedactions,
|
||||
private List<SectionSearchableTextPair> processTablePerRow(Document classifiedDoc, Table table,
|
||||
ManualRedactions manualRedactions,
|
||||
AtomicInteger sectionNumber, Dictionary dictionary,
|
||||
boolean local,
|
||||
Map<Integer, Set<Entity>> hintsPerSectionNumber) {
|
||||
@ -192,7 +204,11 @@ public class EntityRedactionService {
|
||||
}
|
||||
|
||||
SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell
|
||||
.getWidth(), (float) cell.getHeight(), cell.getTextBlocks().get(0).getSequences().get(0).getPage());
|
||||
.getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
|
||||
.get(0)
|
||||
.getSequences()
|
||||
.get(0)
|
||||
.getPage());
|
||||
sectionText.getSectionAreas().add(sectionArea);
|
||||
|
||||
addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue());
|
||||
@ -237,7 +253,7 @@ public class EntityRedactionService {
|
||||
.dictionary(dictionary)
|
||||
.build(), searchableRow));
|
||||
|
||||
if(!local) {
|
||||
if (!local) {
|
||||
sectionText.setText(searchableRow.toString());
|
||||
sectionText.setHeadline(table.getHeadline());
|
||||
sectionText.setSectionNumber(sectionNumber.intValue());
|
||||
@ -252,7 +268,8 @@ public class EntityRedactionService {
|
||||
}
|
||||
|
||||
|
||||
private List<SectionSearchableTextPair> processTableAsOneText(Document classifiedDoc, Table table, ManualRedactions manualRedactions,
|
||||
private List<SectionSearchableTextPair> processTableAsOneText(Document classifiedDoc, Table table,
|
||||
ManualRedactions manualRedactions,
|
||||
AtomicInteger sectionNumber, Dictionary dictionary,
|
||||
boolean local,
|
||||
Map<Integer, Set<Entity>> hintsPerSectionNumber) {
|
||||
@ -266,9 +283,13 @@ public class EntityRedactionService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(!local) {
|
||||
if (!local) {
|
||||
SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell
|
||||
.getWidth(), (float) cell.getHeight(), cell.getTextBlocks().get(0).getSequences().get(0).getPage());
|
||||
.getWidth(), (float) cell.getHeight(), cell.getTextBlocks()
|
||||
.get(0)
|
||||
.getSequences()
|
||||
.get(0)
|
||||
.getPage());
|
||||
sectionText.getSectionAreas().add(sectionArea);
|
||||
}
|
||||
|
||||
@ -279,7 +300,6 @@ public class EntityRedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Set<Entity> rowEntities = findEntities(entireTableText, table.getHeadline(), sectionNumber.intValue(), dictionary, local);
|
||||
surroundingWordsService.addSurroundingText(rowEntities, entireTableText, dictionary);
|
||||
|
||||
@ -297,7 +317,7 @@ public class EntityRedactionService {
|
||||
.dictionary(dictionary)
|
||||
.build(), entireTableText));
|
||||
|
||||
if(!local) {
|
||||
if (!local) {
|
||||
sectionText.setText(entireTableText.toString());
|
||||
sectionText.setHeadline(table.getHeadline());
|
||||
sectionText.setSectionNumber(sectionNumber.intValue());
|
||||
@ -309,12 +329,14 @@ public class EntityRedactionService {
|
||||
}
|
||||
|
||||
|
||||
private SectionSearchableTextPair processText(Document classifiedDoc, SearchableText searchableText, List<TextBlock> paragraphTextBlocks,
|
||||
String headline, ManualRedactions manualRedactions,
|
||||
AtomicInteger sectionNumber, Dictionary dictionary, boolean local,
|
||||
Map<Integer, Set<Entity>> hintsPerSectionNumber) {
|
||||
private SectionSearchableTextPair processText(Document classifiedDoc, SearchableText searchableText,
|
||||
List<TextBlock> paragraphTextBlocks, String headline,
|
||||
ManualRedactions manualRedactions, AtomicInteger sectionNumber,
|
||||
Dictionary dictionary, boolean local,
|
||||
Map<Integer, Set<Entity>> hintsPerSectionNumber,
|
||||
List<PdfImage> images) {
|
||||
|
||||
if(!local) {
|
||||
if (!local) {
|
||||
SectionText sectionText = new SectionText();
|
||||
for (TextBlock paragraphTextBlock : paragraphTextBlocks) {
|
||||
SectionArea sectionArea = new SectionArea(new Point(paragraphTextBlock.getMinX(), paragraphTextBlock.getMinY()), paragraphTextBlock
|
||||
@ -345,12 +367,15 @@ public class EntityRedactionService {
|
||||
.sectionNumber(sectionNumber.intValue())
|
||||
.searchableText(searchableText)
|
||||
.dictionary(dictionary)
|
||||
.images(images.stream()
|
||||
.map(image -> convert(image, sectionNumber.intValue(), headline))
|
||||
.collect(Collectors.toSet()))
|
||||
.build(), searchableText);
|
||||
}
|
||||
|
||||
|
||||
public Set<Entity> findEntities(SearchableText searchableText, String headline, int sectionNumber,
|
||||
Dictionary dictionary, boolean local) {
|
||||
Dictionary dictionary, boolean local) {
|
||||
|
||||
Set<Entity> found = new HashSet<>();
|
||||
String searchableString = searchableText.toString();
|
||||
@ -390,4 +415,18 @@ public class EntityRedactionService {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private Image convert(PdfImage pdfImage, int sectionNumber, String headline) {
|
||||
|
||||
return Image.builder()
|
||||
.type(pdfImage.getImageType().equals(ImageType.OTHER) ? "image" : pdfImage.getImageType()
|
||||
.name()
|
||||
.toLowerCase(Locale.ROOT))
|
||||
.position(pdfImage.getPosition())
|
||||
.sectionNumber(sectionNumber)
|
||||
.section(headline)
|
||||
.page(pdfImage.getPage())
|
||||
.build();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -32,7 +32,7 @@ public class ImageClassificationService {
|
||||
classifiedDoc.getPages().forEach(page -> {
|
||||
page.getImages().forEach(image -> {
|
||||
|
||||
if(settings.isEnableImageClassification()) {
|
||||
if (settings.isEnableImageClassification()) {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
ImageIO.write(image.getImage(), "png", baos);
|
||||
ImageClassificationResponse response = imageClassificationClient.classify(new MockMultipartFile("file", "Image.png", "image/png", baos
|
||||
|
||||
@ -39,6 +39,7 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ReanalysisSection;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair;
|
||||
@ -75,10 +76,14 @@ public class ReanalyzeService {
|
||||
}
|
||||
|
||||
Set<Integer> sectionsToReanaylse = new HashSet<>();
|
||||
Map<Integer, Set<Image>> imageEntries = new HashMap<>();
|
||||
for (RedactionLogEntry entry : renalyzeRequest.getRedactionLog().getRedactionLogEntry()) {
|
||||
if (entry.isManual() || manualForceAndRemoveIds.contains(entry.getId())) {
|
||||
sectionsToReanaylse.add(entry.getSectionNumber());
|
||||
}
|
||||
if (entry.isImage() || entry.getType().equals("image")) {
|
||||
imageEntries.computeIfAbsent(entry.getSectionNumber(), x -> new HashSet<>()).add(convert(entry));
|
||||
}
|
||||
}
|
||||
|
||||
for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) {
|
||||
@ -173,10 +178,15 @@ public class ReanalyzeService {
|
||||
}
|
||||
reanalysisSection.setTextBlocks(textBlocks);
|
||||
reanalysisSection.setTabularData(tabularData);
|
||||
reanalysisSections.add(reanalysisSection);
|
||||
|
||||
if (sectionText.isTable()) {
|
||||
reanalysisSection.setCellStarts(cellStarts);
|
||||
}
|
||||
if (imageEntries.containsKey(sectionText.getSectionNumber())) {
|
||||
reanalysisSection.getImages().addAll(imageEntries.get(sectionText.getSectionNumber()));
|
||||
}
|
||||
|
||||
reanalysisSections.add(reanalysisSection);
|
||||
}
|
||||
|
||||
//--
|
||||
@ -208,14 +218,22 @@ public class ReanalyzeService {
|
||||
.tabularData(reanalysisSection.getTabularData())
|
||||
.searchableText(reanalysisSection.getSearchableText())
|
||||
.dictionary(dictionary)
|
||||
.images(reanalysisSection.getImages())
|
||||
.build(), reanalysisSection.getSearchableText()));
|
||||
}
|
||||
|
||||
Set<Entity> entities = new HashSet<>();
|
||||
Map<Integer, Set<Image>> imagesPerPage = new HashMap<>();
|
||||
sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> {
|
||||
Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair
|
||||
.getSection());
|
||||
entities.addAll(analysedRowSection.getEntities());
|
||||
EntitySearchUtils.removeEntitiesContainedInLarger(entities);
|
||||
|
||||
for (Image image : analysedRowSection.getImages()) {
|
||||
imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image);
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
Map<Integer, List<Entity>> entitiesPerPage = new HashMap<>();
|
||||
@ -241,6 +259,12 @@ public class ReanalyzeService {
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest
|
||||
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
|
||||
}
|
||||
|
||||
if (imagesPerPage.get(page) != null) {
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, renalyzeRequest
|
||||
.getManualRedactions(), page, renalyzeRequest.getRuleSetId()));
|
||||
}
|
||||
|
||||
newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, renalyzeRequest
|
||||
.getRuleSetId()));
|
||||
}
|
||||
@ -248,12 +272,13 @@ public class ReanalyzeService {
|
||||
Iterator<RedactionLogEntry> itty = renalyzeRequest.getRedactionLog().getRedactionLogEntry().iterator();
|
||||
while (itty.hasNext()) {
|
||||
RedactionLogEntry entry = itty.next();
|
||||
if (sectionsToReanaylse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage()) {
|
||||
if (sectionsToReanaylse.contains(entry.getSectionNumber())) {
|
||||
itty.remove();
|
||||
}
|
||||
}
|
||||
|
||||
renalyzeRequest.getRedactionLog().getRedactionLogEntry().addAll(newRedactionLogEntries);
|
||||
|
||||
renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion());
|
||||
|
||||
return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build();
|
||||
@ -277,4 +302,19 @@ public class ReanalyzeService {
|
||||
.collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
|
||||
public Image convert(RedactionLogEntry entry) {
|
||||
|
||||
Rectangle position = entry.getPositions().get(0);
|
||||
|
||||
return Image.builder()
|
||||
.type(entry.getType())
|
||||
.position(new Rectangle2D.Float(position.getTopLeft().getX(), position.getTopLeft()
|
||||
.getY(), position.getWidth(), position.getHeight()))
|
||||
.sectionNumber(entry.getSectionNumber())
|
||||
.section(entry.getSection())
|
||||
.page(position.getPage())
|
||||
.build();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -3,7 +3,6 @@ package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
@ -30,8 +29,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Entity;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
@ -65,47 +63,101 @@ public class RedactionLogCreatorService {
|
||||
.addAll(addManualAddEntries(manualRedactions.getEntriesToAdd(), manualRedactions.getComments(), page, ruleSetId));
|
||||
}
|
||||
|
||||
if (!classifiedDoc.getPages().get(page - 1).getImages().isEmpty()) {
|
||||
addImageEntries(classifiedDoc, page, ruleSetId);
|
||||
if (classifiedDoc.getImages().get(page) != null && !classifiedDoc.getImages().get(page).isEmpty()) {
|
||||
classifiedDoc.getRedactionLogEntities()
|
||||
.addAll(addImageEntries(classifiedDoc.getImages(), manualRedactions, page, ruleSetId));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addImageEntries(Document classifiedDoc, int pageNumber, String ruleSetId) {
|
||||
public List<RedactionLogEntry> addImageEntries(Map<Integer, Set<Image>> images, ManualRedactions manualRedactions,
|
||||
int pageNumber, String ruleSetId) {
|
||||
|
||||
List<RedactionLogEntry> redactionLogEntities = new ArrayList<>();
|
||||
|
||||
for (Image image : images.get(pageNumber)) {
|
||||
|
||||
String id = IdBuilder.buildId(image.getPosition(), pageNumber);
|
||||
|
||||
for (PdfImage image : classifiedDoc.getPages().get(pageNumber - 1).getImages()) {
|
||||
RedactionLogEntry redactionLogEntry = RedactionLogEntry.builder()
|
||||
.id(IdBuilder.buildId(image.getPosition(), pageNumber))
|
||||
.color(getColor(image.getImageType().name().toLowerCase(Locale.ROOT), ruleSetId))
|
||||
.id(id)
|
||||
.color(getColorForImage(image, ruleSetId, false))
|
||||
.isImage(true)
|
||||
.type(image.getImageType().equals(ImageType.OTHER) ? "image" : image.getImageType().name().toLowerCase(Locale.ROOT))
|
||||
.redacted(isImageRedactionType(image.getImageType()))
|
||||
.isHint(!isImageRedactionType(image.getImageType()))
|
||||
.type(image.getType())
|
||||
.redacted(image.isRedaction())
|
||||
.reason(image.getRedactionReason())
|
||||
.legalBasis(image.getLegalBasis())
|
||||
.matchedRule(image.getMatchedRule())
|
||||
.isHint(dictionaryService.isHint(image.getType(), ruleSetId))
|
||||
.manual(false)
|
||||
.isDictionaryEntry(false)
|
||||
.isRecommendation(false)
|
||||
.positions(List.of(new Rectangle(new Point((float) image.getPosition()
|
||||
.getX(), (float) image.getPosition().getY()), (float) image.getPosition()
|
||||
.getWidth(), (float) image.getPosition().getHeight(), pageNumber)))
|
||||
.sectionNumber(image.getSectionNumber())
|
||||
.section(image.getSection())
|
||||
.build();
|
||||
classifiedDoc.getRedactionLogEntities().add(redactionLogEntry);
|
||||
}
|
||||
}
|
||||
|
||||
if (manualRedactions != null && !manualRedactions.getIdsToRemove().isEmpty()) {
|
||||
for (IdRemoval manualRemoval : manualRedactions.getIdsToRemove()) {
|
||||
if (manualRemoval.getId().equals(id)) {
|
||||
String manualOverrideReason = null;
|
||||
if (manualRemoval.getStatus().equals(Status.APPROVED)) {
|
||||
image.setRedaction(false);
|
||||
redactionLogEntry.setRedacted(false);
|
||||
redactionLogEntry.setStatus(Status.APPROVED);
|
||||
manualOverrideReason = image.getRedactionReason() + ", removed by manual override";
|
||||
redactionLogEntry.setColor(getColorForImage(image, ruleSetId, false));
|
||||
} else if (manualRemoval.getStatus().equals(Status.REQUESTED)) {
|
||||
manualOverrideReason = image.getRedactionReason() + ", requested to remove";
|
||||
redactionLogEntry.setStatus(Status.REQUESTED);
|
||||
redactionLogEntry.setColor(getColorForImage(image, ruleSetId, true));
|
||||
} else {
|
||||
redactionLogEntry.setStatus(Status.DECLINED);
|
||||
}
|
||||
|
||||
private boolean isImageRedactionType(ImageType imageType) {
|
||||
image.setRedactionReason(manualOverrideReason != null ? manualOverrideReason : image.getRedactionReason());
|
||||
redactionLogEntry.setReason(manualOverrideReason);
|
||||
redactionLogEntry.setManual(true);
|
||||
redactionLogEntry.setManualRedactionType(ManualRedactionType.REMOVE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (imageType.equals(ImageType.LOGO)) {
|
||||
return true;
|
||||
if (manualRedactions != null && !manualRedactions.getForceRedacts().isEmpty()) {
|
||||
for (ManualForceRedact manualForceRedact : manualRedactions.getForceRedacts()) {
|
||||
if (manualForceRedact.getId().equals(id)) {
|
||||
String manualOverrideReason = null;
|
||||
if (manualForceRedact.getStatus().equals(Status.APPROVED)) {
|
||||
image.setRedaction(true);
|
||||
redactionLogEntry.setRedacted(true);
|
||||
redactionLogEntry.setStatus(Status.APPROVED);
|
||||
redactionLogEntry.setColor(getColorForImage(image, ruleSetId, false));
|
||||
manualOverrideReason = image.getRedactionReason() + ", forced by manual override";
|
||||
redactionLogEntry.setLegalBasis(manualForceRedact.getLegalBasis());
|
||||
} else if (manualForceRedact.getStatus().equals(Status.REQUESTED)) {
|
||||
manualOverrideReason = image.getRedactionReason() + ", requested to force redact";
|
||||
redactionLogEntry.setStatus(Status.REQUESTED);
|
||||
redactionLogEntry.setColor(getColorForImage(image, ruleSetId, true));
|
||||
redactionLogEntry.setLegalBasis(manualForceRedact.getLegalBasis());
|
||||
} else {
|
||||
redactionLogEntry.setStatus(Status.DECLINED);
|
||||
}
|
||||
|
||||
image.setRedactionReason(manualOverrideReason != null ? manualOverrideReason : image.getRedactionReason());
|
||||
redactionLogEntry.setReason(manualOverrideReason);
|
||||
redactionLogEntry.setManual(true);
|
||||
redactionLogEntry.setManualRedactionType(ManualRedactionType.FORCE_REDACT);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
redactionLogEntities.add(redactionLogEntry);
|
||||
}
|
||||
if (imageType.equals(ImageType.FORMULA)) {
|
||||
return true;
|
||||
}
|
||||
if (imageType.equals(ImageType.SIGNATURE)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
return redactionLogEntities;
|
||||
}
|
||||
|
||||
|
||||
@ -372,6 +424,18 @@ public class RedactionLogCreatorService {
|
||||
}
|
||||
|
||||
|
||||
private float[] getColorForImage(Image image, String ruleSetId, boolean requestedToRemove) {
|
||||
|
||||
if (requestedToRemove) {
|
||||
return dictionaryService.getRequestRemoveColor(ruleSetId);
|
||||
}
|
||||
if (!image.isRedaction() && !dictionaryService.isHint(image.getType(), ruleSetId)) {
|
||||
return dictionaryService.getNotRedactedColor(ruleSetId);
|
||||
}
|
||||
return dictionaryService.getColor(image.getType(), ruleSetId);
|
||||
}
|
||||
|
||||
|
||||
private boolean isHint(Entity entity, String ruleSetId) {
|
||||
|
||||
return dictionaryService.isHint(entity.getType(), ruleSetId);
|
||||
|
||||
@ -71,6 +71,7 @@ public class PdfSegmentationService {
|
||||
|
||||
page.setPageNumber(pageNumber);
|
||||
increaseDocumentStatistics(page, document);
|
||||
|
||||
page.setImages(stripper.getImages());
|
||||
pages.add(page);
|
||||
}
|
||||
@ -78,8 +79,8 @@ public class PdfSegmentationService {
|
||||
document.setPages(pages);
|
||||
|
||||
classificationService.classifyDocument(document);
|
||||
|
||||
sectionsBuilderService.buildSections(document);
|
||||
sectionsBuilderService.addImagesToSections(document);
|
||||
|
||||
return document;
|
||||
}
|
||||
|
||||
@ -2,8 +2,12 @@ package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
@ -16,6 +20,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
@ -85,13 +90,13 @@ public class SectionsBuilderService {
|
||||
prev = current;
|
||||
}
|
||||
|
||||
if(!header.isEmpty()) {
|
||||
if (!header.isEmpty()) {
|
||||
headers.add(new Header(header));
|
||||
}
|
||||
if(!footer.isEmpty()) {
|
||||
if (!footer.isEmpty()) {
|
||||
footers.add(new Footer(footer));
|
||||
}
|
||||
if(!unclassifiedText.isEmpty()) {
|
||||
if (!unclassifiedText.isEmpty()) {
|
||||
unclassifiedTexts.add(new UnclassifiedText(unclassifiedText));
|
||||
}
|
||||
}
|
||||
@ -107,6 +112,53 @@ public class SectionsBuilderService {
|
||||
}
|
||||
|
||||
|
||||
public void addImagesToSections(Document document) {
|
||||
|
||||
Map<Integer, SortedSet<Paragraph>> paragraphMap = new HashMap<>();
|
||||
for (Paragraph paragraph : document.getParagraphs()) {
|
||||
for (AbstractTextContainer container : paragraph.getPageBlocks()) {
|
||||
paragraphMap.computeIfAbsent(container.getPage(), x -> new TreeSet<>()).add(paragraph);
|
||||
}
|
||||
}
|
||||
|
||||
for (Page page : document.getPages()) {
|
||||
for (PdfImage image : page.getImages()) {
|
||||
SortedSet<Paragraph> paragraphsOnPage = paragraphMap.get(page.getPageNumber());
|
||||
if (paragraphsOnPage == null) {
|
||||
int i = page.getPageNumber();
|
||||
while (paragraphsOnPage == null) {
|
||||
paragraphsOnPage = paragraphMap.get(i);
|
||||
i--;
|
||||
}
|
||||
}
|
||||
|
||||
Float perviousEnd = 0f;
|
||||
for (Paragraph paragraph : paragraphsOnPage) {
|
||||
Float currentEnd = 0f;
|
||||
for (AbstractTextContainer abs : paragraph.getPageBlocks()) {
|
||||
if (abs.getPage() != page.getPageNumber()) {
|
||||
continue;
|
||||
}
|
||||
if (abs.getMaxY() > currentEnd) {
|
||||
currentEnd = abs.getMaxY();
|
||||
}
|
||||
}
|
||||
|
||||
if (image.getPosition().getY() >= perviousEnd && image.getPosition().getY() <= currentEnd) {
|
||||
paragraph.getImages().add(image);
|
||||
image.setAppendedToParagraph(true);
|
||||
}
|
||||
perviousEnd = currentEnd;
|
||||
}
|
||||
if (!image.isAppendedToParagraph()) {
|
||||
paragraphsOnPage.first().getImages().add(image);
|
||||
image.setAppendedToParagraph(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void mergeTableMetadata(Table currentTable, Table previousTable) {
|
||||
|
||||
// Distribute header information for subsequent tables
|
||||
|
||||
@ -4,12 +4,8 @@ import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.mockito.Mockito.when;
|
||||
import static org.springframework.boot.test.context.SpringBootTest.WebEnvironment.RANDOM_PORT;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
@ -21,7 +17,6 @@ import java.nio.charset.StandardCharsets;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
@ -29,15 +24,7 @@ import java.util.UUID;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.apache.pdfbox.text.PDFTextStripperByArea;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.junit.Before;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.kie.api.KieServices;
|
||||
@ -45,7 +32,6 @@ import org.kie.api.builder.KieBuilder;
|
||||
import org.kie.api.builder.KieFileSystem;
|
||||
import org.kie.api.builder.KieModule;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.mockito.MockitoAnnotations;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.context.TestConfiguration;
|
||||
@ -77,23 +63,14 @@ import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionResult;
|
||||
import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionArea;
|
||||
import com.iqser.red.service.redaction.v1.model.SectionText;
|
||||
import com.iqser.red.service.redaction.v1.model.Status;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
|
||||
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
|
||||
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
|
||||
import com.iqser.red.service.redaction.v1.server.exception.RedactionException;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.ReanalysisSection;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
|
||||
@RunWith(SpringRunner.class)
|
||||
@SpringBootTest(webEnvironment = RANDOM_PORT)
|
||||
@ -112,6 +89,10 @@ public class RedactionIntegrationTest {
|
||||
private static final String TEST_METHOD = "test_method";
|
||||
private static final String PURITY = "purity";
|
||||
private static final String IMAGE = "image";
|
||||
private static final String LOGO = "logo";
|
||||
private static final String SIGNATURE = "signature";
|
||||
private static final String FORMULA = "formula";
|
||||
private static final String OCR = "ocr";
|
||||
|
||||
private static final String RECOMMENDATION_AUTHOR = "recommendation_CBI_author";
|
||||
private static final String RECOMMENDATION_ADDRESS = "recommendation_CBI_address";
|
||||
@ -196,6 +177,10 @@ public class RedactionIntegrationTest {
|
||||
when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(FALSE_POSITIVE));
|
||||
when(dictionaryClient.getDictionaryForType(PURITY, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PURITY));
|
||||
when(dictionaryClient.getDictionaryForType(IMAGE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(IMAGE));
|
||||
when(dictionaryClient.getDictionaryForType(OCR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(OCR));
|
||||
when(dictionaryClient.getDictionaryForType(LOGO, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(LOGO));
|
||||
when(dictionaryClient.getDictionaryForType(SIGNATURE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(SIGNATURE));
|
||||
when(dictionaryClient.getDictionaryForType(FORMULA, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(FORMULA));
|
||||
when(dictionaryClient.getColors(TEST_RULESET_ID)).thenReturn(colors);
|
||||
}
|
||||
|
||||
@ -278,7 +263,27 @@ public class RedactionIntegrationTest {
|
||||
.map(this::cleanDictionaryEntry)
|
||||
.collect(Collectors.toSet()));
|
||||
dictionary.computeIfAbsent(IMAGE, v -> new ArrayList<>())
|
||||
.addAll(ResourceLoader.load("dictionaries/image.txt")
|
||||
.addAll(ResourceLoader.load("dictionaries/empty.txt")
|
||||
.stream()
|
||||
.map(this::cleanDictionaryEntry)
|
||||
.collect(Collectors.toSet()));
|
||||
dictionary.computeIfAbsent(OCR, v -> new ArrayList<>())
|
||||
.addAll(ResourceLoader.load("dictionaries/empty.txt")
|
||||
.stream()
|
||||
.map(this::cleanDictionaryEntry)
|
||||
.collect(Collectors.toSet()));
|
||||
dictionary.computeIfAbsent(LOGO, v -> new ArrayList<>())
|
||||
.addAll(ResourceLoader.load("dictionaries/empty.txt")
|
||||
.stream()
|
||||
.map(this::cleanDictionaryEntry)
|
||||
.collect(Collectors.toSet()));
|
||||
dictionary.computeIfAbsent(SIGNATURE, v -> new ArrayList<>())
|
||||
.addAll(ResourceLoader.load("dictionaries/empty.txt")
|
||||
.stream()
|
||||
.map(this::cleanDictionaryEntry)
|
||||
.collect(Collectors.toSet()));
|
||||
dictionary.computeIfAbsent(FORMULA, v -> new ArrayList<>())
|
||||
.addAll(ResourceLoader.load("dictionaries/empty.txt")
|
||||
.stream()
|
||||
.map(this::cleanDictionaryEntry)
|
||||
.collect(Collectors.toSet()));
|
||||
@ -309,6 +314,10 @@ public class RedactionIntegrationTest {
|
||||
typeColorMap.put(FALSE_POSITIVE, "#ffffff");
|
||||
typeColorMap.put(PURITY, "#ffe187");
|
||||
typeColorMap.put(IMAGE, "#fcc5fb");
|
||||
typeColorMap.put(OCR, "#fcc5fb");
|
||||
typeColorMap.put(LOGO, "#ffe187");
|
||||
typeColorMap.put(FORMULA, "#ffe187");
|
||||
typeColorMap.put(SIGNATURE, "#ffe187");
|
||||
|
||||
hintTypeMap.put(VERTEBRATE, true);
|
||||
hintTypeMap.put(ADDRESS, false);
|
||||
@ -326,6 +335,10 @@ public class RedactionIntegrationTest {
|
||||
hintTypeMap.put(FALSE_POSITIVE, true);
|
||||
hintTypeMap.put(PURITY, false);
|
||||
hintTypeMap.put(IMAGE, true);
|
||||
hintTypeMap.put(OCR, true);
|
||||
hintTypeMap.put(FORMULA, false);
|
||||
hintTypeMap.put(LOGO, false);
|
||||
hintTypeMap.put(SIGNATURE, false);
|
||||
|
||||
caseInSensitiveMap.put(VERTEBRATE, true);
|
||||
caseInSensitiveMap.put(ADDRESS, false);
|
||||
@ -343,6 +356,10 @@ public class RedactionIntegrationTest {
|
||||
caseInSensitiveMap.put(FALSE_POSITIVE, false);
|
||||
caseInSensitiveMap.put(PURITY, false);
|
||||
caseInSensitiveMap.put(IMAGE, true);
|
||||
caseInSensitiveMap.put(OCR, true);
|
||||
caseInSensitiveMap.put(SIGNATURE, true);
|
||||
caseInSensitiveMap.put(LOGO, true);
|
||||
caseInSensitiveMap.put(FORMULA, true);
|
||||
|
||||
recommendationTypeMap.put(VERTEBRATE, false);
|
||||
recommendationTypeMap.put(ADDRESS, false);
|
||||
@ -360,6 +377,10 @@ public class RedactionIntegrationTest {
|
||||
recommendationTypeMap.put(FALSE_POSITIVE, false);
|
||||
recommendationTypeMap.put(PURITY, false);
|
||||
recommendationTypeMap.put(IMAGE, false);
|
||||
recommendationTypeMap.put(OCR, false);
|
||||
recommendationTypeMap.put(FORMULA, false);
|
||||
recommendationTypeMap.put(SIGNATURE, false);
|
||||
recommendationTypeMap.put(LOGO, false);
|
||||
|
||||
rankTypeMap.put(FALSE_POSITIVE, 160);
|
||||
rankTypeMap.put(PURITY, 155);
|
||||
@ -377,6 +398,10 @@ public class RedactionIntegrationTest {
|
||||
rankTypeMap.put(RECOMMENDATION_AUTHOR, 40);
|
||||
rankTypeMap.put(RECOMMENDATION_ADDRESS, 30);
|
||||
rankTypeMap.put(IMAGE, 30);
|
||||
rankTypeMap.put(OCR, 29);
|
||||
rankTypeMap.put(LOGO, 28);
|
||||
rankTypeMap.put(SIGNATURE, 27);
|
||||
rankTypeMap.put(FORMULA, 26);
|
||||
|
||||
colors.setDefaultColor("#acfc00");
|
||||
colors.setNotRedacted("#cccccc");
|
||||
@ -563,7 +588,6 @@ public class RedactionIntegrationTest {
|
||||
ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder()
|
||||
.redactionLog(result.getRedactionLog())
|
||||
.document(IOUtils.toByteArray(pdfFileResource.getInputStream()))
|
||||
.manualRedactions(null)
|
||||
.text(result.getText())
|
||||
.ruleSetId(TEST_RULESET_ID)
|
||||
.build());
|
||||
|
||||
@ -2,12 +2,17 @@ package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
@ -18,7 +23,10 @@ import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit4.SpringRunner;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
|
||||
@ -45,24 +53,25 @@ public class PdfSegmentationServiceTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void testPDFSegmentationWithComplexTable() throws IOException {
|
||||
@Ignore
|
||||
public void testExtractImages() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
|
||||
|
||||
try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) {
|
||||
Document document = pdfSegmentationService.parseDocument(pdDocument);
|
||||
assertThat(document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
Table table = document.getParagraphs()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables().stream())
|
||||
.collect(Collectors.toList())
|
||||
.get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
assertThat(table.getRowCount()).isEqualTo(13);
|
||||
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||
int i = 0;
|
||||
for (Page page : document.getPages()) {
|
||||
for (PdfImage image : page.getImages()) {
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
|
||||
ImageIO.write(image.getImage(), "png", baos);
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) {
|
||||
fileOutputStream.write(baos.toByteArray());
|
||||
}
|
||||
}
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -97,12 +106,12 @@ public class PdfSegmentationServiceTest {
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream()
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells)))
|
||||
.isTrue();
|
||||
.equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
}
|
||||
|
||||
@ -137,12 +146,12 @@ public class PdfSegmentationServiceTest {
|
||||
.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream()
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells)))
|
||||
.isTrue();
|
||||
.equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
}
|
||||
|
||||
@ -177,12 +186,12 @@ public class PdfSegmentationServiceTest {
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream()
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList())
|
||||
.equals(firstTableHeaderCells)))
|
||||
.isTrue();
|
||||
.equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -258,4 +258,28 @@ rule "22: Redact Must Redact"
|
||||
Section(matchesType("must_redact"))
|
||||
then
|
||||
section.redact("must_redact", 22, "Must Redact found", "Article 39(1)(2) of Regulation (EC) No 178/2002");
|
||||
end
|
||||
|
||||
|
||||
// Rule 23: fires for a Section fact for which matchesImageType("signature") holds and then
// calls section.redactImage for the "signature" type.
// NOTE(review): the redactImage arguments presumably are type, rule number, reason and legal
// basis (same shape as the section.redact call of rule 22) - confirm against Section.
rule "23: Redact signatures"
|
||||
when
|
||||
Section(matchesImageType("signature"))
|
||||
then
|
||||
section.redactImage("signature", 23, "Signature found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
|
||||
end
|
||||
|
||||
|
||||
// Rule 24: fires for a Section fact for which matchesImageType("formula") holds and then
// calls section.redactImage for the "formula" type.
// NOTE(review): the redactImage arguments presumably are type, rule number, reason and legal
// basis - confirm against Section.
rule "24: Redact formula"
|
||||
when
|
||||
Section(matchesImageType("formula"))
|
||||
then
|
||||
section.redactImage("formula", 24, "Formula found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
|
||||
end
|
||||
|
||||
|
||||
// Rule 25: fires for a Section fact for which matchesImageType("logo") holds and then
// calls section.redactImage for the "logo" type.
// NOTE(review): the redactImage arguments presumably are type, rule number, reason and legal
// basis - confirm against Section.
rule "25: Redact Logos"
|
||||
when
|
||||
Section(matchesImageType("logo"))
|
||||
then
|
||||
section.redactImage("logo", 25, "Logo found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
|
||||
end
|
||||
@ -304,4 +304,28 @@ rule "25: Redact Purity"
|
||||
Section(searchText.contains("purity"))
|
||||
then
|
||||
section.redactByRegEx("purity ?:? (([\\d\\.]+)( .{0,4}\\.)? ?%)", true, 1, "purity", 17, "Purity found", "Reg (EC) No 1107/2009 Art. 63 (2a)");
|
||||
end
|
||||
|
||||
|
||||
// Rule 26: fires for a Section fact for which matchesImageType("signature") holds and then
// calls section.redactImage for the "signature" type.
// NOTE(review): the redactImage arguments presumably are type, rule number, reason and legal
// basis - confirm against Section.
rule "26: Redact signatures"
|
||||
when
|
||||
Section(matchesImageType("signature"))
|
||||
then
|
||||
section.redactImage("signature", 26, "Signature found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
|
||||
end
|
||||
|
||||
|
||||
// Rule 27: fires for a Section fact for which matchesImageType("formula") holds and then
// calls section.redactImage for the "formula" type.
// NOTE(review): the redactImage arguments presumably are type, rule number, reason and legal
// basis - confirm against Section.
rule "27: Redact formula"
|
||||
when
|
||||
Section(matchesImageType("formula"))
|
||||
then
|
||||
section.redactImage("formula", 27, "Formula found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
|
||||
end
|
||||
|
||||
|
||||
// Rule 28: fires for a Section fact for which matchesImageType("logo") holds and then
// calls section.redactImage for the "logo" type.
// NOTE(review): the redactImage arguments presumably are type, rule number, reason and legal
// basis - confirm against Section.
rule "28: Redact Logos"
|
||||
when
|
||||
Section(matchesImageType("logo"))
|
||||
then
|
||||
section.redactImage("logo", 28, "Logo found", "Reg (EC) No 1107/2009 Art. 63 (2g)");
|
||||
end
|
||||
Loading…
x
Reference in New Issue
Block a user