diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Document.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Document.java index 71278d58..0fb41529 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Document.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Document.java @@ -4,11 +4,13 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Set; import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; import com.iqser.red.service.redaction.v1.model.SectionGrid; -import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.model.SectionText; +import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; +import com.iqser.red.service.redaction.v1.server.redaction.model.Image; import lombok.Data; import lombok.NoArgsConstructor; @@ -24,8 +26,8 @@ public class Document { private List unclassifiedTexts = new ArrayList<>(); private Map> entities = new HashMap<>(); private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter(); - private FloatFrequencyCounter fontSizeCounter= new FloatFrequencyCounter(); - private StringFrequencyCounter fontCounter= new StringFrequencyCounter(); + private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter(); + private StringFrequencyCounter fontCounter = new StringFrequencyCounter(); private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter(); private boolean headlines; @@ -35,4 +37,7 @@ public class Document { private long rulesVersion; private List sectionText = new ArrayList<>(); + + private Map> images = new HashMap<>(); + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java index 081bc187..07e6b6fa 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java @@ -3,6 +3,7 @@ package com.iqser.red.service.redaction.v1.server.classification.model; import java.util.ArrayList; import java.util.List; +import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; @@ -12,9 +13,10 @@ import lombok.NoArgsConstructor; @Data @NoArgsConstructor -public class Paragraph { +public class Paragraph implements Comparable{ private List pageBlocks = new ArrayList<>(); + private List images = new ArrayList<>(); private String headline; @@ -53,4 +55,11 @@ public class Paragraph { return textBlocks; } + + @Override + public int compareTo(Object o) { + + return 0; + } + } \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java index 9563dadf..97a152f4 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java @@ -67,8 +67,8 @@ public class RedactionController implements RedactionResource { log.info("Document structure analysis successful, starting redaction analysis..."); - entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions()); imageClassificationService.classifyImages(classifiedDoc); + entityRedactionService.processDocument(classifiedDoc, analyzeRequest.getRuleSetId(), analyzeRequest.getManualRedactions()); redactionLogCreatorService.createRedactionLog(classifiedDoc, pdDocument.getNumberOfPages(), analyzeRequest.getManualRedactions(), analyzeRequest .getRuleSetId()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java index 9e2c3be0..4b680c32 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java @@ -223,7 +223,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { .getWidth(), (float) imageBounds.getHeight()); if (rect.getHeight() > 2 && rect.getWidth() > 2) { - this.images.add(new PdfImage(pdfImage.getImage(), rect)); + this.images.add(new PdfImage(pdfImage.getImage(), rect, pageNumber)); } } } catch (Exception e) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java new file mode 100644 index 00000000..377fd55b --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java @@ -0,0 +1,26 @@ +package com.iqser.red.service.redaction.v1.server.redaction.model; + +import java.awt.geom.Rectangle2D; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class Image { + + private String type; + private Rectangle2D position; + private boolean redaction; + private String redactionReason; + private String legalBasis; + private int matchedRule; + private int sectionNumber; + private String section; + private int page; + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java index 856c54f0..86cabcfa 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java @@ -20,5 +20,9 @@ public class PdfImage { @NonNull private Rectangle2D position; private ImageType imageType; + private boolean isAppendedToParagraph; + + @NonNull + private int page; } \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/ReanalysisSection.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/ReanalysisSection.java index df867485..be141819 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/ReanalysisSection.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/ReanalysisSection.java @@ -1,8 +1,10 @@ package com.iqser.red.service.redaction.v1.server.redaction.model; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; @@ -18,6 +20,7 @@ public class ReanalysisSection { private List textBlocks; private Map tabularData = new HashMap<>(); private List cellStarts; + private Set images = new HashSet<>(); public SearchableText getSearchableText() { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index 4adceba4..8dc46fee 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -51,6 +51,9 @@ public class Section { private SearchableText searchableText; + @Builder.Default + private Set images = new HashSet<>(); + public boolean rowEquals(String headerName, String value) { @@ -75,6 +78,12 @@ public class Section { } + public boolean matchesImageType(String type) { + + return images.stream().anyMatch(image -> image.getType().equals(type)); + } + + public boolean headlineContainsWord(String word) { return StringUtils.containsIgnoreCase(headline, word); @@ -109,6 +118,19 @@ public class Section { } + public void redactImage(String type, int ruleNumber, String reason, String legalBasis) { + + images.forEach(image -> { + if (image.getType().equals(type)) { + image.setRedaction(true); + image.setMatchedRule(ruleNumber); + image.setRedactionReason(reason); + image.setLegalBasis(legalBasis); + } + }); + } + + public void redact(String type, int ruleNumber, String reason, String legalBasis) { boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type); @@ -125,6 +147,18 @@ public class Section { } + public void redactNotImage(String type, int ruleNumber, String reason) { + + images.forEach(image -> { + if (image.getType().equals(type)) { + image.setRedaction(false); + image.setMatchedRule(ruleNumber); + image.setRedactionReason(reason); + } + }); + } + + public void redactNot(String type, int ruleNumber, String reason) { boolean hasRecommendationDictionary = dictionaryTypes.contains(RECOMMENDATION_PREFIX + type); @@ -140,7 +174,8 @@ public class Section { } - public void expandToHintAnnotationByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group, String asType) { + public void expandToHintAnnotationByRegEx(String type, String pattern, boolean patternCaseInsensitive, int group, + String asType) { Pattern compiledPattern = Patterns.getCompiledPattern(pattern, patternCaseInsensitive); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index 4b67065f..b26d2111 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -4,6 +4,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; @@ -19,22 +20,25 @@ import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry; import com.iqser.red.service.redaction.v1.model.ManualRedactions; import com.iqser.red.service.redaction.v1.model.Point; import com.iqser.red.service.redaction.v1.model.Rectangle; +import com.iqser.red.service.redaction.v1.model.SectionArea; +import com.iqser.red.service.redaction.v1.model.SectionText; import com.iqser.red.service.redaction.v1.server.classification.model.Document; import com.iqser.red.service.redaction.v1.server.classification.model.Footer; import com.iqser.red.service.redaction.v1.server.classification.model.Header; import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText; -import com.iqser.red.service.redaction.v1.model.SectionArea; import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.model.Image; +import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType; +import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; import com.iqser.red.service.redaction.v1.server.redaction.model.Section; import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair; -import com.iqser.red.service.redaction.v1.model.SectionText; import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; @@ -89,7 +93,8 @@ public class EntityRedactionService { .computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity - .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity.getStart(), entity.getEnd())); + .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity + .getStart(), entity.getEnd())); } } @@ -120,22 +125,24 @@ public class EntityRedactionService { sectionNumber.incrementAndGet(); } sectionSearchableTextPairs.add(processText(classifiedDoc, paragraph.getSearchableText(), paragraph.getTextBlocks(), paragraph - .getHeadline(), manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber)); + .getHeadline(), manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, paragraph + .getImages())); sectionNumber.incrementAndGet(); } for (Header header : classifiedDoc.getHeaders()) { - sectionSearchableTextPairs.add(processText(classifiedDoc, header.getSearchableText(), header.getTextBlocks(), "Header", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber)); + sectionSearchableTextPairs.add(processText(classifiedDoc, header.getSearchableText(), header.getTextBlocks(), "Header", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>())); sectionNumber.incrementAndGet(); } for (Footer footer : classifiedDoc.getFooters()) { - sectionSearchableTextPairs.add(processText(classifiedDoc, footer.getSearchableText(), footer.getTextBlocks(), "Footer", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber)); + sectionSearchableTextPairs.add(processText(classifiedDoc, footer.getSearchableText(), footer.getTextBlocks(), "Footer", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>())); sectionNumber.incrementAndGet(); } for (UnclassifiedText unclassifiedText : classifiedDoc.getUnclassifiedTexts()) { - sectionSearchableTextPairs.add(processText(classifiedDoc, unclassifiedText.getSearchableText(), unclassifiedText.getTextBlocks(), "", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber)); + sectionSearchableTextPairs.add(processText(classifiedDoc, unclassifiedText.getSearchableText(), unclassifiedText + .getTextBlocks(), "", manualRedactions, sectionNumber, dictionary, local, hintsPerSectionNumber, new ArrayList<>())); sectionNumber.incrementAndGet(); } @@ -143,6 +150,10 @@ public class EntityRedactionService { Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair.getSection()); documentEntities.addAll(analysedRowSection.getEntities()); + for (Image image : analysedRowSection.getImages()) { + classifiedDoc.getImages().computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image); + } + analysedRowSection.getLocalDictionaryAdds().keySet().forEach(key -> { if (dictionary.isRecommendation(key)) { analysedRowSection.getLocalDictionaryAdds().get(key).forEach(value -> { @@ -172,7 +183,8 @@ public class EntityRedactionService { } - private List processTablePerRow(Document classifiedDoc, Table table, ManualRedactions manualRedactions, + private List processTablePerRow(Document classifiedDoc, Table table, + ManualRedactions manualRedactions, AtomicInteger sectionNumber, Dictionary dictionary, boolean local, Map> hintsPerSectionNumber) { @@ -192,7 +204,11 @@ public class EntityRedactionService { } SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell - .getWidth(), (float) cell.getHeight(), cell.getTextBlocks().get(0).getSequences().get(0).getPage()); + .getWidth(), (float) cell.getHeight(), cell.getTextBlocks() + .get(0) + .getSequences() + .get(0) + .getPage()); sectionText.getSectionAreas().add(sectionArea); addSectionToManualRedactions(cell.getTextBlocks(), manualRedactions, table.getHeadline(), sectionNumber.intValue()); @@ -237,7 +253,7 @@ public class EntityRedactionService { .dictionary(dictionary) .build(), searchableRow)); - if(!local) { + if (!local) { sectionText.setText(searchableRow.toString()); sectionText.setHeadline(table.getHeadline()); sectionText.setSectionNumber(sectionNumber.intValue()); @@ -252,7 +268,8 @@ public class EntityRedactionService { } - private List processTableAsOneText(Document classifiedDoc, Table table, ManualRedactions manualRedactions, + private List processTableAsOneText(Document classifiedDoc, Table table, + ManualRedactions manualRedactions, AtomicInteger sectionNumber, Dictionary dictionary, boolean local, Map> hintsPerSectionNumber) { @@ -266,9 +283,13 @@ public class EntityRedactionService { continue; } - if(!local) { + if (!local) { SectionArea sectionArea = new SectionArea(new Point((float) cell.getX(), (float) cell.getY()), (float) cell - .getWidth(), (float) cell.getHeight(), cell.getTextBlocks().get(0).getSequences().get(0).getPage()); + .getWidth(), (float) cell.getHeight(), cell.getTextBlocks() + .get(0) + .getSequences() + .get(0) + .getPage()); sectionText.getSectionAreas().add(sectionArea); } @@ -279,7 +300,6 @@ public class EntityRedactionService { } } - Set rowEntities = findEntities(entireTableText, table.getHeadline(), sectionNumber.intValue(), dictionary, local); surroundingWordsService.addSurroundingText(rowEntities, entireTableText, dictionary); @@ -297,7 +317,7 @@ public class EntityRedactionService { .dictionary(dictionary) .build(), entireTableText)); - if(!local) { + if (!local) { sectionText.setText(entireTableText.toString()); sectionText.setHeadline(table.getHeadline()); sectionText.setSectionNumber(sectionNumber.intValue()); @@ -309,12 +329,14 @@ public class EntityRedactionService { } - private SectionSearchableTextPair processText(Document classifiedDoc, SearchableText searchableText, List paragraphTextBlocks, - String headline, ManualRedactions manualRedactions, - AtomicInteger sectionNumber, Dictionary dictionary, boolean local, - Map> hintsPerSectionNumber) { + private SectionSearchableTextPair processText(Document classifiedDoc, SearchableText searchableText, + List paragraphTextBlocks, String headline, + ManualRedactions manualRedactions, AtomicInteger sectionNumber, + Dictionary dictionary, boolean local, + Map> hintsPerSectionNumber, + List images) { - if(!local) { + if (!local) { SectionText sectionText = new SectionText(); for (TextBlock paragraphTextBlock : paragraphTextBlocks) { SectionArea sectionArea = new SectionArea(new Point(paragraphTextBlock.getMinX(), paragraphTextBlock.getMinY()), paragraphTextBlock @@ -345,12 +367,15 @@ public class EntityRedactionService { .sectionNumber(sectionNumber.intValue()) .searchableText(searchableText) .dictionary(dictionary) + .images(images.stream() + .map(image -> convert(image, sectionNumber.intValue(), headline)) + .collect(Collectors.toSet())) .build(), searchableText); } public Set findEntities(SearchableText searchableText, String headline, int sectionNumber, - Dictionary dictionary, boolean local) { + Dictionary dictionary, boolean local) { Set found = new HashSet<>(); String searchableString = searchableText.toString(); @@ -390,4 +415,18 @@ public class EntityRedactionService { } } + + private Image convert(PdfImage pdfImage, int sectionNumber, String headline) { + + return Image.builder() + .type(pdfImage.getImageType().equals(ImageType.OTHER) ? "image" : pdfImage.getImageType() + .name() + .toLowerCase(Locale.ROOT)) + .position(pdfImage.getPosition()) + .sectionNumber(sectionNumber) + .section(headline) + .page(pdfImage.getPage()) + .build(); + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java index 6d28bfb2..94dc3a94 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ImageClassificationService.java @@ -32,7 +32,7 @@ public class ImageClassificationService { classifiedDoc.getPages().forEach(page -> { page.getImages().forEach(image -> { - if(settings.isEnableImageClassification()) { + if (settings.isEnableImageClassification()) { try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { ImageIO.write(image.getImage(), "png", baos); ImageClassificationResponse response = imageClassificationClient.classify(new MockMultipartFile("file", "Image.png", "image/png", baos diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java index 8bd55e1c..5ea56c79 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java @@ -39,6 +39,7 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.model.Image; import com.iqser.red.service.redaction.v1.server.redaction.model.ReanalysisSection; import com.iqser.red.service.redaction.v1.server.redaction.model.Section; import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair; @@ -75,10 +76,14 @@ public class ReanalyzeService { } Set sectionsToReanaylse = new HashSet<>(); + Map> imageEntries = new HashMap<>(); for (RedactionLogEntry entry : renalyzeRequest.getRedactionLog().getRedactionLogEntry()) { if (entry.isManual() || manualForceAndRemoveIds.contains(entry.getId())) { sectionsToReanaylse.add(entry.getSectionNumber()); } + if (entry.isImage() || entry.getType().equals("image")) { + imageEntries.computeIfAbsent(entry.getSectionNumber(), x -> new HashSet<>()).add(convert(entry)); + } } for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) { @@ -173,10 +178,15 @@ public class ReanalyzeService { } reanalysisSection.setTextBlocks(textBlocks); reanalysisSection.setTabularData(tabularData); - reanalysisSections.add(reanalysisSection); + if (sectionText.isTable()) { reanalysisSection.setCellStarts(cellStarts); } + if (imageEntries.containsKey(sectionText.getSectionNumber())) { + reanalysisSection.getImages().addAll(imageEntries.get(sectionText.getSectionNumber())); + } + + reanalysisSections.add(reanalysisSection); } //-- @@ -208,14 +218,22 @@ public class ReanalyzeService { .tabularData(reanalysisSection.getTabularData()) .searchableText(reanalysisSection.getSearchableText()) .dictionary(dictionary) + .images(reanalysisSection.getImages()) .build(), reanalysisSection.getSearchableText())); } Set entities = new HashSet<>(); + Map> imagesPerPage = new HashMap<>(); sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair .getSection()); entities.addAll(analysedRowSection.getEntities()); + EntitySearchUtils.removeEntitiesContainedInLarger(entities); + + for (Image image : analysedRowSection.getImages()) { + imagesPerPage.computeIfAbsent(image.getPage(), (a) -> new HashSet<>()).add(image); + } + }); Map> entitiesPerPage = new HashMap<>(); @@ -241,6 +259,12 @@ public class ReanalyzeService { newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest .getManualRedactions(), page, renalyzeRequest.getRuleSetId())); } + + if (imagesPerPage.get(page) != null) { + newRedactionLogEntries.addAll(redactionLogCreatorService.addImageEntries(imagesPerPage, renalyzeRequest + .getManualRedactions(), page, renalyzeRequest.getRuleSetId())); + } + newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, renalyzeRequest .getRuleSetId())); } @@ -248,12 +272,13 @@ public class ReanalyzeService { Iterator itty = renalyzeRequest.getRedactionLog().getRedactionLogEntry().iterator(); while (itty.hasNext()) { RedactionLogEntry entry = itty.next(); - if (sectionsToReanaylse.contains(entry.getSectionNumber()) && !entry.isImage() || entry.getSectionNumber() == 0 && !entry.isImage()) { + if (sectionsToReanaylse.contains(entry.getSectionNumber())) { itty.remove(); } } renalyzeRequest.getRedactionLog().getRedactionLogEntry().addAll(newRedactionLogEntries); + renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build(); @@ -277,4 +302,19 @@ public class ReanalyzeService { .collect(Collectors.toSet()); } + + public Image convert(RedactionLogEntry entry) { + + Rectangle position = entry.getPositions().get(0); + + return Image.builder() + .type(entry.getType()) + .position(new Rectangle2D.Float(position.getTopLeft().getX(), position.getTopLeft() + .getY(), position.getWidth(), position.getHeight())) + .sectionNumber(entry.getSectionNumber()) + .section(entry.getSection()) + .page(position.getPage()) + .build(); + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java index 32fe5dc7..44a8941a 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java @@ -3,7 +3,6 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; import java.util.ArrayList; import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -30,8 +29,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; -import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType; -import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; +import com.iqser.red.service.redaction.v1.server.redaction.model.Image; import com.iqser.red.service.redaction.v1.server.redaction.utils.IdBuilder; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; @@ -65,47 +63,101 @@ public class RedactionLogCreatorService { .addAll(addManualAddEntries(manualRedactions.getEntriesToAdd(), manualRedactions.getComments(), page, ruleSetId)); } - if (!classifiedDoc.getPages().get(page - 1).getImages().isEmpty()) { - addImageEntries(classifiedDoc, page, ruleSetId); + if (classifiedDoc.getImages().get(page) != null && !classifiedDoc.getImages().get(page).isEmpty()) { + classifiedDoc.getRedactionLogEntities() + .addAll(addImageEntries(classifiedDoc.getImages(), manualRedactions, page, ruleSetId)); } } } - private void addImageEntries(Document classifiedDoc, int pageNumber, String ruleSetId) { + public List addImageEntries(Map> images, ManualRedactions manualRedactions, + int pageNumber, String ruleSetId) { + + List redactionLogEntities = new ArrayList<>(); + + for (Image image : images.get(pageNumber)) { + + String id = IdBuilder.buildId(image.getPosition(), pageNumber); - for (PdfImage image : classifiedDoc.getPages().get(pageNumber - 1).getImages()) { RedactionLogEntry redactionLogEntry = RedactionLogEntry.builder() - .id(IdBuilder.buildId(image.getPosition(), pageNumber)) - .color(getColor(image.getImageType().name().toLowerCase(Locale.ROOT), ruleSetId)) + .id(id) + .color(getColorForImage(image, ruleSetId, false)) .isImage(true) - .type(image.getImageType().equals(ImageType.OTHER) ? "image" : image.getImageType().name().toLowerCase(Locale.ROOT)) - .redacted(isImageRedactionType(image.getImageType())) - .isHint(!isImageRedactionType(image.getImageType())) + .type(image.getType()) + .redacted(image.isRedaction()) + .reason(image.getRedactionReason()) + .legalBasis(image.getLegalBasis()) + .matchedRule(image.getMatchedRule()) + .isHint(dictionaryService.isHint(image.getType(), ruleSetId)) .manual(false) .isDictionaryEntry(false) .isRecommendation(false) .positions(List.of(new Rectangle(new Point((float) image.getPosition() .getX(), (float) image.getPosition().getY()), (float) image.getPosition() .getWidth(), (float) image.getPosition().getHeight(), pageNumber))) + .sectionNumber(image.getSectionNumber()) + .section(image.getSection()) .build(); - classifiedDoc.getRedactionLogEntities().add(redactionLogEntry); - } - } + if (manualRedactions != null && !manualRedactions.getIdsToRemove().isEmpty()) { + for (IdRemoval manualRemoval : manualRedactions.getIdsToRemove()) { + if (manualRemoval.getId().equals(id)) { + String manualOverrideReason = null; + if (manualRemoval.getStatus().equals(Status.APPROVED)) { + image.setRedaction(false); + redactionLogEntry.setRedacted(false); + redactionLogEntry.setStatus(Status.APPROVED); + manualOverrideReason = image.getRedactionReason() + ", removed by manual override"; + redactionLogEntry.setColor(getColorForImage(image, ruleSetId, false)); + } else if (manualRemoval.getStatus().equals(Status.REQUESTED)) { + manualOverrideReason = image.getRedactionReason() + ", requested to remove"; + redactionLogEntry.setStatus(Status.REQUESTED); + redactionLogEntry.setColor(getColorForImage(image, ruleSetId, true)); + } else { + redactionLogEntry.setStatus(Status.DECLINED); + } - private boolean isImageRedactionType(ImageType imageType) { + image.setRedactionReason(manualOverrideReason != null ? manualOverrideReason : image.getRedactionReason()); + redactionLogEntry.setReason(manualOverrideReason); + redactionLogEntry.setManual(true); + redactionLogEntry.setManualRedactionType(ManualRedactionType.REMOVE); + } + } + } - if (imageType.equals(ImageType.LOGO)) { - return true; + if (manualRedactions != null && !manualRedactions.getForceRedacts().isEmpty()) { + for (ManualForceRedact manualForceRedact : manualRedactions.getForceRedacts()) { + if (manualForceRedact.getId().equals(id)) { + String manualOverrideReason = null; + if (manualForceRedact.getStatus().equals(Status.APPROVED)) { + image.setRedaction(true); + redactionLogEntry.setRedacted(true); + redactionLogEntry.setStatus(Status.APPROVED); + redactionLogEntry.setColor(getColorForImage(image, ruleSetId, false)); + manualOverrideReason = image.getRedactionReason() + ", forced by manual override"; + redactionLogEntry.setLegalBasis(manualForceRedact.getLegalBasis()); + } else if (manualForceRedact.getStatus().equals(Status.REQUESTED)) { + manualOverrideReason = image.getRedactionReason() + ", requested to force redact"; + redactionLogEntry.setStatus(Status.REQUESTED); + redactionLogEntry.setColor(getColorForImage(image, ruleSetId, true)); + redactionLogEntry.setLegalBasis(manualForceRedact.getLegalBasis()); + } else { + redactionLogEntry.setStatus(Status.DECLINED); + } + + image.setRedactionReason(manualOverrideReason != null ? manualOverrideReason : image.getRedactionReason()); + redactionLogEntry.setReason(manualOverrideReason); + redactionLogEntry.setManual(true); + redactionLogEntry.setManualRedactionType(ManualRedactionType.FORCE_REDACT); + } + } + } + + redactionLogEntities.add(redactionLogEntry); } - if (imageType.equals(ImageType.FORMULA)) { - return true; - } - if (imageType.equals(ImageType.SIGNATURE)) { - return true; - } - return false; + + return redactionLogEntities; } @@ -372,6 +424,18 @@ public class RedactionLogCreatorService { } + private float[] getColorForImage(Image image, String ruleSetId, boolean requestedToRemove) { + + if (requestedToRemove) { + return dictionaryService.getRequestRemoveColor(ruleSetId); + } + if (!image.isRedaction() && !dictionaryService.isHint(image.getType(), ruleSetId)) { + return dictionaryService.getNotRedactedColor(ruleSetId); + } + return dictionaryService.getColor(image.getType(), ruleSetId); + } + + private boolean isHint(Entity entity, String ruleSetId) { return dictionaryService.isHint(entity.getType(), ruleSetId); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java index 27c3bd52..5d88c0cb 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java @@ -71,6 +71,7 @@ public class PdfSegmentationService { page.setPageNumber(pageNumber); increaseDocumentStatistics(page, document); + page.setImages(stripper.getImages()); pages.add(page); } @@ -78,8 +79,8 @@ public class PdfSegmentationService { document.setPages(pages); classificationService.classifyDocument(document); - sectionsBuilderService.buildSections(document); + sectionsBuilderService.addImagesToSections(document); return document; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java index 73741a5f..41a8cf6f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java @@ -2,8 +2,12 @@ package com.iqser.red.service.redaction.v1.server.segmentation; import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.Iterator; import java.util.List; +import java.util.Map; +import java.util.SortedSet; +import java.util.TreeSet; import java.util.stream.Collectors; import org.apache.commons.collections4.CollectionUtils; @@ -16,6 +20,7 @@ import com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText; +import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; @@ -85,13 +90,13 @@ public class SectionsBuilderService { prev = current; } - if(!header.isEmpty()) { + if (!header.isEmpty()) { headers.add(new Header(header)); } - if(!footer.isEmpty()) { + if (!footer.isEmpty()) { footers.add(new Footer(footer)); } - if(!unclassifiedText.isEmpty()) { + if (!unclassifiedText.isEmpty()) { unclassifiedTexts.add(new UnclassifiedText(unclassifiedText)); } } @@ -107,6 +112,53 @@ public class SectionsBuilderService { } + public void addImagesToSections(Document document) { + + Map> paragraphMap = new HashMap<>(); + for (Paragraph paragraph : document.getParagraphs()) { + for (AbstractTextContainer container : paragraph.getPageBlocks()) { + paragraphMap.computeIfAbsent(container.getPage(), x -> new TreeSet<>()).add(paragraph); + } + } + + for (Page page : document.getPages()) { + for (PdfImage image : page.getImages()) { + SortedSet paragraphsOnPage = paragraphMap.get(page.getPageNumber()); + if (paragraphsOnPage == null) { + int i = page.getPageNumber(); + while (paragraphsOnPage == null) { + paragraphsOnPage = paragraphMap.get(i); + i--; + } + } + + Float perviousEnd = 0f; + for (Paragraph paragraph : paragraphsOnPage) { + Float currentEnd = 0f; + for (AbstractTextContainer abs : paragraph.getPageBlocks()) { + if (abs.getPage() != page.getPageNumber()) { + continue; + } + if (abs.getMaxY() > currentEnd) { + currentEnd = abs.getMaxY(); + } + } + + if (image.getPosition().getY() >= perviousEnd && image.getPosition().getY() <= currentEnd) { + paragraph.getImages().add(image); + image.setAppendedToParagraph(true); + } + perviousEnd = currentEnd; + } + if (!image.isAppendedToParagraph()) { + paragraphsOnPage.first().getImages().add(image); + image.setAppendedToParagraph(true); + } + } + } + } + + private void mergeTableMetadata(Table currentTable, Table previousTable) { // Distribute header information for subsequent tables diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index f48cd396..0c459750 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -4,12 +4,8 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.when; import static org.springframework.boot.test.context.SpringBootTest.WebEnvironment.RANDOM_PORT; -import java.awt.Color; -import java.awt.geom.AffineTransform; -import java.awt.geom.Rectangle2D; import java.io.BufferedReader; import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; @@ -21,7 +17,6 @@ import java.nio.charset.StandardCharsets; import java.time.OffsetDateTime; import java.util.ArrayList; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -29,15 +24,7 @@ import java.util.UUID; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.PDPageContentStream; -import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; -import org.apache.pdfbox.text.PDFTextStripperByArea; -import org.apache.pdfbox.util.Matrix; import org.junit.Before; -import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; import org.kie.api.KieServices; @@ -45,7 +32,6 @@ import org.kie.api.builder.KieBuilder; import org.kie.api.builder.KieFileSystem; import org.kie.api.builder.KieModule; import org.kie.api.runtime.KieContainer; -import org.mockito.MockitoAnnotations; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.boot.test.context.TestConfiguration; @@ -77,23 +63,14 @@ import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; import com.iqser.red.service.redaction.v1.model.RedactionRequest; import com.iqser.red.service.redaction.v1.model.RedactionResult; import com.iqser.red.service.redaction.v1.model.RenalyzeRequest; -import com.iqser.red.service.redaction.v1.model.SectionArea; import com.iqser.red.service.redaction.v1.model.SectionText; import com.iqser.red.service.redaction.v1.model.Status; -import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient; import com.iqser.red.service.redaction.v1.server.client.RulesClient; import com.iqser.red.service.redaction.v1.server.controller.RedactionController; -import com.iqser.red.service.redaction.v1.server.exception.RedactionException; -import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper; -import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; -import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; -import com.iqser.red.service.redaction.v1.server.redaction.model.ReanalysisSection; import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; @RunWith(SpringRunner.class) @SpringBootTest(webEnvironment = RANDOM_PORT) @@ -112,6 +89,10 @@ public class RedactionIntegrationTest { private static final String TEST_METHOD = "test_method"; private static final String PURITY = "purity"; private static final String IMAGE = "image"; + private static final String LOGO = "logo"; + private static final String SIGNATURE = "signature"; + private static final String FORMULA = "formula"; + private static final String OCR = "ocr"; private static final String RECOMMENDATION_AUTHOR = "recommendation_CBI_author"; private static final String RECOMMENDATION_ADDRESS = "recommendation_CBI_address"; @@ -196,6 +177,10 @@ public class RedactionIntegrationTest { when(dictionaryClient.getDictionaryForType(FALSE_POSITIVE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(FALSE_POSITIVE)); when(dictionaryClient.getDictionaryForType(PURITY, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(PURITY)); when(dictionaryClient.getDictionaryForType(IMAGE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(IMAGE)); + when(dictionaryClient.getDictionaryForType(OCR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(OCR)); + when(dictionaryClient.getDictionaryForType(LOGO, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(LOGO)); + when(dictionaryClient.getDictionaryForType(SIGNATURE, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(SIGNATURE)); + when(dictionaryClient.getDictionaryForType(FORMULA, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(FORMULA)); when(dictionaryClient.getColors(TEST_RULESET_ID)).thenReturn(colors); } @@ -278,7 +263,27 @@ public class RedactionIntegrationTest { .map(this::cleanDictionaryEntry) .collect(Collectors.toSet())); dictionary.computeIfAbsent(IMAGE, v -> new ArrayList<>()) - .addAll(ResourceLoader.load("dictionaries/image.txt") + .addAll(ResourceLoader.load("dictionaries/empty.txt") + .stream() + .map(this::cleanDictionaryEntry) + .collect(Collectors.toSet())); + dictionary.computeIfAbsent(OCR, v -> new ArrayList<>()) + .addAll(ResourceLoader.load("dictionaries/empty.txt") + .stream() + .map(this::cleanDictionaryEntry) + .collect(Collectors.toSet())); + dictionary.computeIfAbsent(LOGO, v -> new ArrayList<>()) + .addAll(ResourceLoader.load("dictionaries/empty.txt") + .stream() + .map(this::cleanDictionaryEntry) + .collect(Collectors.toSet())); + dictionary.computeIfAbsent(SIGNATURE, v -> new ArrayList<>()) + .addAll(ResourceLoader.load("dictionaries/empty.txt") + .stream() + .map(this::cleanDictionaryEntry) + .collect(Collectors.toSet())); + dictionary.computeIfAbsent(FORMULA, v -> new ArrayList<>()) + .addAll(ResourceLoader.load("dictionaries/empty.txt") .stream() .map(this::cleanDictionaryEntry) .collect(Collectors.toSet())); @@ -309,6 +314,10 @@ public class RedactionIntegrationTest { typeColorMap.put(FALSE_POSITIVE, "#ffffff"); typeColorMap.put(PURITY, "#ffe187"); typeColorMap.put(IMAGE, "#fcc5fb"); + typeColorMap.put(OCR, "#fcc5fb"); + typeColorMap.put(LOGO, "#ffe187"); + typeColorMap.put(FORMULA, "#ffe187"); + typeColorMap.put(SIGNATURE, "#ffe187"); hintTypeMap.put(VERTEBRATE, true); hintTypeMap.put(ADDRESS, false); @@ -326,6 +335,10 @@ public class RedactionIntegrationTest { hintTypeMap.put(FALSE_POSITIVE, true); hintTypeMap.put(PURITY, false); hintTypeMap.put(IMAGE, true); + hintTypeMap.put(OCR, true); + hintTypeMap.put(FORMULA, false); + hintTypeMap.put(LOGO, false); + hintTypeMap.put(SIGNATURE, false); caseInSensitiveMap.put(VERTEBRATE, true); caseInSensitiveMap.put(ADDRESS, false); @@ -343,6 +356,10 @@ public class RedactionIntegrationTest { caseInSensitiveMap.put(FALSE_POSITIVE, false); caseInSensitiveMap.put(PURITY, false); caseInSensitiveMap.put(IMAGE, true); + caseInSensitiveMap.put(OCR, true); + caseInSensitiveMap.put(SIGNATURE, true); + caseInSensitiveMap.put(LOGO, true); + caseInSensitiveMap.put(FORMULA, true); recommendationTypeMap.put(VERTEBRATE, false); recommendationTypeMap.put(ADDRESS, false); @@ -360,6 +377,10 @@ public class RedactionIntegrationTest { recommendationTypeMap.put(FALSE_POSITIVE, false); recommendationTypeMap.put(PURITY, false); recommendationTypeMap.put(IMAGE, false); + recommendationTypeMap.put(OCR, false); + recommendationTypeMap.put(FORMULA, false); + recommendationTypeMap.put(SIGNATURE, false); + recommendationTypeMap.put(LOGO, false); rankTypeMap.put(FALSE_POSITIVE, 160); rankTypeMap.put(PURITY, 155); @@ -377,6 +398,10 @@ public class RedactionIntegrationTest { rankTypeMap.put(RECOMMENDATION_AUTHOR, 40); rankTypeMap.put(RECOMMENDATION_ADDRESS, 30); rankTypeMap.put(IMAGE, 30); + rankTypeMap.put(OCR, 29); + rankTypeMap.put(LOGO, 28); + rankTypeMap.put(SIGNATURE, 27); + rankTypeMap.put(FORMULA, 26); colors.setDefaultColor("#acfc00"); colors.setNotRedacted("#cccccc"); @@ -563,7 +588,6 @@ public class RedactionIntegrationTest { ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder() .redactionLog(result.getRedactionLog()) .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .manualRedactions(null) .text(result.getText()) .ruleSetId(TEST_RULESET_ID) .build()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java index bee1719a..4f58b26d 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java @@ -2,12 +2,17 @@ package com.iqser.red.service.redaction.v1.server.segmentation; import static org.assertj.core.api.Assertions.assertThat; +import java.io.ByteArrayOutputStream; +import java.io.FileOutputStream; import java.io.IOException; import java.util.Collections; import java.util.List; import java.util.stream.Collectors; +import javax.imageio.ImageIO; + import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; import org.kie.api.runtime.KieContainer; @@ -18,7 +23,10 @@ import org.springframework.core.io.ClassPathResource; import org.springframework.test.context.junit4.SpringRunner; import com.iqser.red.service.redaction.v1.server.classification.model.Document; +import com.iqser.red.service.redaction.v1.server.classification.model.Page; import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; +import com.iqser.red.service.redaction.v1.server.redaction.model.Image; +import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; @@ -45,24 +53,25 @@ public class PdfSegmentationServiceTest { @Test - public void testPDFSegmentationWithComplexTable() throws IOException { + @Ignore + public void testExtractImages() throws IOException { - ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf"); try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { Document document = pdfSegmentationService.parseDocument(pdDocument); - assertThat(document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList())).isNotEmpty(); - Table table = document.getParagraphs() - .stream() - .flatMap(paragraph -> paragraph.getTables().stream()) - .collect(Collectors.toList()) - .get(0); - assertThat(table.getColCount()).isEqualTo(6); - assertThat(table.getRowCount()).isEqualTo(13); - assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13); + int i = 0; + for (Page page : document.getPages()) { + for (PdfImage image : page.getImages()) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + ImageIO.write(image.getImage(), "png", baos); + try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Image " + i + ".png")) { + fileOutputStream.write(baos.toByteArray()); + } + } + i++; + } + } } } @@ -97,12 +106,12 @@ public class PdfSegmentationServiceTest { .stream() .map(Collections::singletonList) .collect(Collectors.toList()); - assertThat(secondTable.getRows().stream() + assertThat(secondTable.getRows() + .stream() .allMatch(row -> row.stream() .map(Cell::getHeaderCells) .collect(Collectors.toList()) - .equals(firstTableHeaderCells))) - .isTrue(); + .equals(firstTableHeaderCells))).isTrue(); } } @@ -137,12 +146,12 @@ public class PdfSegmentationServiceTest { .stream() .map(Cell::getHeaderCells) .collect(Collectors.toList()); - assertThat(secondTable.getRows().stream() + assertThat(secondTable.getRows() + .stream() .allMatch(row -> row.stream() .map(Cell::getHeaderCells) .collect(Collectors.toList()) - .equals(firstTableHeaderCells))) - .isTrue(); + .equals(firstTableHeaderCells))).isTrue(); } } @@ -177,12 +186,12 @@ public class PdfSegmentationServiceTest { .stream() .map(Collections::singletonList) .collect(Collectors.toList()); - assertThat(secondTable.getRows().stream() + assertThat(secondTable.getRows() + .stream() .allMatch(row -> row.stream() .map(Cell::getHeaderCells) .collect(Collectors.toList()) - .equals(firstTableHeaderCells))) - .isTrue(); + .equals(firstTableHeaderCells))).isTrue(); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/image.txt b/redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/empty.txt similarity index 100% rename from redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/image.txt rename to redaction-service-v1/redaction-service-server-v1/src/test/resources/dictionaries/empty.txt diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/allAuthors.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/allAuthors.drl index cc6e709c..6d05e185 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/allAuthors.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/allAuthors.drl @@ -258,4 +258,28 @@ rule "22: Redact Must Redact" Section(matchesType("must_redact")) then section.redact("must_redact", 22, "Must Redact found", "Article 39(1)(2) of Regulation (EC) No 178/2002"); + end + + +rule "23: Redact signatures" + when + Section(matchesImageType("signature")) + then + section.redactImage("signature", 23, "Signature found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + end + + +rule "24: Redact formula" + when + Section(matchesImageType("formula")) + then + section.redactImage("formula", 24, "Formula found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + end + + +rule "25: Redact Logos" + when + Section(matchesImageType("logo")) + then + section.redactImage("logo", 25, "Logo found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); end \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl index a028a45f..a2a78200 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl +++ b/redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules.drl @@ -304,4 +304,28 @@ rule "25: Redact Purity" Section(searchText.contains("purity")) then section.redactByRegEx("purity ?:? (([\\d\\.]+)( .{0,4}\\.)? ?%)", true, 1, "purity", 17, "Purity found", "Reg (EC) No 1107/2009 Art. 63 (2a)"); + end + + +rule "26: Redact signatures" + when + Section(matchesImageType("signature")) + then + section.redactImage("signature", 26, "Signature found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + end + + +rule "27: Redact formula" + when + Section(matchesImageType("formula")) + then + section.redactImage("formula", 27, "Formula found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); + end + + +rule "28: Redact Logos" + when + Section(matchesImageType("logo")) + then + section.redactImage("logo", 28, "Logo found", "Reg (EC) No 1107/2009 Art. 63 (2g)"); end \ No newline at end of file