diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ReanalyzeResult.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ReanalyzeResult.java
new file mode 100644
index 00000000..963f3d70
--- /dev/null
+++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/ReanalyzeResult.java
@@ -0,0 +1,15 @@
+package com.iqser.red.service.redaction.v1.model;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+/** Response body of the {@code /reanalyze} endpoint. */
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+public class ReanalyzeResult {
+    // Redaction log handed back to the caller once reanalysis has finished.
+    private RedactionLog redactionLog;
+}
diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RenalyzeRequest.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RenalyzeRequest.java
new file mode 100644
index 00000000..e11fee5d
--- /dev/null
+++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/RenalyzeRequest.java
@@ -0,0 +1,22 @@
+package com.iqser.red.service.redaction.v1.model;
+
+import java.time.OffsetDateTime;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+/** Request body of the {@code /reanalyze} endpoint. */
+@Data
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+public class RenalyzeRequest {
+    // NOTE(review): the class name is spelled "Renalyze" (missing "a"); the REST interface refers to it by this name.
+    private byte[] document; // document bytes; loaded with PDFBox's PDDocument.load during reanalysis
+    private String ruleSetId; // selects the dictionary and the rule set used for reanalysis
+    private ManualRedactions manualRedactions; // may be null; reanalysis checks for that
+    private Text text; // section texts (with their areas) that are searched again
+    private RedactionLog redactionLog; // existing log; reanalysis updates it in place and returns it
+    private OffsetDateTime lastProcessed; // not read by any code visible in this change
+}
diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionArea.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionArea.java
index 
0944bcd0..3e02dce8 100644
--- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionArea.java
+++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/model/SectionArea.java
@@ -26,4 +26,24 @@ public class SectionArea {
 
     private String header;
 
+    /**
+     * Tells whether {@code other} lies on the same page as this area and entirely within its bounds.
+     * Touching edges count as contained.
+     */
+    public boolean contains(Rectangle other) {
+
+        if (page != other.getPage()) {
+            return false;
+        }
+
+        boolean coversHorizontally = this.topLeft.getX() <= other.getTopLeft().getX()
+                && this.topLeft.getX() + this.getWidth() >= other.getTopLeft().getX() + other.getWidth();
+        if (!coversHorizontally) {
+            return false;
+        }
+
+        return this.getTopLeft().getY() <= other.getTopLeft().getY()
+                && this.getTopLeft().getY() + this.getHeight() >= other.getTopLeft().getY() + other.getHeight();
+    }
+
 }
diff --git a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/resources/RedactionResource.java b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/resources/RedactionResource.java
index 8dfc4ebb..b58dcd9c 100644
--- a/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/resources/RedactionResource.java
+++ b/redaction-service-v1/redaction-service-api-v1/src/main/java/com/iqser/red/service/redaction/v1/resources/RedactionResource.java
@@ -4,8 +4,11 @@ import com.iqser.red.service.redaction.v1.model.AnalyzeRequest;
 import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
 import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
 import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
+import com.iqser.red.service.redaction.v1.model.ReanalyzeResult;
 import com.iqser.red.service.redaction.v1.model.RedactionRequest;
 import com.iqser.red.service.redaction.v1.model.RedactionResult;
+import com.iqser.red.service.redaction.v1.model.RenalyzeRequest;
+
 import org.springframework.http.MediaType;
 import org.springframework.web.bind.annotation.PathVariable;
 import org.springframework.web.bind.annotation.PostMapping;
@@ -21,6 +24,9 @@ public interface RedactionResource { @PostMapping(value = "/analyze", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE) AnalyzeResult analyze(@RequestBody AnalyzeRequest analyzeRequest); + @PostMapping(value = "/reanalyze", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE) + ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest); + @PostMapping(value = "/annotate", produces = MediaType.APPLICATION_JSON_VALUE, consumes = MediaType.APPLICATION_JSON_VALUE) AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest); diff --git a/redaction-service-v1/redaction-service-server-v1/pom.xml b/redaction-service-v1/redaction-service-server-v1/pom.xml index 6e8610fa..fba190d6 100644 --- a/redaction-service-v1/redaction-service-server-v1/pom.xml +++ b/redaction-service-v1/redaction-service-server-v1/pom.xml @@ -20,7 +20,7 @@ com.iqser.red.service configuration-service-api-v1 - 2.0.0 + 2.2.9 org.drools diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java index f3d41ab2..a9110337 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RedactionController.java @@ -4,10 +4,12 @@ import com.iqser.red.service.redaction.v1.model.AnalyzeRequest; import com.iqser.red.service.redaction.v1.model.AnalyzeResult; import com.iqser.red.service.redaction.v1.model.AnnotateRequest; import com.iqser.red.service.redaction.v1.model.AnnotateResponse; +import com.iqser.red.service.redaction.v1.model.ReanalyzeResult; 
import com.iqser.red.service.redaction.v1.model.RedactionLog; import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; import com.iqser.red.service.redaction.v1.model.RedactionRequest; import com.iqser.red.service.redaction.v1.model.RedactionResult; +import com.iqser.red.service.redaction.v1.model.RenalyzeRequest; import com.iqser.red.service.redaction.v1.model.SectionGrid; import com.iqser.red.service.redaction.v1.model.Text; import com.iqser.red.service.redaction.v1.resources.RedactionResource; @@ -18,13 +20,16 @@ import com.iqser.red.service.redaction.v1.server.redaction.service.AnnotationSer import com.iqser.red.service.redaction.v1.server.redaction.service.DictionaryService; import com.iqser.red.service.redaction.v1.server.redaction.service.DroolsExecutionService; import com.iqser.red.service.redaction.v1.server.redaction.service.EntityRedactionService; +import com.iqser.red.service.redaction.v1.server.redaction.service.ReanalyzeService; import com.iqser.red.service.redaction.v1.server.redaction.service.RedactionLogCreatorService; import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; import com.iqser.red.service.redaction.v1.server.visualization.service.PdfVisualisationService; + import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; + import org.apache.pdfbox.pdmodel.PDDocument; import org.springframework.web.bind.annotation.PathVariable; import org.springframework.web.bind.annotation.RequestBody; @@ -47,6 +52,7 @@ public class RedactionController implements RedactionResource { private final DroolsExecutionService droolsExecutionService; private final DictionaryService dictionaryService; private final AnnotationService annotationService; + private final ReanalyzeService reanalyzeService; @Override @@ -68,7 +74,7 @@ public class 
RedactionController implements RedactionResource {
         return AnalyzeResult.builder()
                 .sectionGrid(classifiedDoc.getSectionGrid())
                 .redactionLog(new RedactionLog(classifiedDoc.getRedactionLogEntities(), classifiedDoc.getDictionaryVersion(), classifiedDoc
-                        .getRulesVersion(), analyzeRequest.getRuleSetId()))
+                .getRulesVersion(), analyzeRequest.getRuleSetId()))
                 .numberOfPages(classifiedDoc.getPages().size())
                 .text(new Text(classifiedDoc.getSectionText()))
                 .build();
@@ -80,6 +86,12 @@ public class RedactionController implements RedactionResource {
     }
 
 
+    public ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest) {
+
+        return reanalyzeService.reanalyze(renalyzeRequest);
+    }
+
+
     public AnnotateResponse annotate(@RequestBody AnnotateRequest annotateRequest) {
 
         try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(annotateRequest.getDocument()))) {
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFAreaTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFAreaTextStripper.java
new file mode 100644
index 00000000..7e2e56c8
--- /dev/null
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFAreaTextStripper.java
@@ -0,0 +1,103 @@
+package com.iqser.red.service.redaction.v1.server.parsing;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.pdfbox.text.PDFTextStripperByArea;
+import org.apache.pdfbox.text.TextPosition;
+
+import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
+
+import lombok.Getter;
+import lombok.Setter;
+
+/**
+ * Area text stripper that also records the glyph positions handed to {@link #writeString} as
+ * {@link TextPositionSequence}s, split at spaces and wherever the x position jumps backwards.
+ */
+public class PDFAreaTextStripper extends PDFTextStripperByArea {
+
+    private static final String SPACE = " ";
+    private static final String NO_BREAK_SPACE = "\u00A0";
+
+    @Getter
+    private List<TextPositionSequence> textPositionSequences = new ArrayList<>();
+
+    @Setter
+    private int pageNumber;
+
+
+    public PDFAreaTextStripper() throws IOException {
+
+    }
+
+
+    /**
+     * Splits {@code textPositions} into sequences, appends them to the recorded sequences and then
+     * forwards {@code text} to the superclass.
+     *
+     * @throws IOException if the superclass fails to write {@code text}
+     */
+    @Override
+    public void writeString(String text, List<TextPosition> textPositions) throws IOException {
+
+        int startIndex = 0;
+        for (int i = 0; i < textPositions.size(); i++) {
+
+            boolean space = isSpace(textPositions.get(i));
+
+            if (i == 0) {
+                // A leading space is skipped; nothing else can end at the first glyph.
+                if (space) {
+                    startIndex++;
+                }
+                continue;
+            }
+
+            // Strange but sometimes this is happening, for example: Metolachlor2.pdf
+            if (textPositions.get(i).getX() < textPositions.get(i - 1).getX()) {
+                addSequence(textPositions.subList(startIndex, i));
+                startIndex = i;
+            }
+
+            // A space closes the current sequence unless it is the last glyph, which is trimmed below.
+            if (space && i <= textPositions.size() - 2) {
+                addSequence(textPositions.subList(startIndex, i));
+                startIndex = i + 1;
+            }
+        }
+
+        List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
+        if (!sublist.isEmpty() && isSpace(sublist.get(sublist.size() - 1))) {
+            sublist = sublist.subList(0, sublist.size() - 1);
+        }
+        addSequence(sublist);
+        super.writeString(text);
+    }
+
+
+    public void clearPositions() {
+        // A new list is assigned rather than clearing the old one, so a list obtained earlier keeps its content.
+        textPositionSequences = new ArrayList<>();
+    }
+
+
+    /** Records {@code candidate} unless it is empty or consists of a single space. */
+    private void addSequence(List<TextPosition> candidate) {
+
+        boolean loneSpace = candidate.size() == 1 && isSpace(candidate.get(0));
+        if (!candidate.isEmpty() && !loneSpace) {
+            textPositionSequences.add(new TextPositionSequence(candidate, pageNumber));
+        }
+    }
+
+
+    /** Tells whether the glyph is a plain space or a no-break space (U+00A0). */
+    private static boolean isSpace(TextPosition position) {
+
+        String unicode = position.getUnicode();
+        return SPACE.equals(unicode) || NO_BREAK_SPACE.equals(unicode);
+    }
+
+}
diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryIncrement.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryIncrement.java
new file mode 100644
index 00000000..86362741
--- /dev/null
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryIncrement.java
@@ -0,0 +1,18 @@
+package com.iqser.red.service.redaction.v1.server.redaction.model;
+
+import java.util.Set;
+
+import lombok.AllArgsConstructor;
+import lombok.Data;
+
+/**
+ * Dictionary values that changed after a given dictionary version, together with the version they were read at.
+ */
+@Data
+@AllArgsConstructor
+public class DictionaryIncrement {
+
+    private Set<String> values;
+    private long dictionaryVersion;
+
+}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryModel.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryModel.java
index dd8aecc1..c4b0ce7c 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryModel.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/DictionaryModel.java
@@ -3,6 +3,10 @@ package com.iqser.red.service.redaction.v1.server.redaction.model;
 
 import java.io.Serializable;
+import java.util.Locale;
 import java.util.Set;
+import java.util.stream.Collectors;
+
+import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry;
 
 import lombok.AllArgsConstructor;
 import lombok.Data;
@@ -17,11 +21,26 @@ public class DictionaryModel implements Serializable {
     private boolean caseInsensitive;
     private boolean hint;
     private boolean recommendation;
-    private Set<String> entries;
+    private Set<DictionaryEntry> entries;
     private Set<String> localEntries;
 
+    /**
+     * Returns the dictionary values to search for.
+     *
+     * @param local {@code true} for the local entries, {@code false} for the configured, non-deleted entries
+     * @return the requested values; lower-cased for case-insensitive dictionaries when {@code local} is {@code false}
+     */
     public Set<String> getValues(boolean local){
-        return local ? localEntries : entries;
+        if (local) {
+            return localEntries;
+        }
+        // Loaded entries keep their original case, so values of case-insensitive dictionaries are folded here.
+        // NOTE(review): assumes caseInsensitive mirrors the dictionary type's setting; confirm.
+        return entries.stream()
+                .filter(entry -> !entry.isDeleted())
+                .map(DictionaryEntry::getValue)
+                .map(value -> caseInsensitive ? value.toLowerCase(Locale.ROOT) : value)
+                .collect(Collectors.toSet());
     }
 
 }
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/ReanalysisSection.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/ReanalysisSection.java
new file mode 100644
index 00000000..df867485
--- /dev/null
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/ReanalysisSection.java
@@ -0,0 +1,40 @@
+package com.iqser.red.service.redaction.v1.server.redaction.model;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
+
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/**
+ * Text of one section as re-extracted from the document for reanalysis.
+ */
+@Data
+@NoArgsConstructor
+public class ReanalysisSection {
+
+    private int sectionNumber;
+    private String headline;
+    private List<TextBlock> textBlocks;
+    private Map<String, CellValue> tabularData = new HashMap<>();
+    private List<Integer> cellStarts;
+
+
+    /**
+     * Builds a new {@link SearchableText} from the sequences of all text blocks on every call.
+     */
+    public SearchableText getSearchableText() {
+
+        SearchableText searchableText = new SearchableText();
+        textBlocks.forEach(block -> {
+            if (block instanceof TextBlock) {
+                searchableText.addAll(block.getSequences());
+            }
+        });
+        return searchableText;
+    }
+
+}
diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DictionaryService.java 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/DictionaryService.java @@ -1,42 +1,45 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; -import com.iqser.red.service.configuration.v1.api.model.Colors; -import com.iqser.red.service.configuration.v1.api.model.TypeResponse; -import com.iqser.red.service.configuration.v1.api.model.TypeResult; -import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; -import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; -import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel; -import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryRepresentation; -import feign.FeignException; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; -import org.apache.commons.collections4.CollectionUtils; -import org.apache.commons.lang3.SerializationUtils; -import org.springframework.stereotype.Service; - import java.awt.Color; import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.SerializationUtils; +import org.springframework.stereotype.Service; + +import com.iqser.red.service.configuration.v1.api.model.Colors; +import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry; +import com.iqser.red.service.configuration.v1.api.model.TypeResponse; +import com.iqser.red.service.configuration.v1.api.model.TypeResult; +import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; +import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; +import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement; +import 
com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryModel; +import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryRepresentation; + +import feign.FeignException; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + @Slf4j @Service @RequiredArgsConstructor public class DictionaryService { - private final DictionaryClient dictionaryClient; - private Map dictionariesByRuleSets = new HashMap<>(); - public void updateDictionary(String ruleSetId) { + /* Calls updateDictionaryEntry when no dictionary was found or the client reports a newer version; returns the client's version. */ public long updateDictionary(String ruleSetId) { long version = dictionaryClient.getVersion(ruleSetId); @@ -45,6 +48,26 @@ public class DictionaryService { if (foundDictionary == null || version > foundDictionary.getDictionaryVersion()) { updateDictionaryEntry(ruleSetId, version); } + + return version; + } + + /** Updates the dictionary for the rule set, then collects the values of all entries, across all dictionary models, whose version is greater than fromVersion. Deleted entries are not filtered out here. */ + public DictionaryIncrement getDictionaryIncrements(String ruleSetId, long fromVersion) { + + long version = updateDictionary(ruleSetId); + + Set newValues = new HashSet<>(); + List dictionaryModels = dictionariesByRuleSets.get(ruleSetId).getDictionary(); + dictionaryModels.forEach(dictionaryModel -> { + dictionaryModel.getEntries().forEach(dictionaryEntry -> { + if (dictionaryEntry.getVersion() > fromVersion) { + newValues.add(dictionaryEntry.getValue()); + } + }); + }); + + return new DictionaryIncrement(newValues, version); } @@ -63,7 +86,6 @@ public class DictionaryService { .sorted(Comparator.comparingInt(DictionaryModel::getRank).reversed()) .collect(Collectors.toList()); - dictionary.forEach(dm -> dictionaryRepresentation.getLocalAccessMap().put(dm.getType(), dm)); Colors colors = dictionaryClient.getColors(ruleSetId); @@ -86,6 +108,7 @@ public class DictionaryService { public void updateExternalDictionary(Dictionary dictionary, String ruleSetId) { + dictionary.getDictionaryModels().forEach(dm -> { if (dm.isRecommendation() && !dm.getLocalEntries().isEmpty()) { dictionaryClient.addEntries(dm.getType(), ruleSetId, new ArrayList<>(dm.getLocalEntries()), 
false); @@ -98,17 +121,15 @@ public class DictionaryService { } - private Set convertEntries(TypeResult t) { + private Set convertEntries(TypeResult t) { + /* NOTE(review): the forEach below discards the String returned by toLowerCase, so entries of case-insensitive types are returned unchanged, unlike the removed stream-based code, which collected lower-cased values. */ + Set entries = new HashSet<>(dictionaryClient.getDictionaryForType(t.getType(), t.getRuleSetId()) + .getEntries()); if (t.isCaseInsensitive()) { - return dictionaryClient.getDictionaryForType(t.getType(), t.getRuleSetId()) - .getEntries() - .stream() - .map(String::toLowerCase) - .collect(Collectors.toSet()); - } else { - return new HashSet<>(dictionaryClient.getDictionaryForType(t.getType(), t.getRuleSetId()).getEntries()); + entries.forEach(entry -> entry.getValue().toLowerCase(Locale.ROOT)); } + return entries; } @@ -148,6 +169,7 @@ public class DictionaryService { return false; } + public boolean isRecommendation(String type, String ruleSetId) { DictionaryModel model = dictionariesByRuleSets.get(ruleSetId).getLocalAccessMap().get(type); @@ -159,6 +181,7 @@ public class DictionaryService { public Dictionary getDeepCopyDictionary(String ruleSetId) { + List copy = new ArrayList<>(); var representation = dictionariesByRuleSets.get(ruleSetId); @@ -170,15 +193,22 @@ public class DictionaryService { return new Dictionary(copy, representation.getDictionaryVersion()); } + public float[] getRequestRemoveColor(String ruleSetId) { + /* NOTE(review): returns the request-ADD color although the method is named for the REMOVE color; looks like a copy/paste slip, confirm. */ return dictionariesByRuleSets.get(ruleSetId).getRequestAddColor(); } + public float[] getNotRedactedColor(String ruleSetId) { + return dictionariesByRuleSets.get(ruleSetId).getNotRedactedColor(); } + public float[] getRequestAddColor(String ruleSetId) { + return dictionariesByRuleSets.get(ruleSetId).getRequestAddColor(); } + } \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index c8d235c8..07a0a37a 
100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -349,7 +349,7 @@ public class EntityRedactionService { } - private Set findEntities(SearchableText searchableText, String headline, int sectionNumber, + public Set findEntities(SearchableText searchableText, String headline, int sectionNumber, Dictionary dictionary, boolean local) { Set found = new HashSet<>(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java new file mode 100644 index 00000000..f868490a --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/ReanalyzeService.java @@ -0,0 +1,281 @@ +package com.iqser.red.service.redaction.v1.server.redaction.service; + +import java.awt.geom.Rectangle2D; +import java.io.ByteArrayInputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.kie.api.runtime.KieContainer; +import org.springframework.stereotype.Service; +import org.springframework.web.bind.annotation.RequestBody; + +import com.iqser.red.service.redaction.v1.model.Comment; +import com.iqser.red.service.redaction.v1.model.IdRemoval; +import 
com.iqser.red.service.redaction.v1.model.ManualForceRedact; +import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry; +import com.iqser.red.service.redaction.v1.model.ManualRedactions; +import com.iqser.red.service.redaction.v1.model.ReanalyzeResult; +import com.iqser.red.service.redaction.v1.model.Rectangle; +import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; +import com.iqser.red.service.redaction.v1.model.RenalyzeRequest; +import com.iqser.red.service.redaction.v1.model.SectionArea; +import com.iqser.red.service.redaction.v1.model.SectionText; +import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.exception.RedactionException; +import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper; +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; +import com.iqser.red.service.redaction.v1.server.redaction.model.Dictionary; +import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncrement; +import com.iqser.red.service.redaction.v1.server.redaction.model.Entity; +import com.iqser.red.service.redaction.v1.server.redaction.model.EntityPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.model.ReanalysisSection; +import com.iqser.red.service.redaction.v1.server.redaction.model.Section; +import com.iqser.red.service.redaction.v1.server.redaction.model.SectionSearchableTextPair; +import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; + +import lombok.RequiredArgsConstructor; + +@Service +@RequiredArgsConstructor +public class ReanalyzeService { + + private final DictionaryService dictionaryService; + private final DroolsExecutionService droolsExecutionService; + private final 
SurroundingWordsService surroundingWordsService; + private final EntityRedactionService entityRedactionService; + private final RedactionLogCreatorService redactionLogCreatorService; + + + public ReanalyzeResult reanalyze(@RequestBody RenalyzeRequest renalyzeRequest) { + + DictionaryIncrement dictionaryIncrement = dictionaryService.getDictionaryIncrements(renalyzeRequest.getRuleSetId(), renalyzeRequest + .getRedactionLog() + .getDictionaryVersion()); + + Set manualForceAndRemoveIds = getForceAndRemoveIds(renalyzeRequest.getManualRedactions()); + Map> comments = null; + Set manualAdds = null; + + if (renalyzeRequest.getManualRedactions() != null) { + // TODO comments will be removed from redactionLog, so we ignore this first. + comments = renalyzeRequest.getManualRedactions().getComments(); + manualAdds = renalyzeRequest.getManualRedactions().getEntriesToAdd(); + } + + Set sectionsToReanaylse = new HashSet<>(); + for (RedactionLogEntry entry : renalyzeRequest.getRedactionLog().getRedactionLogEntry()) { + if (entry.isManual() || manualForceAndRemoveIds.contains(entry.getId())) { + sectionsToReanaylse.add(entry.getSectionNumber()); + } + } + + for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) { + Set entities = EntitySearchUtils.find(sectionText.getText(), dictionaryIncrement.getValues(), "find", sectionText + .getHeadline(), sectionText.getSectionNumber(), false); + if (!entities.isEmpty()) { + sectionsToReanaylse.add(sectionText.getSectionNumber()); + } + + if (manualAdds != null) { + for (SectionArea sectionArea : sectionText.getSectionAreas()) { + for (ManualRedactionEntry manualAdd : manualAdds) { + for (Rectangle manualPosition : manualAdd.getPositions()) { + if (sectionArea.contains(manualPosition)) { + manualAdd.setSection(sectionText.getHeadline()); + manualAdd.setSectionNumber(sectionText.getSectionNumber()); + } + } + } + } + } + } + + if (sectionsToReanaylse.isEmpty() && (manualAdds == null || manualAdds.isEmpty())) { + 
renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); + return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build(); + } + + try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(renalyzeRequest.getDocument()))) { + + List reanalysisSections = new ArrayList<>(); + for (SectionText sectionText : renalyzeRequest.getText().getSectionTexts()) { + + if (!sectionsToReanaylse.contains(sectionText.getSectionNumber())) { + continue; + } + + ReanalysisSection reanalysisSection = new ReanalysisSection(); + reanalysisSection.setHeadline(sectionText.getHeadline()); + reanalysisSection.setSectionNumber(sectionText.getSectionNumber()); + List textBlocks = new ArrayList<>(); + + Map> sectionAreasPerPage = new HashMap<>(); + for (SectionArea sectionArea : sectionText.getSectionAreas()) { + sectionAreasPerPage.computeIfAbsent(sectionArea.getPage(), (x) -> new ArrayList<>()) + .add(sectionArea); + } + + Map tabularData = new HashMap<>(); + List cellStarts = new ArrayList<>(); + for (Integer page : sectionAreasPerPage.keySet()) { + List areasOnPage = sectionAreasPerPage.get(page); + + PDPage pdPage = pdDocument.getPage(page - 1); + PDRectangle cropBox = pdPage.getCropBox(); + PDFAreaTextStripper textStripper = new PDFAreaTextStripper(); + textStripper.setPageNumber(page); + + int cellStart = 0; + for (SectionArea sectionArea : areasOnPage) { + + Rectangle2D rect = null; + if (pdPage.getRotation() == 90) { + rect = new Rectangle2D.Float(sectionArea.getTopLeft().getY(), sectionArea.getTopLeft() + .getX(), sectionArea.getHeight(), sectionArea.getWidth() + 0.001f); + } else { + rect = new Rectangle2D.Float(sectionArea.getTopLeft().getX(), -sectionArea.getTopLeft() + .getY() + cropBox.getUpperRightY() - sectionArea.getHeight(), sectionArea.getWidth(), sectionArea + .getHeight() + 0.001f); + } + + textStripper.addRegion(String.valueOf(1), rect); + textStripper.extractRegions(pdPage); + 
textStripper.getTextForRegion(String.valueOf(1)); + List positions = textStripper.getTextPositionSequences(); + + TextBlock textBlock = new TextBlock(sectionArea.getTopLeft().getX(), sectionArea.getTopLeft() + .getX() + sectionArea.getWidth(), sectionArea.getTopLeft() + .getY(), sectionArea.getTopLeft().getY() + sectionArea.getHeight(), positions, 0); + + if (sectionText.isTable()) { + Cell cell = new Cell(); + cell.addTextBlock(textBlock); + tabularData.put(sectionArea.getHeader(), new CellValue(cell.getTextBlocks(), cellStart)); + cellStarts.add(cellStart); + cellStart = cellStart + cell.toString().trim().length() + 1; + } + + textBlocks.add(textBlock); + textStripper.clearPositions(); + } + + } + reanalysisSection.setTextBlocks(textBlocks); + reanalysisSection.setTabularData(tabularData); + reanalysisSections.add(reanalysisSection); + if (sectionText.isTable()) { + reanalysisSection.setCellStarts(cellStarts); + } + } + + //-- + + KieContainer kieContainer = droolsExecutionService.updateRules(renalyzeRequest.getRuleSetId()); + + Dictionary dictionary = dictionaryService.getDeepCopyDictionary(renalyzeRequest.getRuleSetId()); + + List sectionSearchableTextPairs = new ArrayList<>(); + for (ReanalysisSection reanalysisSection : reanalysisSections) { + + Set entities = entityRedactionService.findEntities(reanalysisSection.getSearchableText(), reanalysisSection + .getHeadline(), reanalysisSection.getSectionNumber(), dictionary, false); + if (reanalysisSection.getCellStarts() != null) { + surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary, reanalysisSection + .getCellStarts()); + } else { + surroundingWordsService.addSurroundingText(entities, reanalysisSection.getSearchableText(), dictionary); + } + + sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() + .isLocal(false) + .dictionaryTypes(dictionary.getTypes()) + .entities(entities) + 
.text(reanalysisSection.getSearchableText().getAsStringWithLinebreaks()) + .searchText(reanalysisSection.getSearchableText().toString()) + .headline(reanalysisSection.getHeadline()) + .sectionNumber(reanalysisSection.getSectionNumber()) + .tabularData(reanalysisSection.getTabularData()) + .searchableText(reanalysisSection.getSearchableText()) + .dictionary(dictionary) + .build(), reanalysisSection.getSearchableText())); + } + + Set entities = new HashSet<>(); + sectionSearchableTextPairs.forEach(sectionSearchableTextPair -> { + Section analysedRowSection = droolsExecutionService.executeRules(kieContainer, sectionSearchableTextPair + .getSection()); + entities.addAll(analysedRowSection.getEntities()); + }); + + Map> entitiesPerPage = new HashMap<>(); + for (Entity entity : entities) { + Map> sequenceOnPage = new HashMap<>(); + for (EntityPositionSequence entityPositionSequence : entity.getPositionSequences()) { + sequenceOnPage.computeIfAbsent(entityPositionSequence.getPageNumber(), (x) -> new ArrayList<>()) + .add(entityPositionSequence); + } + + for (Map.Entry> entry : sequenceOnPage.entrySet()) { + entitiesPerPage.computeIfAbsent(entry.getKey(), (x) -> new ArrayList<>()) + .add(new Entity(entity.getWord(), entity.getType(), entity.isRedaction(), entity.getRedactionReason(), entry + .getValue(), entity.getHeadline(), entity.getMatchedRule(), entity.getSectionNumber(), entity + .getLegalBasis(), entity.isDictionaryEntry(), entity.getTextBefore(), entity.getTextAfter(), entity + .getStart(), entity.getEnd())); + } + } + + List newRedactionLogEntries = new ArrayList<>(); + for (int page = 1; page <= pdDocument.getNumberOfPages(); page++) { + if (entitiesPerPage.get(page) != null) { + newRedactionLogEntries.addAll(redactionLogCreatorService.addEntries(entitiesPerPage, renalyzeRequest + .getManualRedactions(), page, renalyzeRequest.getRuleSetId())); + } + newRedactionLogEntries.addAll(redactionLogCreatorService.addManualAddEntries(manualAdds, comments, page, 
renalyzeRequest + .getRuleSetId())); + } + + Iterator itty = renalyzeRequest.getRedactionLog().getRedactionLogEntry().iterator(); + while (itty.hasNext()) { + RedactionLogEntry entry = itty.next(); + if (sectionsToReanaylse.contains(entry.getSectionNumber()) || entry.getSectionNumber() == 0) { + itty.remove(); + } + } + + renalyzeRequest.getRedactionLog().getRedactionLogEntry().addAll(newRedactionLogEntries); + renalyzeRequest.getRedactionLog().setDictionaryVersion(dictionaryIncrement.getDictionaryVersion()); + + return ReanalyzeResult.builder().redactionLog(renalyzeRequest.getRedactionLog()).build(); + + } catch (Exception e) { + throw new RedactionException(e); + } + + } + + + private Set getForceAndRemoveIds(ManualRedactions manualRedactions) { /* Collects the getId() value of every entry in getIdsToRemove() and of every entry in getForceRedacts() into a single set. */ + + if (manualRedactions == null) { /* no manual redactions supplied: answer with a new empty HashSet instead of null */ + return new HashSet<>(); + } + + return Stream.concat(manualRedactions.getIdsToRemove() /* NOTE(review): both collections are dereferenced without a null check - confirm ManualRedactions always initialises them */ + .stream() + .map(IdRemoval::getId), manualRedactions.getForceRedacts().stream().map(ManualForceRedact::getId)) + .collect(Collectors.toSet()); /* duplicates between the two sources collapse because the result is a set */ + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java index afea9472..6b5278ae 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionLogCreatorService.java @@ -4,6 +4,7 @@ import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -55,11 +56,11 @@ public class RedactionLogCreatorService { 
addSectionGrid(classifiedDoc, page); if (classifiedDoc.getEntities().get(page) != null) { - addEntries(classifiedDoc, manualRedactions, page, ruleSetId); + classifiedDoc.getRedactionLogEntities().addAll(addEntries(classifiedDoc.getEntities(), manualRedactions, page, ruleSetId)); } if (manualRedactionPages.contains(page)) { - addManualEntries(classifiedDoc, manualRedactions, page, ruleSetId); + classifiedDoc.getRedactionLogEntities().addAll(addManualAddEntries(manualRedactions.getEntriesToAdd(), manualRedactions.getComments(), page, ruleSetId)); } if (!classifiedDoc.getPages().get(page - 1).getImageBounds().isEmpty()) { @@ -106,13 +107,15 @@ public class RedactionLogCreatorService { } - private void addEntries(Document classifiedDoc, ManualRedactions manualRedactions, int page, String ruleSetId) { + public List addEntries(Map> entities, ManualRedactions manualRedactions, int page, String ruleSetId) { + + List redactionLogEntities = new ArrayList<>(); // Duplicates can exist due table extraction colums over multiple rows. Set processedIds = new HashSet<>(); entityLoop: - for (Entity entity : classifiedDoc.getEntities().get(page)) { + for (Entity entity : entities.get(page)) { List comments = null; @@ -201,10 +204,12 @@ public class RedactionLogCreatorService { // FIXME ids should never be null. Figure out why this happens. 
if (redactionLogEntry.getId() != null) { - classifiedDoc.getRedactionLogEntities().add(redactionLogEntry); + redactionLogEntities.add(redactionLogEntry); } } } + + return redactionLogEntities; } @@ -233,14 +238,16 @@ public class RedactionLogCreatorService { } - private void addManualEntries(Document classifiedDoc, ManualRedactions manualRedactions, int page, + public List addManualAddEntries(Set manualAdds, Map> comments, int page, String ruleSetId) { - if (manualRedactions == null) { - return; + List redactionLogEntities = new ArrayList<>(); + + if (manualAdds == null) { + return redactionLogEntities; } - for (ManualRedactionEntry manualRedactionEntry : manualRedactions.getEntriesToAdd()) { + for (ManualRedactionEntry manualRedactionEntry : manualAdds) { String id = manualRedactionEntry.getId(); @@ -254,11 +261,13 @@ public class RedactionLogCreatorService { } } - redactionLogEntry.setComments(manualRedactions.getComments().get(id)); + redactionLogEntry.setComments(comments.get(id)); if (!rectanglesOnPage.isEmpty() && !approvedAndShouldBeInDictionary(manualRedactionEntry)) { - classifiedDoc.getRedactionLogEntities().add(redactionLogEntry); + redactionLogEntities.add(redactionLogEntry); } } + + return redactionLogEntities; } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java index a7c4fbba..b6efcb3f 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java @@ -11,10 +11,12 @@ import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizati import lombok.Data; import 
lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; @SuppressWarnings("serial") @Data @EqualsAndHashCode(callSuper = true) +@NoArgsConstructor public class Cell extends Rectangle { private List textBlocks = new ArrayList<>(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 7de575c6..f971651c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -4,8 +4,12 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.when; import static org.springframework.boot.test.context.SpringBootTest.WebEnvironment.RANDOM_PORT; +import java.awt.Color; +import java.awt.geom.AffineTransform; +import java.awt.geom.Rectangle2D; import java.io.BufferedReader; import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; @@ -17,6 +21,7 @@ import java.nio.charset.StandardCharsets; import java.time.OffsetDateTime; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -24,7 +29,15 @@ import java.util.UUID; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; +import 
org.apache.pdfbox.text.PDFTextStripperByArea; +import org.apache.pdfbox.util.Matrix; import org.junit.Before; +import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; import org.kie.api.KieServices; @@ -32,6 +45,7 @@ import org.kie.api.builder.KieBuilder; import org.kie.api.builder.KieFileSystem; import org.kie.api.builder.KieModule; import org.kie.api.runtime.KieContainer; +import org.mockito.MockitoAnnotations; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.boot.test.context.TestConfiguration; @@ -42,6 +56,7 @@ import org.springframework.test.context.junit4.SpringRunner; import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.configuration.v1.api.model.Colors; +import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry; import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse; import com.iqser.red.service.configuration.v1.api.model.RulesResponse; import com.iqser.red.service.configuration.v1.api.model.TypeResponse; @@ -56,17 +71,28 @@ import com.iqser.red.service.redaction.v1.model.ManualForceRedact; import com.iqser.red.service.redaction.v1.model.ManualRedactionEntry; import com.iqser.red.service.redaction.v1.model.ManualRedactions; import com.iqser.red.service.redaction.v1.model.Point; +import com.iqser.red.service.redaction.v1.model.ReanalyzeResult; import com.iqser.red.service.redaction.v1.model.Rectangle; import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; import com.iqser.red.service.redaction.v1.model.RedactionRequest; import com.iqser.red.service.redaction.v1.model.RedactionResult; +import com.iqser.red.service.redaction.v1.model.RenalyzeRequest; +import com.iqser.red.service.redaction.v1.model.SectionArea; import com.iqser.red.service.redaction.v1.model.SectionText; import com.iqser.red.service.redaction.v1.model.Status; +import 
com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; +import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; import com.iqser.red.service.redaction.v1.server.client.DictionaryClient; import com.iqser.red.service.redaction.v1.server.client.RulesClient; import com.iqser.red.service.redaction.v1.server.controller.RedactionController; +import com.iqser.red.service.redaction.v1.server.exception.RedactionException; +import com.iqser.red.service.redaction.v1.server.parsing.PDFAreaTextStripper; +import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; +import com.iqser.red.service.redaction.v1.server.redaction.model.ReanalysisSection; import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; +import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; @RunWith(SpringRunner.class) @SpringBootTest(webEnvironment = RANDOM_PORT) @@ -112,6 +138,7 @@ public class RedactionIntegrationTest { private final Map recommendationTypeMap = new HashMap<>(); private final Map rankTypeMap = new HashMap<>(); private final Colors colors = new Colors(); + private final Map reanlysisVersions = new HashMap<>(); private final static String TEST_RULESET_ID = "123"; @@ -376,7 +403,7 @@ public class RedactionIntegrationTest { return DictionaryResponse.builder() .hexColor(typeColorMap.get(type)) - .entries(dictionary.get(type)) + .entries(toDictionaryEntry(dictionary.get(type))) .isHint(hintTypeMap.get(type)) .isCaseInsensitive(caseInSensitiveMap.get(type)) .isRecommendation(recommendationTypeMap.get(type)) @@ -385,6 +412,15 @@ public class RedactionIntegrationTest { } + private List toDictionaryEntry(List entries){ /* Test helper: wraps every value of entries in a DictionaryEntry and returns them as a new list, in iteration order. NOTE(review): the second and third constructor arguments are presumably a version number and a deleted flag - confirm against DictionaryEntry, which is not visible here. */ + List dictionaryEntries = new ArrayList<>(); + entries.forEach(entry -> { + dictionaryEntries.add(new 
DictionaryEntry(entry, reanlysisVersions.containsKey(entry) ? reanlysisVersions.get(entry) : 0L, false)); /* second argument: the value recorded for this entry in reanlysisVersions, or 0L when the map has no key for it; third argument is always false */ + }); + return dictionaryEntries; + } + + @Test public void noExceptionShouldBeThrownForAnyFiles() throws IOException { @@ -414,6 +450,22 @@ public class RedactionIntegrationTest { assertThat(entry.getValue().size()).isEqualTo(1); }); + + dictionary.get(AUTHOR).add("Drinking water"); + when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(1L); + + long rstart = System.currentTimeMillis(); + ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder() + .redactionLog(result.getRedactionLog()) + .document(IOUtils.toByteArray(new FileInputStream(path))) + .manualRedactions(null) + .text(result.getText()) + .ruleSetId(TEST_RULESET_ID) + .build()); + + long rend = System.currentTimeMillis(); + System.out.println("reanalysis analysis duration: " + (rend - rstart)); + } long end = System.currentTimeMillis(); @@ -455,6 +507,86 @@ public class RedactionIntegrationTest { AnalyzeResult result = redactionController.analyze(request); + long end = System.currentTimeMillis(); + + System.out.println("first analysis duration: " + (end - start)); + + try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Test.json")) { + fileOutputStream.write(objectMapper.writeValueAsBytes(result.getText())); + } + + int correctFound = 0; + loop: + for (RedactionLogEntry redactionLogEntry : result.getRedactionLog().getRedactionLogEntry()) { + for (SectionText sectionText : result.getText().getSectionTexts()) { + if (redactionLogEntry.getType().equals("image")) { + correctFound++; + continue loop; + } + if (redactionLogEntry.getSectionNumber() == sectionText.getSectionNumber()) { + String value = sectionText.getText() + .substring(redactionLogEntry.getStartOffset(), redactionLogEntry.getEndOffset()); + if (redactionLogEntry.getValue().equalsIgnoreCase(value)) { + correctFound++; + } else { + throw new RuntimeException("WTF"); + } + } + } + } + 
assertThat(correctFound).isEqualTo(result.getRedactionLog().getRedactionLogEntry().size()); + + + dictionary.get(AUTHOR).add("properties"); + reanlysisVersions.put("properties", 1L); + + dictionary.get(AUTHOR).add("physical"); + reanlysisVersions.put("physical", 2L); + + when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(2L); + when(dictionaryClient.getDictionaryForType(AUTHOR, TEST_RULESET_ID)).thenReturn(getDictionaryResponse(AUTHOR)); + + start = System.currentTimeMillis(); + ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder() + .redactionLog(result.getRedactionLog()) + .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) + .manualRedactions(null) + .text(result.getText()) + .ruleSetId(TEST_RULESET_ID) + .build()); + + end = System.currentTimeMillis(); + System.out.println("reanalysis analysis duration: " + (end - start)); + + AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() + .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) + .redactionLog(reanalyzeResult.getRedactionLog()) + .sectionGrid(result.getSectionGrid()) + .build()); + + try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated.pdf")) { + fileOutputStream.write(annotateResponse.getDocument()); + } + + } + + + + @Test + @Ignore + public void fillRecanTest() throws IOException { + + System.out.println("redactionTest"); + long start = System.currentTimeMillis(); + ClassPathResource pdfFileResource = new ClassPathResource("files/S5.pdf"); + + AnalyzeRequest request = AnalyzeRequest.builder() + .ruleSetId(TEST_RULESET_ID) + .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) + .build(); + + AnalyzeResult result = redactionController.analyze(request); + AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) .redactionLog(result.getRedactionLog()) @@ -496,9 
+628,70 @@ public class RedactionIntegrationTest { System.out.println("duration: " + (end - start)); System.out.println("numberOfPages: " + result.getNumberOfPages()); + + SectionArea sectionArea = result.getText().getSectionTexts().get(3).getSectionAreas().get(5); + + try (PDDocument pdDocument = PDDocument.load(new ByteArrayInputStream(IOUtils.toByteArray(pdfFileResource.getInputStream())))) { + + PDPage docPage = pdDocument.getPage(0); + + PDFTextStripperByArea textStripper = new PDFTextStripperByArea(); + + PDRectangle cropBox = docPage.getCropBox(); + PDRectangle mediaBox = docPage.getMediaBox(); + + +// if (textPositions.get(0).getRotation() == 90) { +// posXEnd = textPositions.get(0).getYDirAdj() + 2; +// posYInit = getY1(); +// posYEnd = textPositions.get(textPositions.size() - 1).getXDirAdj() - height + 4; +// } else { +// posXEnd = textPositions.get(textPositions.size() - 1) +// .getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidth() + 1; +// posYInit = textPositions.get(0).getPageHeight() - textPositions.get(0).getYDirAdj() - 2; +// posYEnd = textPositions.get(0).getPageHeight() - textPositions.get(textPositions.size() - 1) +// .getYDirAdj() + 2; +// } + + + Rectangle2D rect = new Rectangle2D.Float(sectionArea.getTopLeft() + .getY(), sectionArea.getTopLeft() + .getX() , sectionArea.getHeight(), sectionArea + .getWidth() + 0.001f); + + textStripper.addRegion("region", rect); + + + + textStripper.extractRegions(docPage); + + String textForRegion = textStripper.getTextForRegion("region"); + + System.out.println(textForRegion); + + // fill a rectangle + PDPageContentStream contents = new PDPageContentStream (pdDocument, docPage, PDPageContentStream.AppendMode.APPEND, false, false); + contents.setNonStrokingColor (Color.RED); + contents.addRect (sectionArea.getTopLeft().getX(), sectionArea.getTopLeft().getY(), sectionArea.getWidth(), sectionArea.getHeight()); + contents.fill (); + contents.close (); + try (ByteArrayOutputStream 
byteArrayOutputStream = new ByteArrayOutputStream()) { + pdDocument.save(byteArrayOutputStream); + try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated2.pdf")) { + fileOutputStream.write(byteArrayOutputStream.toByteArray()); + } + } + + } catch (Exception e) { + throw new RedactionException(e); + } + } + + + @Test public void testTableRedaction() throws IOException { @@ -569,7 +762,7 @@ public class RedactionIntegrationTest { manualRedactionEntry.setReason("Manual Redaction"); manualRedactionEntry.setPositions(List.of(new Rectangle(new Point(375.61096f, 241.282f), 7.648041f, 43.72262f, 1), new Rectangle(new Point(384.83517f, 241.282f), 7.648041f, 17.043358f, 1))); - manualRedactions.getEntriesToAdd().add(manualRedactionEntry); +// manualRedactions.getEntriesToAdd().add(manualRedactionEntry); AnalyzeRequest request = AnalyzeRequest.builder() .ruleSetId(TEST_RULESET_ID) @@ -579,9 +772,25 @@ public class RedactionIntegrationTest { AnalyzeResult result = redactionController.analyze(request); + manualRedactions.getEntriesToAdd().add(manualRedactionEntry); + manualRedactions.setIdsToRemove(Set.of(IdRemoval.builder() + .id("5b940b2cb401ed9f5be6fc24f6e77bcf") + .status(Status.APPROVED) + .build())); + + + ReanalyzeResult reanalyzeResult = redactionController.reanalyze(RenalyzeRequest.builder() + .redactionLog(result.getRedactionLog()) + .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) + .manualRedactions(manualRedactions) + .text(result.getText()) + .ruleSetId(TEST_RULESET_ID) + .build()); + + AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() .document(IOUtils.toByteArray(pdfFileResource.getInputStream())) - .redactionLog(result.getRedactionLog()) + .redactionLog(reanalyzeResult.getRedactionLog()) .sectionGrid(result.getSectionGrid()) .build()); diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java index fb2fae12..b7efed93 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionServiceTest.java @@ -1,6 +1,7 @@ package com.iqser.red.service.redaction.v1.server.redaction.service; import com.iqser.red.service.configuration.v1.api.model.Colors; +import com.iqser.red.service.configuration.v1.api.model.DictionaryEntry; import com.iqser.red.service.configuration.v1.api.model.DictionaryResponse; import com.iqser.red.service.configuration.v1.api.model.RulesResponse; import com.iqser.red.service.configuration.v1.api.model.TypeResponse; @@ -129,12 +130,12 @@ public class EntityRedactionServiceTest { .build(); DictionaryResponse dictionaryResponse = DictionaryResponse.builder() - .entries(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")) + .entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H."))) .build(); when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() - .entries(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA")) + .entries(toDictionaryEntry(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA"))) .build(); when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, 
TEST_RULESET_ID)).thenReturn(addressResponse); @@ -162,12 +163,12 @@ public class EntityRedactionServiceTest { .build(); DictionaryResponse dictionaryResponse = DictionaryResponse.builder() - .entries(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H.")) + .entries(toDictionaryEntry(Arrays.asList("Casey, H.W.", "O’Loughlin, C.K.", "Salamon, C.M.", "Smith, S.H."))) .build(); when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() - .entries(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA")) + .entries(toDictionaryEntry(Collections.singletonList("Toxigenics, Inc., Decatur, IL 62526, USA"))) .build(); when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); DictionaryResponse sponsorResponse = DictionaryResponse.builder() @@ -191,11 +192,11 @@ public class EntityRedactionServiceTest { " Supplement - Identity of the active substance - Reference list.pdf"); when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); DictionaryResponse dictionaryResponse = DictionaryResponse.builder() - .entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))) + .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt")))) .build(); when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() - .entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))) + .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt")))) .build(); when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, 
TEST_RULESET_ID)).thenReturn(addressResponse); DictionaryResponse sponsorResponse = DictionaryResponse.builder() @@ -228,15 +229,15 @@ public class EntityRedactionServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Row With Ambiguous Redaction.pdf"); when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); DictionaryResponse dictionaryResponse = DictionaryResponse.builder() - .entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))) + .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt")))) .build(); when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() - .entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))) + .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt")))) .build(); when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); DictionaryResponse sponsorResponse = DictionaryResponse.builder() - .entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))) + .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt")))) .build(); when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(sponsorResponse); try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { @@ -297,11 +298,11 @@ public class EntityRedactionServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Applicant Producer Table.pdf"); when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); DictionaryResponse dictionaryResponse = DictionaryResponse.builder() - .entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt"))) + 
.entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_author.txt")))) .build(); when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() - .entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt"))) + .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_address.txt")))) .build(); when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); DictionaryResponse sponsorResponse = DictionaryResponse.builder() @@ -346,7 +347,7 @@ public class EntityRedactionServiceTest { .build(); when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(authorResponse); DictionaryResponse dictionaryResponse = DictionaryResponse.builder() - .entries(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt"))) + .entries(toDictionaryEntry(new ArrayList<>(ResourceLoader.load("dictionaries/CBI_sponsor.txt")))) .build(); when(dictionaryClient.getDictionaryForType(SPONSOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); try (PDDocument pdDocument = PDDocument.load(pdfFileResource.getInputStream())) { @@ -367,13 +368,13 @@ public class EntityRedactionServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation.pdf"); DictionaryResponse dictionaryResponse = DictionaryResponse.builder() - .entries(Arrays.asList("Bissig R.", "Thanei P.")) + .entries(toDictionaryEntry(Arrays.asList("Bissig R.", "Thanei P."))) .build(); when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() - .entries(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")) 
+ .entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland"))) .build(); when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); @@ -392,13 +393,13 @@ public class EntityRedactionServiceTest { pdfFileResource = new ClassPathResource("files/Minimal Examples/Header Propagation2.pdf"); dictionaryResponse = DictionaryResponse.builder() - .entries(Arrays.asList("Tribolet, R.", "Muir, G.", "Kühne-Thu, H.", "Close, C.")) + .entries(toDictionaryEntry(Arrays.asList("Tribolet, R.", "Muir, G.", "Kühne-Thu, H.", "Close, C."))) .build(); when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); addressResponse = DictionaryResponse.builder() - .entries(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")) + .entries(toDictionaryEntry(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland"))) .build(); when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); @@ -419,13 +420,13 @@ public class EntityRedactionServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Empty Tabular Data.pdf"); DictionaryResponse dictionaryResponse = DictionaryResponse.builder() - .entries(Collections.singletonList("Aldershof S.")) + .entries(toDictionaryEntry(Collections.singletonList("Aldershof S."))) .build(); when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(DICTIONARY_VERSION.incrementAndGet()); when(dictionaryClient.getDictionaryForType(AUTHOR_CODE, TEST_RULESET_ID)).thenReturn(dictionaryResponse); DictionaryResponse addressResponse = DictionaryResponse.builder() - .entries(Collections.singletonList("Novartis Crop Protection AG, Basel, Switzerland")) + .entries(toDictionaryEntry(Collections.singletonList("Novartis Crop 
Protection AG, Basel, Switzerland"))) .build(); when(dictionaryClient.getDictionaryForType(ADDRESS_CODE, TEST_RULESET_ID)).thenReturn(addressResponse); @@ -517,4 +518,12 @@ public class EntityRedactionServiceTest { } } + private List toDictionaryEntry(List entries){ /* Test helper: wraps every value of entries in a DictionaryEntry and returns them as a new list, in iteration order. */ + List dictionaryEntries = new ArrayList<>(); + entries.forEach(entry -> { + dictionaryEntries.add(new DictionaryEntry(entry, 1L, false)); /* every entry gets the same constant arguments 1L and false; NOTE(review): presumably a version number and a deleted flag - confirm against DictionaryEntry */ + }); + return dictionaryEntries; + } + } \ No newline at end of file