Compare commits
2 Commits
master
...
extended-e
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1db515acf0 | ||
|
|
f9e7703063 |
@ -61,7 +61,7 @@ dependencies {
|
|||||||
|
|
||||||
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
|
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
|
||||||
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
|
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
|
||||||
implementation("org.ahocorasick:ahocorasick:0.6.3")
|
implementation("org.ahocorasick:ahocorasick:0.7.3")
|
||||||
implementation("org.javassist:javassist:3.29.2-GA")
|
implementation("org.javassist:javassist:3.29.2-GA")
|
||||||
|
|
||||||
implementation("org.drools:drools-engine:${droolsVersion}")
|
implementation("org.drools:drools-engine:${droolsVersion}")
|
||||||
@ -129,6 +129,7 @@ tasks.named<BootBuildImage>("bootBuildImage") {
|
|||||||
"BPE_APPEND_JAVA_TOOL_OPTIONS",
|
"BPE_APPEND_JAVA_TOOL_OPTIONS",
|
||||||
"-XX:MaxMetaspaceSize=1g -Dfile.encoding=UTF-8 -Dkie.repository.project.cache.size=50 -Dkie.repository.project.versions.cache.size=5"
|
"-XX:MaxMetaspaceSize=1g -Dfile.encoding=UTF-8 -Dkie.repository.project.cache.size=50 -Dkie.repository.project.versions.cache.size=5"
|
||||||
)
|
)
|
||||||
|
environment.put("BPE_DEFAULT_LANG", "en_US.utf8")
|
||||||
environment.put("BPE_DEFAULT_LANG", "en_US.utf8") // java.text.Normalizer does not care for file.encoding
|
environment.put("BPE_DEFAULT_LANG", "en_US.utf8") // java.text.Normalizer does not care for file.encoding
|
||||||
|
|
||||||
imageName.set("nexus.knecon.com:5001/red/${project.name}")// must build image with same name always, otherwise the builder will not know which image to use as cache. DO NOT CHANGE!
|
imageName.set("nexus.knecon.com:5001/red/${project.name}")// must build image with same name always, otherwise the builder will not know which image to use as cache. DO NOT CHANGE!
|
||||||
|
|||||||
@ -19,6 +19,7 @@ import lombok.Data;
|
|||||||
public class SearchImplementation {
|
public class SearchImplementation {
|
||||||
|
|
||||||
private boolean ignoreCase;
|
private boolean ignoreCase;
|
||||||
|
private boolean ignoreWhiteSpace;
|
||||||
private List<String> values;
|
private List<String> values;
|
||||||
|
|
||||||
private Pattern pattern;
|
private Pattern pattern;
|
||||||
@ -41,13 +42,22 @@ public class SearchImplementation {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public SearchImplementation(Collection<String> values, boolean ignoreCase, boolean ignoreWhiteSpace) {
|
||||||
|
|
||||||
|
this.values = new ArrayList<>(values);
|
||||||
|
this.ignoreCase = ignoreCase;
|
||||||
|
this.ignoreWhiteSpace = ignoreWhiteSpace;
|
||||||
|
this.createSearchImplementation();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private void createSearchImplementation() {
|
private void createSearchImplementation() {
|
||||||
|
|
||||||
if (this.values.isEmpty()) {
|
if (this.values.isEmpty()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.values.size() == 1) {
|
if (this.values.size() == 1 && !this.ignoreWhiteSpace) {
|
||||||
var text = this.values.iterator().next();
|
var text = this.values.iterator().next();
|
||||||
|
|
||||||
if (this.ignoreCase) {
|
if (this.ignoreCase) {
|
||||||
@ -60,6 +70,9 @@ public class SearchImplementation {
|
|||||||
if (this.ignoreCase) {
|
if (this.ignoreCase) {
|
||||||
builder.ignoreCase();
|
builder.ignoreCase();
|
||||||
}
|
}
|
||||||
|
if (this.ignoreWhiteSpace) {
|
||||||
|
builder.ignoreWhiteSpace();
|
||||||
|
}
|
||||||
|
|
||||||
builder.addKeywords(this.values);
|
builder.addKeywords(this.values);
|
||||||
|
|
||||||
|
|||||||
@ -264,7 +264,7 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {
|
protected List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {
|
||||||
|
|
||||||
return getLineBreaks().stream()
|
return getLineBreaks().stream()
|
||||||
.map(linebreak -> linebreak + this.textRange.start())
|
.map(linebreak -> linebreak + this.textRange.start())
|
||||||
|
|||||||
@ -0,0 +1,126 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.model.document.textblock;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.text.BreakIterator;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class PartialTextBlock implements TextBlock {
|
||||||
|
|
||||||
|
AtomicTextBlock atomicTextBlock;
|
||||||
|
TextRange textRange;
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getSearchText() {
|
||||||
|
|
||||||
|
return atomicTextBlock.getSearchText().substring(textRange.start(), textRange.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getSearchTextLowerCase() {
|
||||||
|
|
||||||
|
return atomicTextBlock.getSearchTextLowerCase().substring(textRange.start(), textRange.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<String> getWords() {
|
||||||
|
|
||||||
|
String searchText = getSearchText();
|
||||||
|
|
||||||
|
List<String> words = new ArrayList<>();
|
||||||
|
BreakIterator iterator = BreakIterator.getWordInstance(Locale.ENGLISH);
|
||||||
|
iterator.setText(getSearchText());
|
||||||
|
int start = iterator.first();
|
||||||
|
for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) {
|
||||||
|
words.add(searchText.substring(start, end));
|
||||||
|
}
|
||||||
|
return words;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<AtomicTextBlock> getAtomicTextBlocks() {
|
||||||
|
|
||||||
|
return List.of(atomicTextBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getNextLinebreak(int fromIndex) {
|
||||||
|
|
||||||
|
return atomicTextBlock.getNextLinebreak(fromIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getPreviousLinebreak(int fromIndex) {
|
||||||
|
|
||||||
|
return atomicTextBlock.getPreviousLinebreak(fromIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TextRange getLineTextRange(int lineNumber) {
|
||||||
|
|
||||||
|
return atomicTextBlock.getLineTextRange(lineNumber);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Integer> getLineBreaks() {
|
||||||
|
|
||||||
|
return atomicTextBlock.getAllLineBreaksInBoundary(textRange);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Rectangle2D getPosition(int stringIdx) {
|
||||||
|
|
||||||
|
return atomicTextBlock.getPosition(stringIdx);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Rectangle2D> getPositions(TextRange stringTextRange) {
|
||||||
|
|
||||||
|
return atomicTextBlock.getPositions(stringTextRange);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {
|
||||||
|
|
||||||
|
return atomicTextBlock.getPositionsPerPage(stringTextRange);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String subSequenceWithLineBreaks(TextRange textRange) {
|
||||||
|
|
||||||
|
return atomicTextBlock.subSequenceWithLineBreaks(textRange);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int numberOfLines() {
|
||||||
|
|
||||||
|
return getLineBreaks().size() + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -19,6 +19,7 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog
|
|||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntryState;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntryState;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.Position;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.Position;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.ChangeFactory;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.ChangeFactory;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.RedactionServiceSettings;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.PrecursorEntity;
|
import com.iqser.red.service.redaction.v1.server.model.PrecursorEntity;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.RectangleWithPage;
|
import com.iqser.red.service.redaction.v1.server.model.RectangleWithPage;
|
||||||
|
|
||||||
@ -33,6 +34,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
public class NotFoundImportedEntitiesService {
|
public class NotFoundImportedEntitiesService {
|
||||||
|
|
||||||
public static final String IMPORTED_REDACTION_TYPE = "imported_redaction";
|
public static final String IMPORTED_REDACTION_TYPE = "imported_redaction";
|
||||||
|
private final RedactionServiceSettings settings;
|
||||||
|
|
||||||
|
|
||||||
@Timed("redactmanager_processEntityLog")
|
@Timed("redactmanager_processEntityLog")
|
||||||
@ -95,7 +97,7 @@ public class NotFoundImportedEntitiesService {
|
|||||||
}
|
}
|
||||||
entityLogEntry.getImportedRedactionIntersections().add(precursorEntity.getId());
|
entityLogEntry.getImportedRedactionIntersections().add(precursorEntity.getId());
|
||||||
|
|
||||||
if (entityLogEntry.getState() != EntryState.REMOVED) {
|
if (entityLogEntry.getState() != EntryState.REMOVED && !settings.isAnnotationMode()) {
|
||||||
entityLogEntry.setState(EntryState.REMOVED);
|
entityLogEntry.setState(EntryState.REMOVED);
|
||||||
entityLogEntry.getChanges().add(ChangeFactory.toChange(ChangeType.REMOVED, OffsetDateTime.now(), analysisNumber));
|
entityLogEntry.getChanges().add(ChangeFactory.toChange(ChangeType.REMOVED, OffsetDateTime.now(), analysisNumber));
|
||||||
}
|
}
|
||||||
|
|||||||
@ -348,16 +348,7 @@ public class ComponentCreationService {
|
|||||||
*/
|
*/
|
||||||
public void create(String ruleIdentifier, String name, String value, String valueDescription, Entity reference) {
|
public void create(String ruleIdentifier, String name, String value, String valueDescription, Entity reference) {
|
||||||
|
|
||||||
referencedEntities.add(reference);
|
create(ruleIdentifier, name, value, valueDescription, List.of(reference));
|
||||||
List<Entity> referenceList = new LinkedList<>();
|
|
||||||
referenceList.add(reference);
|
|
||||||
kieSession.insert(Component.builder()
|
|
||||||
.matchedRule(RuleIdentifier.fromString(ruleIdentifier))
|
|
||||||
.name(name)
|
|
||||||
.value(value)
|
|
||||||
.valueDescription(valueDescription)
|
|
||||||
.references(referenceList)
|
|
||||||
.build());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -4,6 +4,7 @@ import static java.lang.String.format;
|
|||||||
import static java.util.stream.Collectors.groupingBy;
|
import static java.util.stream.Collectors.groupingBy;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -14,6 +15,7 @@ import java.util.Optional;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.text.similarity.LevenshteinDistance;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
@ -22,14 +24,18 @@ import com.iqser.red.service.redaction.v1.server.model.ClosestEntity;
|
|||||||
import com.iqser.red.service.redaction.v1.server.model.PrecursorEntity;
|
import com.iqser.red.service.redaction.v1.server.model.PrecursorEntity;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.RectangleWithPage;
|
import com.iqser.red.service.redaction.v1.server.model.RectangleWithPage;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.dictionary.SearchImplementation;
|
import com.iqser.red.service.redaction.v1.server.model.dictionary.SearchImplementation;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
|
import com.iqser.red.service.redaction.v1.server.model.document.entity.EntityType;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.PositionOnPage;
|
import com.iqser.red.service.redaction.v1.server.model.document.entity.PositionOnPage;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
|
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
|
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
|
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
|
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.model.document.textblock.PartialTextBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
|
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations;
|
import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.utils.TextNormalizationUtilities;
|
||||||
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@ -37,6 +43,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@Service
|
@Service
|
||||||
public class EntityFindingUtility {
|
public class EntityFindingUtility {
|
||||||
|
|
||||||
|
private static final float STRING_SIMILARITY_THRESHOLD = 0.2f;
|
||||||
EntityCreationService entityCreationService;
|
EntityCreationService entityCreationService;
|
||||||
|
|
||||||
|
|
||||||
@ -55,7 +62,7 @@ public class EntityFindingUtility {
|
|||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
List<TextEntity> possibleEntities = entitiesWithSameValue.get(precursorEntity.getValue().toLowerCase(Locale.ENGLISH));
|
List<TextEntity> possibleEntities = entitiesWithSameValue.get(TextNormalizationUtilities.removeAllWhitespaces(precursorEntity.getValue().toLowerCase(Locale.ENGLISH)));
|
||||||
|
|
||||||
if (entityIdentifierValueNotFound(possibleEntities)) {
|
if (entityIdentifierValueNotFound(possibleEntities)) {
|
||||||
log.info("Entity could not be created with precursorEntity: {}, due to the value {} not being found anywhere.", precursorEntity, precursorEntity.getValue());
|
log.info("Entity could not be created with precursorEntity: {}, due to the value {} not being found anywhere.", precursorEntity, precursorEntity.getValue());
|
||||||
@ -91,6 +98,79 @@ public class EntityFindingUtility {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Optional<TextEntity> findEntityByUnderlyingText(PrecursorEntity precursorEntity, Document document) {
|
||||||
|
|
||||||
|
if (precursorEntity.getEntityPosition().isEmpty()) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
Optional<Page> optionalPage = document.getPages()
|
||||||
|
.stream()
|
||||||
|
.filter(docPage -> docPage.getNumber()
|
||||||
|
.equals(precursorEntity.getEntityPosition()
|
||||||
|
.get(0).pageNumber()))
|
||||||
|
.findFirst();
|
||||||
|
|
||||||
|
if (optionalPage.isEmpty()) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
Page page = optionalPage.get();
|
||||||
|
Rectangle2D rect = precursorEntity.getEntityPosition()
|
||||||
|
.stream()
|
||||||
|
.map(RectangleWithPage::rectangle2D)
|
||||||
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
Optional<AtomicTextBlock> intersectingTbOptional = page.getTextBlocksOnPage()
|
||||||
|
.stream()
|
||||||
|
.filter(tb -> RectangleTransformations.rectangle2DBBox(tb.getPositions()).intersects(rect))
|
||||||
|
.findFirst();
|
||||||
|
if (intersectingTbOptional.isEmpty()) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
AtomicTextBlock intersectingTb = intersectingTbOptional.get();
|
||||||
|
List<PartialTextBlock> underlyingTextRuns = findUnderlyingCharacterRuns(intersectingTb, rect);
|
||||||
|
|
||||||
|
for (PartialTextBlock underlyingText : underlyingTextRuns) {
|
||||||
|
int threshold = (int) (Math.min(underlyingText.length(), precursorEntity.length()) * STRING_SIMILARITY_THRESHOLD) + 1;
|
||||||
|
int distance = new LevenshteinDistance(threshold).apply(underlyingText.getSearchText(), precursorEntity.getValue());
|
||||||
|
if (distance >= 0) {
|
||||||
|
return entityCreationService.byTextRangeWithEngine(underlyingText.getTextRange(), "temp", EntityType.ENTITY, document, Collections.emptySet());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<PartialTextBlock> findUnderlyingCharacterRuns(AtomicTextBlock intersectingTb, Rectangle2D rect) {
|
||||||
|
|
||||||
|
List<PartialTextBlock> intersectingTextBlocks = new ArrayList<>();
|
||||||
|
int first = -1;
|
||||||
|
int last = -1;
|
||||||
|
|
||||||
|
for (int i = 0; i < intersectingTb.getPositions().size(); i++) {
|
||||||
|
Rectangle2D rectangle2D = intersectingTb.getPosition(i + intersectingTb.getTextRange().start());
|
||||||
|
|
||||||
|
if (rectangle2D.intersects(rect)) {
|
||||||
|
if (first == -1) {
|
||||||
|
first = i;
|
||||||
|
}
|
||||||
|
last = i;
|
||||||
|
} else if (first != -1) {
|
||||||
|
intersectingTextBlocks.add(new PartialTextBlock(intersectingTb, new TextRange(first, last)));
|
||||||
|
|
||||||
|
first = -1;
|
||||||
|
last = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (first != -1) {
|
||||||
|
intersectingTextBlocks.add(new PartialTextBlock(intersectingTb, new TextRange(first, last)));
|
||||||
|
}
|
||||||
|
|
||||||
|
return intersectingTextBlocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private static boolean entityIdentifierValueNotFound(List<TextEntity> possibleEntities) {
|
private static boolean entityIdentifierValueNotFound(List<TextEntity> possibleEntities) {
|
||||||
|
|
||||||
return possibleEntities == null || possibleEntities.isEmpty();
|
return possibleEntities == null || possibleEntities.isEmpty();
|
||||||
@ -183,7 +263,7 @@ public class EntityFindingUtility {
|
|||||||
|
|
||||||
SearchImplementation searchImplementation = new SearchImplementation(entryValues.stream()
|
SearchImplementation searchImplementation = new SearchImplementation(entryValues.stream()
|
||||||
.map(String::trim)
|
.map(String::trim)
|
||||||
.collect(Collectors.toSet()), true);
|
.collect(Collectors.toSet()), true, true);
|
||||||
|
|
||||||
List<TextBlock> textBlocks = node.getTextBlocksByPageNumbers(pageNumbers);
|
List<TextBlock> textBlocks = node.getTextBlocksByPageNumbers(pageNumbers);
|
||||||
|
|
||||||
@ -193,7 +273,7 @@ public class EntityFindingUtility {
|
|||||||
.filter(Optional::isPresent)
|
.filter(Optional::isPresent)
|
||||||
.map(Optional::get)
|
.map(Optional::get)
|
||||||
.distinct()
|
.distinct()
|
||||||
.collect(groupingBy(entity -> entity.getValue().toLowerCase(Locale.ROOT)));
|
.collect(groupingBy(entity -> TextNormalizationUtilities.removeAllWhitespaces(entity.getValue().toLowerCase(Locale.ROOT))));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -34,7 +34,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class EntityFromPrecursorCreationService {
|
public class EntityFromPrecursorCreationService {
|
||||||
|
|
||||||
static double MATCH_THRESHOLD = 10; // Is compared to the average sum of distances in pdf coordinates for each corner of the bounding box of the entities
|
static double MATCH_THRESHOLD = 100; // Is compared to the average sum of distances in pdf coordinates for each corner of the bounding box of the entities
|
||||||
EntityFindingUtility entityFindingUtility;
|
EntityFindingUtility entityFindingUtility;
|
||||||
DictionaryService dictionaryService;
|
DictionaryService dictionaryService;
|
||||||
RedactionServiceSettings settings;
|
RedactionServiceSettings settings;
|
||||||
@ -130,10 +130,8 @@ public class EntityFromPrecursorCreationService {
|
|||||||
} else {
|
} else {
|
||||||
String section = precursorEntity.getManualOverwrite().getSection()
|
String section = precursorEntity.getManualOverwrite().getSection()
|
||||||
.orElse(null);
|
.orElse(null);
|
||||||
if ((section == null || section.isBlank())
|
if ((section == null || section.isBlank()) && precursorEntity.getSection() != null && !precursorEntity.getSection().isBlank() && precursorEntity.getEngines()
|
||||||
&& precursorEntity.getSection() != null
|
.contains(Engine.IMPORTED)) {
|
||||||
&& !precursorEntity.getSection().isBlank()
|
|
||||||
&& precursorEntity.getEngines().contains(Engine.IMPORTED)) {
|
|
||||||
section = precursorEntity.getSection();
|
section = precursorEntity.getSection();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,31 +1,47 @@
|
|||||||
package com.iqser.red.service.redaction.v1.server.utils;
|
package com.iqser.red.service.redaction.v1.server.utils;
|
||||||
|
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@UtilityClass
|
@UtilityClass
|
||||||
public final class TextNormalizationUtilities {
|
public final class TextNormalizationUtilities {
|
||||||
|
|
||||||
/**
|
public static final Pattern hyphenLineBreaks = Pattern.compile("[-~‐‒⁻−﹣゠⁓‑\\u00AD][\\r\\n]+");
|
||||||
* Revert hyphenation due to line breaks.
|
public static final Pattern linebreaks = Pattern.compile("[\\r\\n]+");
|
||||||
*
|
public static final Pattern doubleWhitespaces = Pattern.compile("\\s{2,}");
|
||||||
* @param text Text to be processed.
|
public static final Pattern WHITESPACE_REMOVAL = Pattern.compile("\\s+");
|
||||||
* @return Text without line-break hyphenation.
|
|
||||||
*/
|
|
||||||
public static String removeHyphenLineBreaks(String text) {
|
|
||||||
|
|
||||||
return text.replaceAll("([^\\s\\d\\-]{2,500})[\\-\\u00AD]\\R", "$1");
|
|
||||||
|
public String cleanString(String value) {
|
||||||
|
|
||||||
|
String noHyphenLinebreaks = removeHyphenLinebreaks(value);
|
||||||
|
String noLinebreaks = removeLinebreaks(noHyphenLinebreaks);
|
||||||
|
return removeMultipleWhitespaces(noLinebreaks);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static String removeLineBreaks(String text) {
|
public String removeHyphenLinebreaks(String value) {
|
||||||
|
|
||||||
return text.replaceAll("\n", " ");
|
return hyphenLineBreaks.matcher(value).replaceAll("");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static String removeRepeatingWhitespaces(String text) {
|
private String removeMultipleWhitespaces(String value) {
|
||||||
|
|
||||||
return text.replaceAll(" {2}", " ");
|
return doubleWhitespaces.matcher(value).replaceAll(" ");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private String removeLinebreaks(String value) {
|
||||||
|
|
||||||
|
return linebreaks.matcher(value).replaceAll(" ");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String removeAllWhitespaces(String value) {
|
||||||
|
|
||||||
|
return WHITESPACE_REMOVAL.matcher(value).replaceAll("");
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -611,7 +611,7 @@ public abstract class AbstractRedactionIntegrationTest {
|
|||||||
|
|
||||||
private String cleanDictionaryEntry(String entry) {
|
private String cleanDictionaryEntry(String entry) {
|
||||||
|
|
||||||
return TextNormalizationUtilities.removeHyphenLineBreaks(entry).replaceAll("\\n", " ");
|
return TextNormalizationUtilities.cleanString(entry);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -6,8 +6,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
|||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
import static org.mockito.ArgumentMatchers.any;
|
import static org.mockito.ArgumentMatchers.any;
|
||||||
import static org.mockito.ArgumentMatchers.anyString;
|
import static org.mockito.ArgumentMatchers.anyString;
|
||||||
import static org.mockito.Mockito.times;
|
|
||||||
import static org.mockito.Mockito.verify;
|
|
||||||
import static org.mockito.Mockito.when;
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
import java.io.BufferedWriter;
|
import java.io.BufferedWriter;
|
||||||
@ -37,7 +35,6 @@ import org.junit.jupiter.api.Order;
|
|||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.junit.jupiter.api.TestMethodOrder;
|
import org.junit.jupiter.api.TestMethodOrder;
|
||||||
import org.junit.jupiter.api.extension.ExtendWith;
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
import org.mockito.Mock;
|
|
||||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||||
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
||||||
import org.springframework.boot.test.context.SpringBootTest;
|
import org.springframework.boot.test.context.SpringBootTest;
|
||||||
@ -83,7 +80,6 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
|
|||||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils;
|
import com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils;
|
||||||
import com.iqser.red.service.redaction.v1.server.rules.RulesIntegrationTest;
|
import com.iqser.red.service.redaction.v1.server.rules.RulesIntegrationTest;
|
||||||
import com.iqser.red.service.redaction.v1.server.service.document.DocumentGraphMapper;
|
import com.iqser.red.service.redaction.v1.server.service.document.DocumentGraphMapper;
|
||||||
import com.iqser.red.service.redaction.v1.server.storage.DocumentDataFallbackService;
|
|
||||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||||
import com.iqser.red.storage.commons.service.StorageService;
|
import com.iqser.red.storage.commons.service.StorageService;
|
||||||
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
|
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
|
||||||
@ -102,7 +98,6 @@ public class RedactionIntegrationTest extends RulesIntegrationTest {
|
|||||||
|
|
||||||
private static final String RULES = loadFromClassPath("drools/rules.drl");
|
private static final String RULES = loadFromClassPath("drools/rules.drl");
|
||||||
|
|
||||||
|
|
||||||
@Configuration
|
@Configuration
|
||||||
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
|
@EnableAutoConfiguration(exclude = {RabbitAutoConfiguration.class})
|
||||||
@Import({LayoutParsingServiceProcessorConfiguration.class})
|
@Import({LayoutParsingServiceProcessorConfiguration.class})
|
||||||
@ -1233,6 +1228,44 @@ public class RedactionIntegrationTest extends RulesIntegrationTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Disabled
|
||||||
|
public void testImportedRedactions2() throws IOException {
|
||||||
|
|
||||||
|
String outputFileName = OsUtils.getTemporaryDirectory() + "/ImportedRedactions.pdf";
|
||||||
|
ClassPathResource importedRedactions = new ClassPathResource("files/ImportedRedactions/76c5683ebc8c19dc23eccea12dfc652b.IMPORTED_REDACTIONS.json");
|
||||||
|
|
||||||
|
AnalyzeRequest request = uploadFileToStorage("files/ImportedRedactions/76c5683ebc8c19dc23eccea12dfc652b.ORIGIN.pdf");
|
||||||
|
storageService.storeObject(TenantContext.getTenantId(),
|
||||||
|
StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.IMPORTED_REDACTIONS),
|
||||||
|
importedRedactions.getInputStream());
|
||||||
|
|
||||||
|
analyzeDocumentStructure(LayoutParsingType.DOCUMINE_OLD, request);
|
||||||
|
AnalyzeResult result = analyzeService.analyze(request);
|
||||||
|
|
||||||
|
var entityLog = redactionStorageService.getEntityLog(TEST_DOSSIER_ID, TEST_FILE_ID);
|
||||||
|
|
||||||
|
AnnotateResponse annotateResponse = annotationService.annotate(AnnotateRequest.builder().dossierId(TEST_DOSSIER_ID).fileId(TEST_FILE_ID).build());
|
||||||
|
|
||||||
|
try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) {
|
||||||
|
fileOutputStream.write(annotateResponse.getDocument());
|
||||||
|
}
|
||||||
|
|
||||||
|
entityLog.getEntityLogEntry()
|
||||||
|
.forEach(entry -> {
|
||||||
|
if (entry.getValue() == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (entry.getValue().equals("David")) {
|
||||||
|
assertThat(entry.getImportedRedactionIntersections()).hasSize(1);
|
||||||
|
}
|
||||||
|
if (entry.getValue().equals("annotation")) {
|
||||||
|
assertThat(entry.getImportedRedactionIntersections()).isEmpty();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testExpandByPrefixRegEx() throws IOException {
|
public void testExpandByPrefixRegEx() throws IOException {
|
||||||
|
|
||||||
@ -2252,9 +2285,7 @@ public class RedactionIntegrationTest extends RulesIntegrationTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testFileWithImagesAndNoText() {
|
public void testFileWithImagesAndNoText() {
|
||||||
|
|
||||||
AnalyzeRequest request = prepareStorage("files/new/only_images.pdf",
|
AnalyzeRequest request = prepareStorage("files/new/only_images.pdf", "files/cv_service_empty_response.json", "files/only_images_file_image_response.json");
|
||||||
"files/cv_service_empty_response.json",
|
|
||||||
"files/only_images_file_image_response.json");
|
|
||||||
|
|
||||||
analyzeDocumentStructure(LayoutParsingType.REDACT_MANAGER, request);
|
analyzeDocumentStructure(LayoutParsingType.REDACT_MANAGER, request);
|
||||||
AnalyzeResult result = analyzeService.analyze(request);
|
AnalyzeResult result = analyzeService.analyze(request);
|
||||||
|
|||||||
@ -11,10 +11,10 @@ public class TextNormalizationUtilitiesTest {
|
|||||||
public void testHyphenRemoval() {
|
public void testHyphenRemoval() {
|
||||||
|
|
||||||
String test = "Without these peo-\nple, this conference would not happen";
|
String test = "Without these peo-\nple, this conference would not happen";
|
||||||
Assertions.assertThat(TextNormalizationUtilities.removeHyphenLineBreaks(test)).contains("people");
|
Assertions.assertThat(TextNormalizationUtilities.removeHyphenLinebreaks(test)).contains("people");
|
||||||
|
|
||||||
test = "Die\t\nFreiwillige\t Versicherung\t endet\t zudem\t für\t den\t ein\u00AD\nzelnen\tVersicherten\tmit\tder\tAufhebung\tdes\tVertra-\nges,\t seiner\t Unterstellung\t unter\t die\t obligatorische\t\nVersicherung\t oder\t seinem\t Ausschluss.";
|
test = "Die\t\nFreiwillige\t Versicherung\t endet\t zudem\t für\t den\t ein\u00AD\nzelnen\tVersicherten\tmit\tder\tAufhebung\tdes\tVertra-\nges,\t seiner\t Unterstellung\t unter\t die\t obligatorische\t\nVersicherung\t oder\t seinem\t Ausschluss.";
|
||||||
Assertions.assertThat(TextNormalizationUtilities.removeHyphenLineBreaks(test)).contains("einzelnen", "Vertrages");
|
Assertions.assertThat(TextNormalizationUtilities.removeHyphenLinebreaks(test)).contains("einzelnen", "Vertrages");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user