RED-9221 - Fix component extraction in tables where rows and columns are not 1:1 #418

Merged
andrei.isvoran.ext merged 1 commit from RED-9221 into master 2024-06-06 10:31:38 +02:00
13 changed files with 246 additions and 108 deletions

View File

@ -4,10 +4,11 @@ plugins {
}
description = "redaction-service-api-v1"
val persistenceServiceVersion = "2.439.0"
dependencies {
implementation("org.springframework:spring-web:6.0.12")
implementation("com.iqser.red.service:persistence-service-internal-api-v1:2.411.0")
implementation("com.iqser.red.service:persistence-service-internal-api-v1:${persistenceServiceVersion}")
}
publishing {

View File

@ -16,7 +16,7 @@ val layoutParserVersion = "0.131.0"
val jacksonVersion = "2.15.2"
val droolsVersion = "9.44.0.Final"
val pdfBoxVersion = "3.0.0"
val persistenceServiceVersion = "2.429.0"
val persistenceServiceVersion = "2.439.0"
val springBootStarterVersion = "3.1.5"
val springCloudVersion = "4.0.4"
val testContainersVersion = "1.19.7"

View File

@ -68,7 +68,7 @@ public class Entity {
Set<String> importedRedactionIntersections;
public static Entity fromEntityLogEntry(EntityLogEntry e, Document document) {
public static Entity fromEntityLogEntry(EntityLogEntry e, Document document, int startOffset, int endOffset) {
return Entity.builder()
.id(e.getId())
@ -86,8 +86,8 @@ public class Entity {
.containingNode(document.getDocumentTree().getEntryById(e.getContainingNodeId()).getNode())
.textBefore(e.getTextBefore())
.textAfter(e.getTextAfter())
.startOffset(e.getStartOffset())
.endOffset(e.getEndOffset())
.startOffset(startOffset)
.endOffset(endOffset)
.length(Optional.ofNullable(e.getValue())
.orElse("").length())
.imageHasTransparency(e.isImageHasTransparency())

View File

@ -166,6 +166,42 @@ public class DocumentTree {
}
/**
 * Searches the subtree below the entry identified by {@code treeId} for a table cell
 * matching the given offsets.
 *
 * @param treeId tree id of the entry whose children are searched
 * @param start  start offset; used to pick the child to descend into
 * @param end    end offset; a cell only matches if its text range contains this offset
 * @return the matching table cell, or an empty Optional if none is found
 */
public Optional<TableCell> findTableCellInTable(List<Integer> treeId, int start, int end) {
    List<Entry> childEntries = getEntryById(treeId).getChildren();
    return findTableCellInTableRecursively(childEntries, start, end);
}
/**
 * Descends through {@code entries} looking for a {@link TableCell} whose text range contains
 * {@code end}, preferring a match found deeper in the tree over the current entry.
 *
 * @param entries sibling entries to search
 * @param start   offset handed to the binary search that selects the entry to inspect
 * @param end     offset the matching cell's text range must contain
 * @return the deepest matching table cell along the selected path, or empty if there is none
 */
private Optional<TableCell> findTableCellInTableRecursively(List<Entry> entries, int start, int end) {
    int containingIdx = findFirstIdxOfContainingChildBinarySearch(entries, start);
    // A negative index means the binary search found no suitable child.
    if (containingIdx < 0) {
        return Optional.empty();
    }

    Entry candidate = entries.get(containingIdx);
    var node = candidate.getNode();
    boolean coversEnd = node.getTextRange().contains(end);

    // A match further down the tree always wins over the current node.
    if (!node.isLeaf()) {
        Optional<TableCell> nestedMatch = findTableCellInTableRecursively(candidate.getChildren(), start, end);
        if (nestedMatch.isPresent()) {
            return nestedMatch;
        }
    }

    // Nothing deeper matched: fall back to the current node if it is itself a matching cell.
    if (coversEnd && node instanceof TableCell tableCell) {
        return Optional.of(tableCell);
    }
    return Optional.empty();
}
private int findFirstIdxOfContainingChildBinarySearch(List<Entry> childNodes, int start) {
int low = 0;

View File

@ -12,6 +12,7 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.AnalyzeRequest;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.DuplicatedTextRange;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.Engine;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntityLog;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.entitylog.EntityLogChanges;
@ -286,6 +287,10 @@ public class EntityLogCreatorService {
.textBefore(entity.getTextBefore())
.startOffset(entity.getTextRange().start())
.endOffset(entity.getTextRange().end())
.duplicatedTextRanges(entity.getDuplicateTextRanges()
.stream()
.map(textRange -> DuplicatedTextRange.builder().start(textRange.start()).end(textRange.end()).build())
.toList())
.dossierDictionaryEntry(entity.isDossierDictionaryEntry())
.engines(getEngines(entity.getEngines(), entity.getManualOverwrite()))
//imported is no longer used, frontend should check engines

View File

@ -31,7 +31,7 @@ public class ComponentLogCreatorService {
});
List<ComponentLogEntry> componentLogComponents = map.entrySet()
.stream()
.map(entry -> new ComponentLogEntry(entry.getKey(), entry.getValue()))
.map(entry -> new ComponentLogEntry(entry.getKey(), entry.getValue(), false))
.toList();
return new ComponentLog(analysisNumber, componentRulesVersion, componentLogComponents);
}
@ -41,7 +41,6 @@ public class ComponentLogCreatorService {
return ComponentLogEntryValue.builder()
.value(component.getValue())
.originalValue(component.getValue())
.componentRuleId(component.getMatchedRule().toString())
.valueDescription(component.getValueDescription())
.componentLogEntityReferences(toComponentEntityReferences(component.getReferences()

View File

@ -1,5 +1,11 @@
package com.iqser.red.service.redaction.v1.server.service.document;
import static com.iqser.red.service.redaction.v1.server.utils.ComponentCreationUtils.findEntitiesFromFirstSection;
import static com.iqser.red.service.redaction.v1.server.utils.ComponentCreationUtils.findEntitiesFromLongestSection;
import static com.iqser.red.service.redaction.v1.server.utils.ComponentCreationUtils.getFirstTableCell;
import static com.iqser.red.service.redaction.v1.server.utils.ComponentCreationUtils.joinEntitiesOnSameRow;
import static com.iqser.red.service.redaction.v1.server.utils.ComponentCreationUtils.joinTypes;
import java.text.BreakIterator;
import java.util.Collection;
import java.util.Collections;
@ -9,7 +15,6 @@ import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
@ -19,10 +24,9 @@ import org.kie.api.runtime.KieSession;
import com.iqser.red.service.redaction.v1.server.model.component.Component;
import com.iqser.red.service.redaction.v1.server.model.component.Entity;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
import com.iqser.red.service.redaction.v1.server.model.drools.RuleIdentifier;
import com.iqser.red.service.redaction.v1.server.utils.ComponentCreationUtils;
import com.iqser.red.service.redaction.v1.server.utils.DateConverter;
import lombok.AccessLevel;
@ -37,24 +41,6 @@ public class ComponentCreationService {
Set<Entity> referencedEntities = new HashSet<>();
private static List<Entity> findEntitiesFromLongestSection(Collection<Entity> entities) {
var entitiesBySection = entities.stream()
.collect(Collectors.groupingBy(entity -> entity.getContainingNode().getHighestParent()));
Optional<SemanticNode> longestSection = entitiesBySection.entrySet()
.stream()
.sorted(Comparator.comparingInt(ComponentCreationService::getTotalLengthOfEntities).reversed())
.map(Map.Entry::getKey)
.findFirst();
if (longestSection.isEmpty()) {
return Collections.emptyList();
}
return entitiesBySection.get(longestSection.get());
}
/**
* Joins entity values, and creates a component from the result.
*
@ -87,15 +73,6 @@ public class ComponentCreationService {
}
private static String joinTypes(Collection<Entity> entities) {
return entities.stream()
.map(Entity::getType)
.distinct()
.collect(Collectors.joining(", "));
}
/**
* Creates a new component with the given parameters and inserts it into the kieSession.
*
@ -146,20 +123,6 @@ public class ComponentCreationService {
}
private static List<Entity> findEntitiesFromFirstSection(Collection<Entity> entities) {
var entitiesBySection = entities.stream()
.collect(Collectors.groupingBy(entity -> entity.getContainingNode().getHighestParent()));
Optional<SemanticNode> firstSection = entitiesBySection.keySet()
.stream()
.min(SemanticNodeComparators.first());
if (firstSection.isEmpty()) {
return Collections.emptyList();
}
return entitiesBySection.get(firstSection.get());
}
/**
* Joins unique entity values from the first section entities appear in, and creates a component from the result.
*
@ -252,14 +215,6 @@ public class ComponentCreationService {
}
private static int getTotalLengthOfEntities(Map.Entry<SemanticNode, List<Entity>> entry) {
return entry.getValue()
.stream()
.mapToInt(Entity::getLength).sum();
}
/**
* Joins unique entity values with delimiter ', ' from the section with the longest combined entity values only, and creates a component from the result.
*
@ -329,7 +284,7 @@ public class ComponentCreationService {
public void rowValueCount(String ruleIdentifier, String name, Collection<Entity> entities) {
entities.stream()
.collect(Collectors.groupingBy(this::getFirstTable))
.collect(Collectors.groupingBy(ComponentCreationUtils::getFirstTable))
.forEach((optionalTable, groupedEntities) -> {
if (optionalTable.isEmpty()) {
@ -491,60 +446,21 @@ public class ComponentCreationService {
.sorted(Comparator.reverseOrder())
.distinct()
.collect(Collectors.joining(", "));
String valueDescription = String.format("Combine values of %s that are in same table row", types);
entities.stream()
.collect(Collectors.groupingBy(this::getFirstTable))
.collect(Collectors.groupingBy(ComponentCreationUtils::getFirstTable))
.forEach((optionalTable, groupedEntities) -> {
if (optionalTable.isEmpty()) {
groupedEntities.forEach(entity -> create(ruleIdentifier, name, entity.getValue(), valueDescription, entity));
return;
}
groupedEntities.stream()
.filter(entity -> entity.getContainingNode() instanceof TableCell)
.collect(Collectors.groupingBy(entity -> ((TableCell) entity.getContainingNode()).getRow())).entrySet()
.stream()
.sorted(Comparator.comparingInt(Map.Entry::getKey))
.map(Map.Entry::getValue)
.forEach(entitiesInSameRow -> create(ruleIdentifier,
name,
entitiesInSameRow.stream()
.sorted(EntityComparators.first())
.map(Entity::getValue)
.collect(Collectors.joining(delimiter)),
valueDescription,
entitiesInSameRow));
joinEntitiesOnSameRow(ruleIdentifier, name, groupedEntities, valueDescription, delimiter, this::create);
});
}
private Optional<Table> getFirstTable(Entity entity) {
SemanticNode node = entity.getContainingNode();
while (!(node instanceof Table)) {
if (!node.hasParent()) {
return Optional.empty();
}
node = node.getParent();
}
return Optional.of((Table) node);
}
private Optional<TableCell> getFirstTableCell(Entity entity) {
SemanticNode node = entity.getContainingNode();
while (!(node instanceof TableCell)) {
if (!node.hasParent()) {
return Optional.empty();
}
node = node.getParent();
}
return Optional.of((TableCell) node);
}
/**
* Creates a new component with the given rule identifier, name, value, and value description.
* If the component is part of a table, it also takes a list of entities that belong to the same table row.

View File

@ -979,8 +979,8 @@ public class EntityCreationService {
return Optional.empty();
}
TextEntity entity = TextEntity.initialEntityNode(trimmedTextRange, type, entityType, node);
if (node.getEntities().contains(entity)) {
Optional<TextEntity> optionalTextEntity = node.getEntities()
if (node.getDocumentTree().getRoot().getNode().getEntities().contains(entity)) {
Optional<TextEntity> optionalTextEntity = node.getDocumentTree().getRoot().getNode().getEntities()
.stream()
.filter(e -> e.equals(entity) && e.type().equals(type))
.peek(e -> e.addEngines(engines))

View File

@ -1,5 +1,6 @@
package com.iqser.red.service.redaction.v1.server.service.drools;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
@ -67,8 +68,19 @@ public class ComponentDroolsExecutionService {
entityLog.getEntityLogEntry()
.stream()
.filter(this::isApplied)
.map(entry -> Entity.fromEntityLogEntry(entry, document))
.filter(entityLogEntry -> entityLogEntry.getState().equals(EntryState.APPLIED))
.flatMap(entry -> {
List<Entity> entities = new ArrayList<>();
entities.add(Entity.fromEntityLogEntry(entry, document, entry.getStartOffset(), entry.getEndOffset()));
if (entry.getDuplicatedTextRanges() != null && !entry.getDuplicatedTextRanges().isEmpty()) {
entry.getDuplicatedTextRanges().forEach(duplicatedTextRange -> {
entities.add(Entity.fromEntityLogEntry(entry, document, duplicatedTextRange.getStart(), duplicatedTextRange.getEnd()));
});
}
return entities.stream();
})
.forEach(kieSession::insert);
fileAttributes.stream()
.filter(f -> f.getValue() != null)
.forEach(kieSession::insert);

View File

@ -0,0 +1,136 @@
package com.iqser.red.service.redaction.v1.server.utils;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.model.component.Entity;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
import com.iqser.red.service.redaction.v1.server.service.document.EntityComparators;
import com.iqser.red.service.redaction.v1.server.service.document.SemanticNodeComparators;
/**
 * Static helpers used when creating components from entities: selecting entities by section,
 * locating the enclosing table or table cell of an entity, and joining entity values per table row.
 */
public class ComponentCreationUtils {

    /** Static utility holder; not meant to be instantiated. */
    private ComponentCreationUtils() {
    }


    /**
     * Returns the containing node of the first entity (in iteration order) as a {@link TableCell}.
     *
     * @param uniqueEntities entities to take the first element from
     * @return the containing node of the first entity, cast to {@link TableCell}
     * @throws IllegalArgumentException if {@code uniqueEntities} is empty
     * @throws ClassCastException       if the first entity's containing node is not a {@link TableCell}
     */
    public static TableCell getFirstTableCell(Collection<Entity> uniqueEntities) {
        return (TableCell) uniqueEntities.stream()
                .findFirst()
                .orElseThrow(() -> new IllegalArgumentException("No entities found"))
                .getContainingNode();
    }


    /**
     * Joins the distinct entity types, in encounter order, separated by {@code ", "}.
     *
     * @param entities entities whose types are joined
     * @return the joined type names; an empty string if {@code entities} is empty
     */
    public static String joinTypes(Collection<Entity> entities) {
        return entities.stream()
                .map(Entity::getType)
                .distinct()
                .collect(Collectors.joining(", "));
    }


    /**
     * Groups the entities by the highest parent of their containing node and returns the group
     * whose entities have the largest summed length.
     *
     * @param entities entities to group
     * @return the entities of the section with the largest total entity length,
     *         or an empty list if {@code entities} is empty
     */
    public static List<Entity> findEntitiesFromLongestSection(Collection<Entity> entities) {
        Map<SemanticNode, List<Entity>> entitiesBySection = entities.stream()
                .collect(Collectors.groupingBy(entity -> entity.getContainingNode().getHighestParent()));

        // A single max pass replaces sorting all sections just to read the first one.
        return entitiesBySection.entrySet()
                .stream()
                .max(Comparator.comparingInt(ComponentCreationUtils::getTotalLengthOfEntities))
                .map(Map.Entry::getValue)
                .orElseGet(Collections::emptyList);
    }


    /**
     * Groups the entities by the highest parent of their containing node and returns the group
     * belonging to the section that is smallest according to {@code SemanticNodeComparators.first()}.
     *
     * @param entities entities to group
     * @return the entities of the first section, or an empty list if {@code entities} is empty
     */
    public static List<Entity> findEntitiesFromFirstSection(Collection<Entity> entities) {
        Map<SemanticNode, List<Entity>> entitiesBySection = entities.stream()
                .collect(Collectors.groupingBy(entity -> entity.getContainingNode().getHighestParent()));

        return entitiesBySection.keySet()
                .stream()
                .min(SemanticNodeComparators.first())
                .map(entitiesBySection::get)
                .orElseGet(Collections::emptyList);
    }


    /**
     * Walks up from the entity's containing node until a {@link TableCell} is reached.
     *
     * @param entity entity whose ancestors are inspected
     * @return the nearest {@link TableCell} (the containing node itself or one of its ancestors),
     *         or empty if the root is reached without finding one
     */
    public static Optional<TableCell> getFirstTableCell(Entity entity) {
        SemanticNode node = entity.getContainingNode();
        while (!(node instanceof TableCell)) {
            if (!node.hasParent()) {
                return Optional.empty();
            }
            node = node.getParent();
        }
        return Optional.of((TableCell) node);
    }


    /**
     * Walks up from the entity's containing node until a {@link Table} is reached.
     *
     * @param entity entity whose ancestors are inspected
     * @return the nearest {@link Table} (the containing node itself or one of its ancestors),
     *         or empty if the root is reached without finding one
     */
    public static Optional<Table> getFirstTable(Entity entity) {
        SemanticNode node = entity.getContainingNode();
        while (!(node instanceof Table)) {
            if (!node.hasParent()) {
                return Optional.empty();
            }
            node = node.getParent();
        }
        return Optional.of((Table) node);
    }


    /**
     * Groups entities by table row and invokes {@code create} once per row, in ascending row order.
     * <p>
     * Only entities whose containing node is a {@link TableCell} or a {@link Table} are considered;
     * all others are dropped. Within a row the entity values are ordered by
     * {@code EntityComparators.first()} and joined with {@code delimiter}.
     *
     * @param ruleIdentifier   passed through to {@code create} unchanged
     * @param name             passed through to {@code create} unchanged
     * @param groupedEntities  entities to distribute over rows
     * @param valueDescription passed through to {@code create} unchanged
     * @param delimiter        separator placed between the entity values of one row
     * @param create           callback receiving rule identifier, name, joined row value,
     *                         value description and the entities of that row
     */
    public static void joinEntitiesOnSameRow(String ruleIdentifier,
                                             String name,
                                             List<Entity> groupedEntities,
                                             String valueDescription,
                                             String delimiter,
                                             QuintConsumer<String, String, String, String, Collection<Entity>> create) {
        groupedEntities.stream()
                .filter(entity -> entity.getContainingNode() instanceof TableCell || entity.getContainingNode() instanceof Table)
                .collect(Collectors.groupingBy(ComponentCreationUtils::resolveRow))
                .entrySet()
                .stream()
                .sorted(Comparator.comparingInt(Map.Entry::getKey))
                .map(Map.Entry::getValue)
                .forEach(entitiesInSameRow -> create.accept(ruleIdentifier,
                        name,
                        entitiesInSameRow.stream()
                                .sorted(EntityComparators.first())
                                .map(Entity::getValue)
                                .collect(Collectors.joining(delimiter)),
                        valueDescription,
                        entitiesInSameRow));
    }


    /**
     * Sums the lengths of all entities in the given map entry.
     *
     * @param entry a section together with the entities found in it
     * @return the sum of {@code Entity::getLength} over the entry's entities
     */
    public static int getTotalLengthOfEntities(Map.Entry<SemanticNode, List<Entity>> entry) {
        return entry.getValue()
                .stream()
                .mapToInt(Entity::getLength)
                .sum();
    }


    /**
     * Determines the table row an entity belongs to. If the entity is contained directly in a
     * cell, that cell's row is used; otherwise the cell is looked up in the document tree by the
     * entity's start and end offsets.
     */
    private static int resolveRow(Entity entity) {
        SemanticNode containingNode = entity.getContainingNode();
        if (containingNode instanceof TableCell tableCell) {
            return tableCell.getRow();
        }

        DocumentTree documentTree = containingNode.getDocumentTree();
        // NOTE(review): when no cell is found the entity is silently assigned to row 0, so it is
        // merged with whatever else lands in row 0 - confirm this fallback is intended.
        return documentTree.findTableCellInTable(containingNode.getTreeId(), entity.getStartOffset(), entity.getEndOffset())
                .map(TableCell::getRow)
                .orElse(0);
    }

}

View File

@ -0,0 +1,10 @@
package com.iqser.red.service.redaction.v1.server.utils;
import java.util.Collection;
/**
 * An operation that accepts five arguments and returns no result.
 * The fifth argument is constrained to be a {@link Collection}.
 *
 * @param <T> type of the first argument
 * @param <U> type of the second argument
 * @param <V> type of the third argument
 * @param <W> type of the fourth argument
 * @param <X> type of the fifth argument; must be a collection
 */
@FunctionalInterface
public interface QuintConsumer<T, U, V, W, X extends Collection<?>> {
/**
 * Performs this operation on the given arguments.
 *
 * @param t the first argument
 * @param u the second argument
 * @param v the third argument
 * @param w the fourth argument
 * @param x the fifth argument, a collection
 */
void accept(T t, U u, V v, W w, X x);
}

View File

@ -11,6 +11,7 @@ import java.time.OffsetDateTime;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
@ -32,6 +33,7 @@ import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.iqser.red.service.persistence.service.v1.api.shared.model.AnalyzeRequest;
import com.iqser.red.service.persistence.service.v1.api.shared.model.AnalyzeResult;
import com.iqser.red.service.persistence.service.v1.api.shared.model.RuleFileType;
import com.iqser.red.service.persistence.service.v1.api.shared.model.analysislog.componentlog.ComponentLogEntryValue;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.ManualRedactions;
import com.iqser.red.service.persistence.service.v1.api.shared.model.annotations.entitymapped.IdRemoval;
import com.iqser.red.service.persistence.service.v1.api.shared.model.common.JSONPrimitive;
@ -286,4 +288,25 @@ public class DocumineFloraTest extends AbstractRedactionIntegrationTest {
.get(0).getValue());
}
/**
 * Runs structure analysis and full analysis on the acute oral toxicity sample document and
 * verifies that exactly five "Dose_Mortality" component values are produced, each of the form
 * "5000, S" or "5000, D".
 */
@Test
public void testDoseMortalityExtraction() {
    AnalyzeRequest request = uploadFileToStorage("files/syngenta/CustomerFiles/Documine/Flora/VV-547525_Toxicidade_Oral_Aguda.pdf");
    System.out.println("Start Full integration test");
    analyzeDocumentStructure(LayoutParsingType.DOCUMINE, request);
    System.out.println("Finished structure analysis");
    analyzeService.analyze(request);
    System.out.println("Finished analysis");

    var componentLog = redactionStorageService.getComponentLog(TEST_DOSSIER_ID, TEST_FILE_ID);
    // Fail with a descriptive assertion error instead of a bare NoSuchElementException.
    var doseMortality = componentLog.getComponentLogEntries()
            .stream()
            .filter(componentLogEntry -> "Dose_Mortality".equals(componentLogEntry.getName()))
            .findFirst()
            .orElseThrow(() -> new AssertionError("No Dose_Mortality entry found in component log"));

    // JUnit expects (expected, actual); the reversed order produced a misleading failure message.
    assertEquals(5, doseMortality.getComponentValues().size());

    Pattern pattern = Pattern.compile("^5000, [SD]$");
    boolean allMatch = doseMortality.getComponentValues()
            .stream()
            .map(ComponentLogEntryValue::getValue)
            .allMatch(pattern.asPredicate());
    assertTrue(allMatch);
}
}

@ -1 +1 @@
Subproject commit 21fefb64bf27ca2b3329a6c69d90a27450b17930
Subproject commit 5705cc0782605fdca5dfff134b436f7143c9e421