RED-9374: NER entities are at wrong locations

This commit is contained in:
Maverick Studer 2024-06-19 12:15:50 +02:00
parent 36c7a61265
commit 7918db25a9
6 changed files with 26070 additions and 783 deletions

View File

@ -12,7 +12,7 @@ plugins {
description = "redaction-service-server-v1"
val layoutParserVersion = "0.139.0"
val layoutParserVersion = "0.141.0"
val jacksonVersion = "2.15.2"
val droolsVersion = "9.44.0.Final"
val pdfBoxVersion = "3.0.0"

View File

@ -11,7 +11,7 @@ import lombok.NoArgsConstructor;
@NoArgsConstructor
public class EntityRecognitionSection {
private int sectionNumber;
private String sectionNumber;
private String text;
}

View File

@ -13,6 +13,6 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
public class NerEntitiesModel {
private Map<Integer, List<EntityRecognitionEntity>> data = new HashMap<>();
private Map<String, List<EntityRecognitionEntity>> data = new HashMap<>();
}

View File

@ -1,10 +1,13 @@
package com.iqser.red.service.redaction.v1.server.service.document;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.client.model.EntityRecognitionEntity;
@ -14,7 +17,6 @@ import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.experimental.FieldDefaults;
@ -44,11 +46,10 @@ public class NerEntitiesAdapter {
*/
public NerEntities toNerEntities(NerEntitiesModel nerEntitiesModel, Document document) {
return new NerEntities(addOffsetsAndFlatten(getStringStartOffsetsForMainSectionsHeadersFooters(document),
nerEntitiesModel).map(nerEntityModel -> new NerEntities.NerEntity(nerEntityModel.getValue(),
new TextRange(nerEntityModel.getStartOffset(),
nerEntityModel.getEndOffset()),
nerEntityModel.getType()))
return new NerEntities(addOffsetsAndFlatten(getStringStartOffsetsForMainSectionsHeadersFooters(document), nerEntitiesModel).map(nerEntityModel -> new NerEntities.NerEntity(
nerEntityModel.getValue(),
new TextRange(nerEntityModel.getStartOffset(), nerEntityModel.getEndOffset()),
nerEntityModel.getType()))
.toList());
}
@ -161,11 +162,12 @@ public class NerEntitiesAdapter {
}
private static Stream<EntityRecognitionEntity> addOffsetsAndFlatten(List<Integer> stringOffsetsForMainSectionsHeadersFooters, NerEntitiesModel nerEntitiesModel) {
private static Stream<EntityRecognitionEntity> addOffsetsAndFlatten(Map<List<Integer>, Integer> stringOffsetsForMainSectionsHeadersFooters, NerEntitiesModel nerEntitiesModel) {
nerEntitiesModel.getData()
.forEach((sectionNumber, listOfNerEntities) -> listOfNerEntities.forEach(entityRecognitionEntity -> {
int newStartOffset = entityRecognitionEntity.getStartOffset() + stringOffsetsForMainSectionsHeadersFooters.get(sectionNumber);
int newStartOffset = entityRecognitionEntity.getStartOffset() + stringOffsetsForMainSectionsHeadersFooters.getOrDefault(sectionNumberToTreeId(sectionNumber),
0);
entityRecognitionEntity.setStartOffset(newStartOffset);
entityRecognitionEntity.setEndOffset(newStartOffset + entityRecognitionEntity.getValue().length());
}));
@ -175,14 +177,19 @@ public class NerEntitiesAdapter {
}
private static List<Integer> getStringStartOffsetsForMainSectionsHeadersFooters(Document document) {
/**
 * Converts a dot-separated section number such as {@code "1.2.3"} into the list of its integer
 * components ({@code [1, 2, 3]}), which is used as the lookup key into the tree-id keyed offset map.
 *
 * <p>Whitespace around the whole value and around each component is ignored. An empty or blank
 * section number yields an empty list, which is what {@code "."} already produced because
 * {@link String#split(String)} drops trailing empty strings; previously the empty string threw
 * instead. NOTE(review): an empty list is presumably not a key of the offset map, in which case the
 * caller falls back to its default offset - confirm this is the desired handling.
 *
 * @param sectionNumber dot-separated section number; must not be {@code null}
 * @return a new list holding the parsed components in order; empty if {@code sectionNumber} is blank
 * @throws NumberFormatException if any non-blank component is not a valid integer (e.g. {@code "1..2"})
 */
private static List<Integer> sectionNumberToTreeId(String sectionNumber) {
    String trimmed = sectionNumber.strip();
    // Splitting "" would yield a single empty component and fail in parseInt, so short-circuit it.
    Stream<String> components = trimmed.isEmpty() ? Stream.empty() : Arrays.stream(trimmed.split("\\."));
    return components.map(String::strip)
            .map(Integer::parseInt)
            .collect(Collectors.toList());
}
private static Map<List<Integer>, Integer> getStringStartOffsetsForMainSectionsHeadersFooters(Document document) {
return document.streamAllSubNodes()
.filter(child -> (child.getType().equals(NodeType.FOOTER) ||child.getType().equals(NodeType.HEADER) ||child.getType().equals(NodeType.SECTION)))
.map(SemanticNode::getTextBlock)
.map(TextBlock::getTextRange)
.map(TextRange::start)
.toList();
.filter(child -> child.getType().equals(NodeType.FOOTER) || child.getType().equals(NodeType.HEADER) || child.getType().equals(NodeType.SECTION))
.collect(Collectors.toMap(SemanticNode::getTreeId, child -> child.getTextBlock().getTextRange().start()));
}
}

View File

@ -308,7 +308,7 @@ public class DocumentIEntityInsertionIntegrationTest extends BuildDocumentIntegr
assertEquals("2-[(2-(1-hydroxy-ethyl)-6methyl-phenyl-amino]propan-1-ol (", textEntity.getTextBefore());
assertEquals(" of metabolite of", textEntity.getTextAfter());
assertEquals(searchTerm, textEntity.getValue());
assertEquals(7, textEntity.getIntersectingNodes().size());
assertEquals(8, textEntity.getIntersectingNodes().size());
assertEquals("Table 2.7-1: List of substances and metabolites and related structural formula ",
textEntity.getDeepestFullyContainingNode().getHeadline().getTextBlock().getSearchText());
assertTrue(textEntity.getPages()