RED-6009: Document Tree Structure

This commit is contained in:
Kilian Schüttler 2023-06-15 21:44:25 +02:00
parent 7694c11dd6
commit bc605aec8c
5 changed files with 28 additions and 6 deletions

View File

@ -147,6 +147,9 @@ public class Boundary implements Comparable<Boundary> {
*/
public Boundary trim(TextBlock textBlock) {
if (textBlock.isEmpty()) {
return textBlock.getBoundary();
}
int trimmedStart = this.start;
while (Character.isWhitespace(textBlock.charAt(trimmedStart))) {
trimmedStart++;

View File

@ -149,6 +149,10 @@ public class ConcatenatedTextBlock implements TextBlock {
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
if (textBlocks.isEmpty()) {
return new HashMap<>();
}
if (textBlocks.size() == 1) {
return textBlocks.get(0).getPositionsPerPage(stringBoundary);
}

View File

@ -188,13 +188,17 @@ public class EntityCreationService {
}
// Creates a redaction entity of the given type for the text covered by a semantic node.
// NOTE(review): this listing appears to be a commit diff whose +/- markers were stripped, so the
// two signature lines and the two `byBoundary` return variants below are presumably the removed
// and the added version of the same statement — confirm against the repository before editing.
// Earlier form: hands back the entity directly.
public RedactionEntity bySemanticNode(SemanticNode node, String type, EntityType entityType) {
// Later form: wraps the result in an Optional so that "no entity" can be reported to the caller.
public Optional<RedactionEntity> bySemanticNode(SemanticNode node, String type, EntityType entityType) {
// Start from the boundary of the node's whole text block.
Boundary boundary = node.getTextBlock().getBoundary();
// For a non-empty boundary, rebuild it with the end moved back by one
// (presumably to drop a trailing separator character — TODO confirm).
if (boundary.length() > 0) {
boundary = new Boundary(boundary.start(), boundary.end() - 1);
}
// Earlier form: always builds an entity from the boundary, without validating it.
return byBoundary(boundary, type, entityType, node);
// Later form: return Optional.empty() when isValidEntityBoundary rejects the boundary,
// otherwise wrap the entity built by byBoundary.
if (!isValidEntityBoundary(node.getTextBlock(), boundary)) {
return Optional.empty();
}
return Optional.of(byBoundary(boundary, type, entityType, node));
}
@ -294,7 +298,7 @@ public class EntityCreationService {
// Decides whether a boundary inside the given text block may be turned into an entity.
// NOTE(review): the two return statements below look like the removed and the added line of a
// diff hunk whose +/- markers were stripped — confirm against the repository before editing.
private boolean isValidEntityBoundary(TextBlock textBlock, Boundary boundary) {
// Earlier form: valid whenever boundaryIsSurroundedBySeparators accepts the boundary.
return boundaryIsSurroundedBySeparators(textBlock, boundary);
// Later form: additionally requires a boundary length greater than zero; the length check runs
// first, so the separator check is skipped for zero-length boundaries.
return boundary.length() > 0 && boundaryIsSurroundedBySeparators(textBlock, boundary);
}

View File

@ -2,6 +2,8 @@ package com.iqser.red.service.redaction.v1.server;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.ArgumentMatchers.anyLong;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.when;
import static org.wildfly.common.Assert.assertTrue;
@ -220,7 +222,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
@Test
public void titleExtraction() throws IOException {
AnalyzeRequest request = uploadFileToStorage("files/new/crafted document.pdf");
AnalyzeRequest request = uploadFileToStorage("files/new/S157.pdf");
System.out.println("Start Full integration test");
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
System.out.println("Finished structure analysis");
@ -298,7 +300,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
ClassLoader loader = getClass().getClassLoader();
URL url = loader.getResource("files");
Path path = Paths.get(URI.create(url.toString()));
when(dictionaryClient.getDictionaryForType(anyString(), anyLong())).thenReturn(new Type());
Files.walk(path)//
.filter(currentPath -> currentPath.toString().endsWith(".pdf"))//
.map(currentPath -> path.getParent().relativize(currentPath))//

View File

@ -6,10 +6,11 @@ import static com.iqser.red.service.redaction.v1.server.layoutparsing.document.u
import java.util.List;
import java.util.LinkedList;
import java.util.Set
import java.util.Set;
import java.util.stream.Collectors;
import java.util.Collection;
import java.util.stream.Stream;
import java.util.Optional;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.*;
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.*;
@ -249,6 +250,8 @@ rule "CBI.9.0: Redact all Cell's with Header Author(s) as CBI_author (non verteb
then
$table.streamTableCellsWithHeader("Author(s)")
.map(tableCell -> entityCreationService.bySemanticNode(tableCell, "CBI_author", EntityType.ENTITY))
.filter(Optional::isPresent)
.map(Optional::get)
.forEach(redactionEntity -> {
redactionEntity.setRedaction(true);
redactionEntity.addMatchedRule(9);
@ -267,6 +270,8 @@ rule "CBI.9.1: Redact all Cell's with Header Author as CBI_author (non vertebrat
then
$table.streamTableCellsWithHeader("Author")
.map(tableCell -> entityCreationService.bySemanticNode(tableCell, "CBI_author", EntityType.ENTITY))
.filter(Optional::isPresent)
.map(Optional::get)
.forEach(redactionEntity -> {
redactionEntity.setRedaction(true);
redactionEntity.addMatchedRule(9);
@ -300,6 +305,8 @@ rule "CBI.12.0: Add all Cell's with Header Author(s) as CBI_author"
$table.streamTableCellsWithHeader("Author")
)
.map(tableCell -> entityCreationService.bySemanticNode(tableCell, "CBI_author", EntityType.ENTITY))
.filter(Optional::isPresent)
.map(Optional::get)
.forEach(redactionEntity -> {
redactionEntity.addMatchedRule(12);
redactionEntity.setRedactionReason("Author(s) header found");
@ -998,6 +1005,8 @@ rule "ETC.6.0: Redact CAS Number"
then
$table.streamTableCellsWithHeader("Sample #")
.map(tableCell -> entityCreationService.bySemanticNode(tableCell, "PII", EntityType.ENTITY))
.filter(Optional::isPresent)
.map(Optional::get)
.forEach(redactionEntity -> {
redactionEntity.setRedaction(true);
redactionEntity.addMatchedRule(101);