RED-6009: Document Tree Structure
This commit is contained in:
parent
7694c11dd6
commit
bc605aec8c
@ -147,6 +147,9 @@ public class Boundary implements Comparable<Boundary> {
|
||||
*/
|
||||
public Boundary trim(TextBlock textBlock) {
|
||||
|
||||
if (textBlock.isEmpty()) {
|
||||
return textBlock.getBoundary();
|
||||
}
|
||||
int trimmedStart = this.start;
|
||||
while (Character.isWhitespace(textBlock.charAt(trimmedStart))) {
|
||||
trimmedStart++;
|
||||
|
||||
@ -149,6 +149,10 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
|
||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
|
||||
|
||||
if (textBlocks.isEmpty()) {
|
||||
return new HashMap<>();
|
||||
}
|
||||
|
||||
if (textBlocks.size() == 1) {
|
||||
return textBlocks.get(0).getPositionsPerPage(stringBoundary);
|
||||
}
|
||||
|
||||
@ -188,13 +188,17 @@ public class EntityCreationService {
|
||||
}
|
||||
|
||||
|
||||
public RedactionEntity bySemanticNode(SemanticNode node, String type, EntityType entityType) {
|
||||
public Optional<RedactionEntity> bySemanticNode(SemanticNode node, String type, EntityType entityType) {
|
||||
|
||||
Boundary boundary = node.getTextBlock().getBoundary();
|
||||
|
||||
if (boundary.length() > 0) {
|
||||
boundary = new Boundary(boundary.start(), boundary.end() - 1);
|
||||
}
|
||||
return byBoundary(boundary, type, entityType, node);
|
||||
if (!isValidEntityBoundary(node.getTextBlock(), boundary)) {
|
||||
return Optional.empty();
|
||||
}
|
||||
return Optional.of(byBoundary(boundary, type, entityType, node));
|
||||
}
|
||||
|
||||
|
||||
@ -294,7 +298,7 @@ public class EntityCreationService {
|
||||
|
||||
private boolean isValidEntityBoundary(TextBlock textBlock, Boundary boundary) {
|
||||
|
||||
return boundaryIsSurroundedBySeparators(textBlock, boundary);
|
||||
return boundary.length() > 0 && boundaryIsSurroundedBySeparators(textBlock, boundary);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -2,6 +2,8 @@ package com.iqser.red.service.redaction.v1.server;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.mockito.ArgumentMatchers.anyLong;
|
||||
import static org.mockito.ArgumentMatchers.anyString;
|
||||
import static org.mockito.Mockito.when;
|
||||
import static org.wildfly.common.Assert.assertTrue;
|
||||
|
||||
@ -220,7 +222,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
|
||||
@Test
|
||||
public void titleExtraction() throws IOException {
|
||||
|
||||
AnalyzeRequest request = uploadFileToStorage("files/new/crafted document.pdf");
|
||||
AnalyzeRequest request = uploadFileToStorage("files/new/S157.pdf");
|
||||
System.out.println("Start Full integration test");
|
||||
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
|
||||
System.out.println("Finished structure analysis");
|
||||
@ -298,7 +300,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
|
||||
ClassLoader loader = getClass().getClassLoader();
|
||||
URL url = loader.getResource("files");
|
||||
Path path = Paths.get(URI.create(url.toString()));
|
||||
|
||||
when(dictionaryClient.getDictionaryForType(anyString(), anyLong())).thenReturn(new Type());
|
||||
Files.walk(path)//
|
||||
.filter(currentPath -> currentPath.toString().endsWith(".pdf"))//
|
||||
.map(currentPath -> path.getParent().relativize(currentPath))//
|
||||
|
||||
@ -6,10 +6,11 @@ import static com.iqser.red.service.redaction.v1.server.layoutparsing.document.u
|
||||
|
||||
import java.util.List;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Set
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.Collection;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.Optional;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.*;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.*;
|
||||
@ -249,6 +250,8 @@ rule "CBI.9.0: Redact all Cell's with Header Author(s) as CBI_author (non verteb
|
||||
then
|
||||
$table.streamTableCellsWithHeader("Author(s)")
|
||||
.map(tableCell -> entityCreationService.bySemanticNode(tableCell, "CBI_author", EntityType.ENTITY))
|
||||
.filter(Optional::isPresent)
|
||||
.map(Optional::get)
|
||||
.forEach(redactionEntity -> {
|
||||
redactionEntity.setRedaction(true);
|
||||
redactionEntity.addMatchedRule(9);
|
||||
@ -267,6 +270,8 @@ rule "CBI.9.1: Redact all Cell's with Header Author as CBI_author (non vertebrat
|
||||
then
|
||||
$table.streamTableCellsWithHeader("Author")
|
||||
.map(tableCell -> entityCreationService.bySemanticNode(tableCell, "CBI_author", EntityType.ENTITY))
|
||||
.filter(Optional::isPresent)
|
||||
.map(Optional::get)
|
||||
.forEach(redactionEntity -> {
|
||||
redactionEntity.setRedaction(true);
|
||||
redactionEntity.addMatchedRule(9);
|
||||
@ -300,6 +305,8 @@ rule "CBI.12.0: Add all Cell's with Header Author(s) as CBI_author"
|
||||
$table.streamTableCellsWithHeader("Author")
|
||||
)
|
||||
.map(tableCell -> entityCreationService.bySemanticNode(tableCell, "CBI_author", EntityType.ENTITY))
|
||||
.filter(Optional::isPresent)
|
||||
.map(Optional::get)
|
||||
.forEach(redactionEntity -> {
|
||||
redactionEntity.addMatchedRule(12);
|
||||
redactionEntity.setRedactionReason("Author(s) header found");
|
||||
@ -998,6 +1005,8 @@ rule "ETC.6.0: Redact CAS Number"
|
||||
then
|
||||
$table.streamTableCellsWithHeader("Sample #")
|
||||
.map(tableCell -> entityCreationService.bySemanticNode(tableCell, "PII", EntityType.ENTITY))
|
||||
.filter(Optional::isPresent)
|
||||
.map(Optional::get)
|
||||
.forEach(redactionEntity -> {
|
||||
redactionEntity.setRedaction(true);
|
||||
redactionEntity.addMatchedRule(101);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user