RED-7141: Adapted to layout parser using docstrum
This commit is contained in:
parent
fa55917a89
commit
b93f9a2c20
@ -12,7 +12,7 @@ plugins {
|
|||||||
description = "redaction-service-server-v1"
|
description = "redaction-service-server-v1"
|
||||||
|
|
||||||
|
|
||||||
val layoutParserVersion = "0.94.0"
|
val layoutParserVersion = "0.96.0"
|
||||||
val jacksonVersion = "2.15.2"
|
val jacksonVersion = "2.15.2"
|
||||||
val droolsVersion = "9.44.0.Final"
|
val droolsVersion = "9.44.0.Final"
|
||||||
val pdfBoxVersion = "3.0.0"
|
val pdfBoxVersion = "3.0.0"
|
||||||
|
|||||||
@ -57,7 +57,7 @@ public class Document implements GenericSemanticNode {
|
|||||||
public TextBlock getTextBlock() {
|
public TextBlock getTextBlock() {
|
||||||
|
|
||||||
if (textBlock == null) {
|
if (textBlock == null) {
|
||||||
textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
|
textBlock = GenericSemanticNode.super.getTextBlock();
|
||||||
}
|
}
|
||||||
return textBlock;
|
return textBlock;
|
||||||
}
|
}
|
||||||
@ -72,8 +72,7 @@ public class Document implements GenericSemanticNode {
|
|||||||
|
|
||||||
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
|
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
|
||||||
|
|
||||||
return streamAllNodes().filter(SemanticNode::isLeaf)
|
return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock);
|
||||||
.map(SemanticNode::getLeafTextBlock);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,34 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
|
||||||
|
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.experimental.SuperBuilder;
|
||||||
|
|
||||||
|
/**
 * Paragraph variant that carries a second text block, {@code unsortedLeafTextBlock},
 * next to the {@code leafTextBlock} it uses from {@link Paragraph}.
 */
@Data
|
||||||
|
@EqualsAndHashCode(callSuper = true)
|
||||||
|
@SuperBuilder
|
||||||
|
public class DuplicatedParagraph extends Paragraph {
|
||||||
|
|
||||||
|
// Second text block of this paragraph; combined with leafTextBlock in getTextBlock().
TextBlock unsortedLeafTextBlock;
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Builds the text block of this paragraph on every call (the result is not stored in a field here).
 *
 * @return the result of collecting {@code leafTextBlock} followed by {@code unsortedLeafTextBlock}
 *         with a {@link TextBlockCollector}
 */
@Override
|
||||||
|
public TextBlock getTextBlock() {
|
||||||
|
|
||||||
|
return Stream.of(leafTextBlock, unsortedLeafTextBlock).collect(new TextBlockCollector());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Delegates to the parent's string representation.
 * NOTE(review): presumably declared explicitly so that Lombok's {@code @Data} does not generate
 * a separate toString for this subclass - confirm.
 */
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return super.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -17,11 +17,12 @@ import lombok.Builder;
|
|||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.EqualsAndHashCode;
|
import lombok.EqualsAndHashCode;
|
||||||
import lombok.experimental.FieldDefaults;
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.experimental.SuperBuilder;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@Builder
|
@SuperBuilder
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
@FieldDefaults(level = AccessLevel.PROTECTED)
|
||||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||||
public class Paragraph implements GenericSemanticNode {
|
public class Paragraph implements GenericSemanticNode {
|
||||||
|
|
||||||
|
|||||||
@ -69,9 +69,7 @@ public class Section implements GenericSemanticNode {
|
|||||||
public TextBlock getTextBlock() {
|
public TextBlock getTextBlock() {
|
||||||
|
|
||||||
if (textBlock == null) {
|
if (textBlock == null) {
|
||||||
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf)
|
textBlock = GenericSemanticNode.super.getTextBlock();
|
||||||
.map(SemanticNode::getLeafTextBlock)
|
|
||||||
.collect(new TextBlockCollector());
|
|
||||||
}
|
}
|
||||||
return textBlock;
|
return textBlock;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -19,6 +19,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
|
|||||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
|
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
|
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
|
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
|
||||||
import com.iqser.red.service.redaction.v1.server.service.document.NodeVisitor;
|
import com.iqser.red.service.redaction.v1.server.service.document.NodeVisitor;
|
||||||
import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations;
|
import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations;
|
||||||
import com.iqser.red.service.redaction.v1.server.utils.RedactionSearchUtility;
|
import com.iqser.red.service.redaction.v1.server.utils.RedactionSearchUtility;
|
||||||
@ -41,7 +42,11 @@ public interface SemanticNode {
|
|||||||
*
|
*
|
||||||
* @return TextBlock containing all AtomicTextBlocks that are located under this Node.
|
* @return TextBlock containing all AtomicTextBlocks that are located under this Node.
|
||||||
*/
|
*/
|
||||||
TextBlock getTextBlock();
|
default TextBlock getTextBlock() {
|
||||||
|
|
||||||
|
// Default implementation: take every sub node that reports isLeaf(), ask it for its own text block
// and merge the results with a TextBlockCollector.
// NOTE(review): leaves are mapped through getTextBlock() as well, so leaf types presumably provide
// their own override rather than falling back to this default - confirm.
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -408,9 +408,7 @@ public class Table implements SemanticNode {
|
|||||||
public TextBlock getTextBlock() {
|
public TextBlock getTextBlock() {
|
||||||
|
|
||||||
if (textBlock == null) {
|
if (textBlock == null) {
|
||||||
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf)
|
textBlock = SemanticNode.super.getTextBlock();
|
||||||
.map(SemanticNode::getLeafTextBlock)
|
|
||||||
.collect(new TextBlockCollector());
|
|
||||||
}
|
}
|
||||||
return textBlock;
|
return textBlock;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -9,6 +9,7 @@ import java.util.NoSuchElementException;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
|
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.model.document.nodes.DuplicatedParagraph;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer;
|
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header;
|
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header;
|
||||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
|
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
|
||||||
@ -66,7 +67,7 @@ public class DocumentGraphMapper {
|
|||||||
|
|
||||||
SemanticNode node = switch (entryData.getType()) {
|
SemanticNode node = switch (entryData.getType()) {
|
||||||
case SECTION -> buildSection(context);
|
case SECTION -> buildSection(context);
|
||||||
case PARAGRAPH -> buildParagraph(context);
|
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
|
||||||
case HEADLINE -> buildHeadline(context);
|
case HEADLINE -> buildHeadline(context);
|
||||||
case HEADER -> buildHeader(context);
|
case HEADER -> buildHeader(context);
|
||||||
case FOOTER -> buildFooter(context);
|
case FOOTER -> buildFooter(context);
|
||||||
@ -148,13 +149,22 @@ public class DocumentGraphMapper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Paragraph buildParagraph(Context context) {
|
private Paragraph buildParagraph(Context context, Map<String, String> properties) {
|
||||||
|
|
||||||
|
if (PropertiesMapper.isDuplicateParagraph(properties)) {
|
||||||
|
|
||||||
|
DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
|
||||||
|
|
||||||
|
Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
|
||||||
|
duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
|
||||||
|
return duplicatedParagraph;
|
||||||
|
}
|
||||||
|
|
||||||
return Paragraph.builder().documentTree(context.documentTree).build();
|
return Paragraph.builder().documentTree(context.documentTree).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
|
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
|
||||||
|
|
||||||
return Arrays.stream(atomicTextBlockIds)
|
return Arrays.stream(atomicTextBlockIds)
|
||||||
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
|
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
|
||||||
|
|||||||
@ -49,4 +49,23 @@ public class PropertiesMapper {
|
|||||||
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static boolean isDuplicateParagraph(Map<String, String> properties) {
|
||||||
|
|
||||||
|
return properties.containsKey(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static Long[] getUnsortedTextblockIds(Map<String, String> properties) {
|
||||||
|
|
||||||
|
return toLongArray(properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static Long[] toLongArray(String ids) {
|
||||||
|
|
||||||
|
return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(",")).map(Long::valueOf).toArray(Long[]::new);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -246,6 +246,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
|
|||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
public void redactionExpansionOverlap() {
|
public void redactionExpansionOverlap() {
|
||||||
|
|
||||||
// F. Lastname, J. Doe, M. Mustermann
|
// F. Lastname, J. Doe, M. Mustermann
|
||||||
@ -297,7 +298,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
|
|||||||
@Test
|
@Test
|
||||||
public void titleExtraction() throws IOException {
|
public void titleExtraction() throws IOException {
|
||||||
|
|
||||||
AnalyzeRequest request = uploadFileToStorage("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
AnalyzeRequest request = uploadFileToStorage("files/new/crafted document.pdf");
|
||||||
System.out.println("Start Full integration test");
|
System.out.println("Start Full integration test");
|
||||||
analyzeDocumentStructure(LayoutParsingType.REDACT_MANAGER, request);
|
analyzeDocumentStructure(LayoutParsingType.REDACT_MANAGER, request);
|
||||||
System.out.println("Finished structure analysis");
|
System.out.println("Finished structure analysis");
|
||||||
|
|||||||
@ -272,7 +272,7 @@ public class DocumentIEntityInsertionIntegrationTest extends BuildDocumentIntegr
|
|||||||
public void assertTextBeforeAndTextAfterForHeadlineMetolachlor() {
|
public void assertTextBeforeAndTextAfterForHeadlineMetolachlor() {
|
||||||
|
|
||||||
Document document = buildGraph("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06");
|
Document document = buildGraph("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06");
|
||||||
String searchTerm = "absorption, distribution, metabolism";
|
String searchTerm = "of subject matter";
|
||||||
int start = document.getTextBlock().indexOf(searchTerm);
|
int start = document.getTextBlock().indexOf(searchTerm);
|
||||||
assert start != -1;
|
assert start != -1;
|
||||||
start = document.getTextBlock().indexOf(searchTerm, start + 1);
|
start = document.getTextBlock().indexOf(searchTerm, start + 1);
|
||||||
@ -282,16 +282,16 @@ public class DocumentIEntityInsertionIntegrationTest extends BuildDocumentIntegr
|
|||||||
TextEntity textEntity = TextEntity.initialEntityNode(textRange, "123", EntityType.ENTITY, document);
|
TextEntity textEntity = TextEntity.initialEntityNode(textRange, "123", EntityType.ENTITY, document);
|
||||||
entityCreationService.addEntityToGraph(textEntity, document);
|
entityCreationService.addEntityToGraph(textEntity, document);
|
||||||
|
|
||||||
assertEquals("2.6.1 Summary of ", textEntity.getTextBefore());
|
assertEquals("1 Statement ", textEntity.getTextBefore());
|
||||||
assertEquals(" and excretion in", textEntity.getTextAfter());
|
assertEquals(" and purpose for", textEntity.getTextAfter());
|
||||||
assertEquals("2.6.1 Summary of absorption, distribution, metabolism and excretion in mammals ",
|
assertEquals("1 Statement of subject matter and purpose for which this report has been prepared and background information on the application ",
|
||||||
textEntity.getDeepestFullyContainingNode().getHeadline().getTextBlock().getSearchText());
|
textEntity.getDeepestFullyContainingNode().getHeadline().getTextBlock().getSearchText());
|
||||||
assertEquals(searchTerm, textEntity.getValue());
|
assertEquals(searchTerm, textEntity.getValue());
|
||||||
assertEquals(3, textEntity.getIntersectingNodes().size());
|
assertEquals(3, textEntity.getIntersectingNodes().size());
|
||||||
assertEquals(4, textEntity.getDeepestFullyContainingNode().getNumberOnPage());
|
assertEquals(1, textEntity.getDeepestFullyContainingNode().getNumberOnPage());
|
||||||
assertTrue(textEntity.getPages()
|
assertTrue(textEntity.getPages()
|
||||||
.stream()
|
.stream()
|
||||||
.allMatch(pageNode -> pageNode.getNumber() == 33));
|
.allMatch(pageNode -> pageNode.getNumber() == 9));
|
||||||
assertInstanceOf(Headline.class, textEntity.getDeepestFullyContainingNode());
|
assertInstanceOf(Headline.class, textEntity.getDeepestFullyContainingNode());
|
||||||
|
|
||||||
assertSameOffsetInAllIntersectingNodes(searchTerm, textEntity);
|
assertSameOffsetInAllIntersectingNodes(searchTerm, textEntity);
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user