RED-7141: Adapted to layout parser using docstrum

This commit is contained in:
Dominique Eifländer 2024-03-05 13:06:36 +01:00
parent fa55917a89
commit b93f9a2c20
13 changed files with 90 additions and 30621 deletions

View File

@ -12,7 +12,7 @@ plugins {
description = "redaction-service-server-v1" description = "redaction-service-server-v1"
val layoutParserVersion = "0.94.0" val layoutParserVersion = "0.96.0"
val jacksonVersion = "2.15.2" val jacksonVersion = "2.15.2"
val droolsVersion = "9.44.0.Final" val droolsVersion = "9.44.0.Final"
val pdfBoxVersion = "3.0.0" val pdfBoxVersion = "3.0.0"

View File

@ -57,7 +57,7 @@ public class Document implements GenericSemanticNode {
public TextBlock getTextBlock() { public TextBlock getTextBlock() {
if (textBlock == null) { if (textBlock == null) {
textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector()); textBlock = GenericSemanticNode.super.getTextBlock();
} }
return textBlock; return textBlock;
} }
@ -72,8 +72,7 @@ public class Document implements GenericSemanticNode {
public Stream<TextBlock> streamTerminalTextBlocksInOrder() { public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
return streamAllNodes().filter(SemanticNode::isLeaf) return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock);
.map(SemanticNode::getLeafTextBlock);
} }

View File

@ -0,0 +1,34 @@
package com.iqser.red.service.redaction.v1.server.model.document.nodes;

import java.util.stream.Stream;

import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;

import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.SuperBuilder;

/**
 * A {@link Paragraph} that carries an additional, unsorted duplicate text block.
 * Presumably produced when the layout parser reports the same paragraph content
 * twice (marked via the unsorted-textblock-id node property) — TODO confirm
 * against the layout parser contract. The duplicate block is attached after
 * construction by the document graph mapper via the Lombok-generated setter.
 */
@Data
@EqualsAndHashCode(callSuper = true)
@SuperBuilder
public class DuplicatedParagraph extends Paragraph {

	// Text block of the duplicated (unsorted) occurrence; null until the
	// graph mapper sets it. Private by default — access goes through the
	// Lombok-generated getter/setter.
	private TextBlock unsortedLeafTextBlock;

	/**
	 * Combines the inherited leaf text block with the unsorted duplicate's
	 * text block into a single {@link TextBlock}.
	 *
	 * @return the combined text block of both occurrences
	 */
	@Override
	public TextBlock getTextBlock() {
		return Stream.of(leafTextBlock, unsortedLeafTextBlock).collect(new TextBlockCollector());
	}

	/**
	 * Intentionally delegates to the superclass. Without this explicit
	 * override, Lombok's {@code @Data} would generate a {@code toString()}
	 * that prints the text block fields of this subclass.
	 */
	@Override
	public String toString() {
		return super.toString();
	}
}

View File

@ -17,11 +17,12 @@ import lombok.Builder;
import lombok.Data; import lombok.Data;
import lombok.EqualsAndHashCode; import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults; import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data @Data
@Builder @SuperBuilder
@AllArgsConstructor @AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE) @FieldDefaults(level = AccessLevel.PROTECTED)
@EqualsAndHashCode(onlyExplicitlyIncluded = true) @EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Paragraph implements GenericSemanticNode { public class Paragraph implements GenericSemanticNode {

View File

@ -69,9 +69,7 @@ public class Section implements GenericSemanticNode {
public TextBlock getTextBlock() { public TextBlock getTextBlock() {
if (textBlock == null) { if (textBlock == null) {
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf) textBlock = GenericSemanticNode.super.getTextBlock();
.map(SemanticNode::getLeafTextBlock)
.collect(new TextBlockCollector());
} }
return textBlock; return textBlock;
} }

View File

@ -19,6 +19,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity; import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock; import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock; import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
import com.iqser.red.service.redaction.v1.server.service.document.NodeVisitor; import com.iqser.red.service.redaction.v1.server.service.document.NodeVisitor;
import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations; import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations;
import com.iqser.red.service.redaction.v1.server.utils.RedactionSearchUtility; import com.iqser.red.service.redaction.v1.server.utils.RedactionSearchUtility;
@ -41,7 +42,11 @@ public interface SemanticNode {
* *
* @return TextBlock containing all AtomicTextBlocks that are located under this Node. * @return TextBlock containing all AtomicTextBlocks that are located under this Node.
*/ */
TextBlock getTextBlock(); default TextBlock getTextBlock() {
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector());
}
/** /**

View File

@ -408,9 +408,7 @@ public class Table implements SemanticNode {
public TextBlock getTextBlock() { public TextBlock getTextBlock() {
if (textBlock == null) { if (textBlock == null) {
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf) textBlock = SemanticNode.super.getTextBlock();
.map(SemanticNode::getLeafTextBlock)
.collect(new TextBlockCollector());
} }
return textBlock; return textBlock;
} }

View File

@ -9,6 +9,7 @@ import java.util.NoSuchElementException;
import java.util.Set; import java.util.Set;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree; import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.DuplicatedParagraph;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
@ -66,7 +67,7 @@ public class DocumentGraphMapper {
SemanticNode node = switch (entryData.getType()) { SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context); case SECTION -> buildSection(context);
case PARAGRAPH -> buildParagraph(context); case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
case HEADLINE -> buildHeadline(context); case HEADLINE -> buildHeadline(context);
case HEADER -> buildHeader(context); case HEADER -> buildHeader(context);
case FOOTER -> buildFooter(context); case FOOTER -> buildFooter(context);
@ -148,13 +149,22 @@ public class DocumentGraphMapper {
} }
private Paragraph buildParagraph(Context context) { private Paragraph buildParagraph(Context context, Map<String, String> properties) {
if (PropertiesMapper.isDuplicateParagraph(properties)) {
DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
return duplicatedParagraph;
}
return Paragraph.builder().documentTree(context.documentTree).build(); return Paragraph.builder().documentTree(context.documentTree).build();
} }
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) { private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
return Arrays.stream(atomicTextBlockIds) return Arrays.stream(atomicTextBlockIds)
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId)) .map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))

View File

@ -49,4 +49,23 @@ public class PropertiesMapper {
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3)); return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
} }
/**
 * Tells whether the given node properties mark a paragraph as a duplicate,
 * i.e. whether the unsorted-textblock-id property is present.
 *
 * @param properties node properties from the parsed document structure
 * @return {@code true} if the paragraph is flagged as a duplicate
 */
public static boolean isDuplicateParagraph(Map<String, String> properties) {
	String duplicateMarker = DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID;
	return properties.containsKey(duplicateMarker);
}
/**
 * Extracts the ids of the unsorted duplicate text blocks from the node
 * properties and parses them into {@link Long} values.
 *
 * @param properties node properties containing the unsorted-textblock-id entry
 * @return the parsed text block ids
 */
public static Long[] getUnsortedTextblockIds(Map<String, String> properties) {
	String rawIdList = properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
	return toLongArray(rawIdList);
}
/**
 * Parses a bracketed, comma-separated id list (e.g. {@code "[1, 2, 3]"})
 * into a {@code Long[]}.
 *
 * Fixes two defects of the previous implementation: elements are now trimmed
 * individually (a single outer {@code trim()} left inner spaces in place, and
 * {@code Long.valueOf(" 2")} throws {@link NumberFormatException}), and an
 * empty list ({@code "[]"} or {@code "[ ]"}) now yields an empty array
 * instead of failing on the empty string produced by {@code split}.
 *
 * @param ids bracketed id list; must contain at least the two bracket characters
 * @return the parsed ids, never {@code null}
 * @throws NumberFormatException if an element is not a valid long
 */
public static Long[] toLongArray(String ids) {
	String inner = ids.substring(1, ids.length() - 1).trim();
	if (inner.isEmpty()) {
		return new Long[0];
	}
	return Arrays.stream(inner.split(","))
			.map(String::trim)
			.map(Long::valueOf)
			.toArray(Long[]::new);
}
} }

View File

@ -246,6 +246,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
@Test @Test
@SneakyThrows
public void redactionExpansionOverlap() { public void redactionExpansionOverlap() {
// F. Lastname, J. Doe, M. Mustermann // F. Lastname, J. Doe, M. Mustermann
@ -297,7 +298,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
@Test @Test
public void titleExtraction() throws IOException { public void titleExtraction() throws IOException {
AnalyzeRequest request = uploadFileToStorage("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); AnalyzeRequest request = uploadFileToStorage("files/new/crafted document.pdf");
System.out.println("Start Full integration test"); System.out.println("Start Full integration test");
analyzeDocumentStructure(LayoutParsingType.REDACT_MANAGER, request); analyzeDocumentStructure(LayoutParsingType.REDACT_MANAGER, request);
System.out.println("Finished structure analysis"); System.out.println("Finished structure analysis");

View File

@ -272,7 +272,7 @@ public class DocumentIEntityInsertionIntegrationTest extends BuildDocumentIntegr
public void assertTextBeforeAndTextAfterForHeadlineMetolachlor() { public void assertTextBeforeAndTextAfterForHeadlineMetolachlor() {
Document document = buildGraph("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); Document document = buildGraph("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06");
String searchTerm = "absorption, distribution, metabolism"; String searchTerm = "of subject matter";
int start = document.getTextBlock().indexOf(searchTerm); int start = document.getTextBlock().indexOf(searchTerm);
assert start != -1; assert start != -1;
start = document.getTextBlock().indexOf(searchTerm, start + 1); start = document.getTextBlock().indexOf(searchTerm, start + 1);
@ -282,16 +282,16 @@ public class DocumentIEntityInsertionIntegrationTest extends BuildDocumentIntegr
TextEntity textEntity = TextEntity.initialEntityNode(textRange, "123", EntityType.ENTITY, document); TextEntity textEntity = TextEntity.initialEntityNode(textRange, "123", EntityType.ENTITY, document);
entityCreationService.addEntityToGraph(textEntity, document); entityCreationService.addEntityToGraph(textEntity, document);
assertEquals("2.6.1 Summary of ", textEntity.getTextBefore()); assertEquals("1 Statement ", textEntity.getTextBefore());
assertEquals(" and excretion in", textEntity.getTextAfter()); assertEquals(" and purpose for", textEntity.getTextAfter());
assertEquals("2.6.1 Summary of absorption, distribution, metabolism and excretion in mammals ", assertEquals("1 Statement of subject matter and purpose for which this report has been prepared and background information on the application ",
textEntity.getDeepestFullyContainingNode().getHeadline().getTextBlock().getSearchText()); textEntity.getDeepestFullyContainingNode().getHeadline().getTextBlock().getSearchText());
assertEquals(searchTerm, textEntity.getValue()); assertEquals(searchTerm, textEntity.getValue());
assertEquals(3, textEntity.getIntersectingNodes().size()); assertEquals(3, textEntity.getIntersectingNodes().size());
assertEquals(4, textEntity.getDeepestFullyContainingNode().getNumberOnPage()); assertEquals(1, textEntity.getDeepestFullyContainingNode().getNumberOnPage());
assertTrue(textEntity.getPages() assertTrue(textEntity.getPages()
.stream() .stream()
.allMatch(pageNode -> pageNode.getNumber() == 33)); .allMatch(pageNode -> pageNode.getNumber() == 9));
assertInstanceOf(Headline.class, textEntity.getDeepestFullyContainingNode()); assertInstanceOf(Headline.class, textEntity.getDeepestFullyContainingNode());
assertSameOffsetInAllIntersectingNodes(searchTerm, textEntity); assertSameOffsetInAllIntersectingNodes(searchTerm, textEntity);