RED-7141: Adapted to layout parser using docstrum

This commit is contained in:
Dominique Eifländer 2024-03-05 13:06:36 +01:00
parent fa55917a89
commit b93f9a2c20
13 changed files with 90 additions and 30621 deletions

View File

@ -12,7 +12,7 @@ plugins {
description = "redaction-service-server-v1"
val layoutParserVersion = "0.94.0"
val layoutParserVersion = "0.96.0"
val jacksonVersion = "2.15.2"
val droolsVersion = "9.44.0.Final"
val pdfBoxVersion = "3.0.0"

View File

@ -57,7 +57,7 @@ public class Document implements GenericSemanticNode {
public TextBlock getTextBlock() {
if (textBlock == null) {
textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
textBlock = GenericSemanticNode.super.getTextBlock();
}
return textBlock;
}
@ -72,8 +72,7 @@ public class Document implements GenericSemanticNode {
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
return streamAllNodes().filter(SemanticNode::isLeaf)
.map(SemanticNode::getLeafTextBlock);
return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock);
}

View File

@ -0,0 +1,34 @@
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
import java.util.stream.Stream;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.SuperBuilder;
/**
 * A {@link Paragraph} that holds a second leaf text block, {@code unsortedLeafTextBlock},
 * next to {@code leafTextBlock} (presumably the field inherited from {@link Paragraph} - confirm there).
 * <p>
 * {@link #getTextBlock()} exposes both blocks as a single {@link TextBlock}.
 */
@Data
@EqualsAndHashCode(callSuper = true)
@SuperBuilder
public class DuplicatedParagraph extends Paragraph {

    // Second text block of this paragraph; populated through the setter after the node has been built.
    TextBlock unsortedLeafTextBlock;


    /**
     * Combines {@code leafTextBlock} and {@code unsortedLeafTextBlock}, in that order, using a
     * {@link TextBlockCollector}. The result is not cached: every call collects a new block.
     *
     * @return the collected TextBlock covering both leaf text blocks
     */
    @Override
    public TextBlock getTextBlock() {

        final Stream<TextBlock> bothLeafBlocks = Stream.of(leafTextBlock, unsortedLeafTextBlock);
        return bothLeafBlocks.collect(new TextBlockCollector());
    }


    /**
     * Delegates to the parent's string representation.
     *
     * @return the value of {@code super.toString()}
     */
    @Override
    public String toString() {

        return super.toString();
    }

}

View File

@ -17,11 +17,12 @@ import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@Builder
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@FieldDefaults(level = AccessLevel.PROTECTED)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Paragraph implements GenericSemanticNode {

View File

@ -69,9 +69,7 @@ public class Section implements GenericSemanticNode {
public TextBlock getTextBlock() {
if (textBlock == null) {
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf)
.map(SemanticNode::getLeafTextBlock)
.collect(new TextBlockCollector());
textBlock = GenericSemanticNode.super.getTextBlock();
}
return textBlock;
}

View File

@ -19,6 +19,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
import com.iqser.red.service.redaction.v1.server.service.document.NodeVisitor;
import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations;
import com.iqser.red.service.redaction.v1.server.utils.RedactionSearchUtility;
@ -41,7 +42,11 @@ public interface SemanticNode {
*
* @return TextBlock containing all AtomicTextBlocks that are located under this Node.
*/
TextBlock getTextBlock();
default TextBlock getTextBlock() {

    // Default implementation: take every leaf among the sub nodes, ask each leaf for its own
    // text block and hand the results to a TextBlockCollector.
    return streamAllSubNodes()
            .filter(node -> node.isLeaf())
            .map(leaf -> leaf.getTextBlock())
            .collect(new TextBlockCollector());
}
/**

View File

@ -408,9 +408,7 @@ public class Table implements SemanticNode {
public TextBlock getTextBlock() {
if (textBlock == null) {
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf)
.map(SemanticNode::getLeafTextBlock)
.collect(new TextBlockCollector());
textBlock = SemanticNode.super.getTextBlock();
}
return textBlock;
}

View File

@ -9,6 +9,7 @@ import java.util.NoSuchElementException;
import java.util.Set;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.DuplicatedParagraph;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
@ -66,7 +67,7 @@ public class DocumentGraphMapper {
SemanticNode node = switch (entryData.getType()) {
case SECTION -> buildSection(context);
case PARAGRAPH -> buildParagraph(context);
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
case HEADLINE -> buildHeadline(context);
case HEADER -> buildHeader(context);
case FOOTER -> buildFooter(context);
@ -148,13 +149,22 @@ public class DocumentGraphMapper {
}
private Paragraph buildParagraph(Context context) {
/**
 * Builds the paragraph node for a PARAGRAPH entry.
 * <p>
 * When the entry properties mark the paragraph as duplicated, a {@link DuplicatedParagraph} is
 * created and its unsorted leaf text block is resolved from the ids stored in the properties;
 * otherwise a plain {@link Paragraph} is returned.
 *
 * @param context    mapping context supplying the document tree
 * @param properties properties of the entry being mapped
 * @return the newly built paragraph node
 */
private Paragraph buildParagraph(Context context, Map<String, String> properties) {

    // Common case first: no duplicate marker, so a regular paragraph is enough.
    if (!PropertiesMapper.isDuplicateParagraph(properties)) {
        return Paragraph.builder().documentTree(context.documentTree).build();
    }

    DuplicatedParagraph paragraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
    Long[] unsortedIds = PropertiesMapper.getUnsortedTextblockIds(properties);
    // The node must exist before its text block is resolved, because it is passed on as the parent.
    TextBlock unsortedTextBlock = toTextBlock(unsortedIds, context, paragraph);
    paragraph.setUnsortedLeafTextBlock(unsortedTextBlock);
    return paragraph;
}
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
return Arrays.stream(atomicTextBlockIds)
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))

View File

@ -49,4 +49,23 @@ public class PropertiesMapper {
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
}
/**
 * Tells whether the given node properties mark a paragraph as duplicated, i.e. whether they
 * contain the {@code UNSORTED_TEXTBLOCK_ID} key.
 *
 * @param properties properties of a paragraph entry; may be {@code null}
 * @return {@code true} if the key is present, {@code false} otherwise or when {@code properties} is {@code null}
 */
public static boolean isDuplicateParagraph(Map<String, String> properties) {

    // Entries without any properties are treated as regular paragraphs instead of failing with a NullPointerException.
    return properties != null && properties.containsKey(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
}
/**
 * Reads the value stored under the {@code UNSORTED_TEXTBLOCK_ID} key and parses it into ids.
 *
 * @param properties properties of a paragraph entry containing the {@code UNSORTED_TEXTBLOCK_ID} key
 * @return the ids parsed by {@link #toLongArray(String)}
 */
public static Long[] getUnsortedTextblockIds(Map<String, String> properties) {

    String serializedIds = properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
    return toLongArray(serializedIds);
}
/**
 * Parses a serialized id list such as {@code "[1,2,3]"} or {@code "[1, 2, 3]"} into a {@code Long[]}.
 * <p>
 * The first and the last character are dropped (expected to be the enclosing brackets -
 * NOTE(review): confirm the exact format with the producer of this property), the remainder is
 * split at commas and every token is trimmed before it is converted.
 *
 * @param ids the serialized list, including its enclosing delimiters
 * @return the parsed ids in their original order; an empty array for an empty list such as {@code "[]"}
 * @throws NumberFormatException if a token is not a valid long value
 */
public static Long[] toLongArray(String ids) {

    String commaSeparated = ids.substring(1, ids.length() - 1).trim();
    if (commaSeparated.isEmpty()) {
        // "[]" would otherwise yield a single empty token and fail in Long.valueOf.
        return new Long[0];
    }
    return Arrays.stream(commaSeparated.split(","))
            // Long.valueOf does not tolerate surrounding whitespace, e.g. the " 2" in "[1, 2]".
            .map(String::trim)
            .map(Long::valueOf)
            .toArray(Long[]::new);
}
}

View File

@ -246,6 +246,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
@Test
@SneakyThrows
public void redactionExpansionOverlap() {
// F. Lastname, J. Doe, M. Mustermann
@ -297,7 +298,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
@Test
public void titleExtraction() throws IOException {
AnalyzeRequest request = uploadFileToStorage("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
AnalyzeRequest request = uploadFileToStorage("files/new/crafted document.pdf");
System.out.println("Start Full integration test");
analyzeDocumentStructure(LayoutParsingType.REDACT_MANAGER, request);
System.out.println("Finished structure analysis");

View File

@ -272,7 +272,7 @@ public class DocumentIEntityInsertionIntegrationTest extends BuildDocumentIntegr
public void assertTextBeforeAndTextAfterForHeadlineMetolachlor() {
Document document = buildGraph("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06");
String searchTerm = "absorption, distribution, metabolism";
String searchTerm = "of subject matter";
int start = document.getTextBlock().indexOf(searchTerm);
assert start != -1;
start = document.getTextBlock().indexOf(searchTerm, start + 1);
@ -282,16 +282,16 @@ public class DocumentIEntityInsertionIntegrationTest extends BuildDocumentIntegr
TextEntity textEntity = TextEntity.initialEntityNode(textRange, "123", EntityType.ENTITY, document);
entityCreationService.addEntityToGraph(textEntity, document);
assertEquals("2.6.1 Summary of ", textEntity.getTextBefore());
assertEquals(" and excretion in", textEntity.getTextAfter());
assertEquals("2.6.1 Summary of absorption, distribution, metabolism and excretion in mammals ",
assertEquals("1 Statement ", textEntity.getTextBefore());
assertEquals(" and purpose for", textEntity.getTextAfter());
assertEquals("1 Statement of subject matter and purpose for which this report has been prepared and background information on the application ",
textEntity.getDeepestFullyContainingNode().getHeadline().getTextBlock().getSearchText());
assertEquals(searchTerm, textEntity.getValue());
assertEquals(3, textEntity.getIntersectingNodes().size());
assertEquals(4, textEntity.getDeepestFullyContainingNode().getNumberOnPage());
assertEquals(1, textEntity.getDeepestFullyContainingNode().getNumberOnPage());
assertTrue(textEntity.getPages()
.stream()
.allMatch(pageNode -> pageNode.getNumber() == 33));
.allMatch(pageNode -> pageNode.getNumber() == 9));
assertInstanceOf(Headline.class, textEntity.getDeepestFullyContainingNode());
assertSameOffsetInAllIntersectingNodes(searchTerm, textEntity);