RED-7141: Adapted to layout parser using docstrum #312
@ -12,7 +12,7 @@ plugins {
|
||||
description = "redaction-service-server-v1"
|
||||
|
||||
|
||||
val layoutParserVersion = "0.94.0"
|
||||
val layoutParserVersion = "0.96.0"
|
||||
val jacksonVersion = "2.15.2"
|
||||
val droolsVersion = "9.44.0.Final"
|
||||
val pdfBoxVersion = "3.0.0"
|
||||
|
||||
@ -57,7 +57,7 @@ public class Document implements GenericSemanticNode {
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
|
||||
textBlock = GenericSemanticNode.super.getTextBlock();
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
@ -72,8 +72,7 @@ public class Document implements GenericSemanticNode {
|
||||
|
||||
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
|
||||
|
||||
return streamAllNodes().filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getLeafTextBlock);
|
||||
return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,34 @@
|
||||
package com.iqser.red.service.redaction.v1.server.model.document.nodes;
|
||||
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@SuperBuilder
|
||||
public class DuplicatedParagraph extends Paragraph {
|
||||
|
||||
TextBlock unsortedLeafTextBlock;
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return Stream.of(leafTextBlock, unsortedLeafTextBlock).collect(new TextBlockCollector());
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return super.toString();
|
||||
}
|
||||
|
||||
}
|
||||
@ -17,11 +17,12 @@ import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@FieldDefaults(level = AccessLevel.PROTECTED)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class Paragraph implements GenericSemanticNode {
|
||||
|
||||
|
||||
@ -69,9 +69,7 @@ public class Section implements GenericSemanticNode {
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getLeafTextBlock)
|
||||
.collect(new TextBlockCollector());
|
||||
textBlock = GenericSemanticNode.super.getTextBlock();
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
@ -19,6 +19,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.entity.TextEntity;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.AtomicTextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBlockCollector;
|
||||
import com.iqser.red.service.redaction.v1.server.service.document.NodeVisitor;
|
||||
import com.iqser.red.service.redaction.v1.server.utils.RectangleTransformations;
|
||||
import com.iqser.red.service.redaction.v1.server.utils.RedactionSearchUtility;
|
||||
@ -41,7 +42,11 @@ public interface SemanticNode {
|
||||
*
|
||||
* @return TextBlock containing all AtomicTextBlocks that are located under this Node.
|
||||
*/
|
||||
TextBlock getTextBlock();
|
||||
default TextBlock getTextBlock() {
|
||||
|
||||
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
|
||||
@ -408,9 +408,7 @@ public class Table implements SemanticNode {
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getLeafTextBlock)
|
||||
.collect(new TextBlockCollector());
|
||||
textBlock = SemanticNode.super.getTextBlock();
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
@ -9,6 +9,7 @@ import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.DuplicatedParagraph;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Footer;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Header;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Image;
|
||||
@ -66,7 +67,7 @@ public class DocumentGraphMapper {
|
||||
|
||||
SemanticNode node = switch (entryData.getType()) {
|
||||
case SECTION -> buildSection(context);
|
||||
case PARAGRAPH -> buildParagraph(context);
|
||||
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
|
||||
case HEADLINE -> buildHeadline(context);
|
||||
case HEADER -> buildHeader(context);
|
||||
case FOOTER -> buildFooter(context);
|
||||
@ -148,13 +149,22 @@ public class DocumentGraphMapper {
|
||||
}
|
||||
|
||||
|
||||
private Paragraph buildParagraph(Context context) {
|
||||
private Paragraph buildParagraph(Context context, Map<String, String> properties) {
|
||||
|
||||
if (PropertiesMapper.isDuplicateParagraph(properties)) {
|
||||
|
||||
DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build();
|
||||
|
||||
Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties);
|
||||
duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph));
|
||||
return duplicatedParagraph;
|
||||
}
|
||||
|
||||
return Paragraph.builder().documentTree(context.documentTree).build();
|
||||
}
|
||||
|
||||
|
||||
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
|
||||
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
|
||||
|
||||
return Arrays.stream(atomicTextBlockIds)
|
||||
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
|
||||
|
||||
@ -49,4 +49,23 @@ public class PropertiesMapper {
|
||||
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||
}
|
||||
|
||||
|
||||
public static boolean isDuplicateParagraph(Map<String, String> properties) {
|
||||
|
||||
return properties.containsKey(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID);
|
||||
}
|
||||
|
||||
|
||||
public static Long[] getUnsortedTextblockIds(Map<String, String> properties) {
|
||||
|
||||
return toLongArray(properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID));
|
||||
}
|
||||
|
||||
|
||||
public static Long[] toLongArray(String ids) {
|
||||
|
||||
return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(",")).map(Long::valueOf).toArray(Long[]::new);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -246,6 +246,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void redactionExpansionOverlap() {
|
||||
|
||||
// F. Lastname, J. Doe, M. Mustermann
|
||||
@ -297,7 +298,7 @@ public class RedactionIntegrationTest extends AbstractRedactionIntegrationTest {
|
||||
@Test
|
||||
public void titleExtraction() throws IOException {
|
||||
|
||||
AnalyzeRequest request = uploadFileToStorage("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
AnalyzeRequest request = uploadFileToStorage("files/new/crafted document.pdf");
|
||||
System.out.println("Start Full integration test");
|
||||
analyzeDocumentStructure(LayoutParsingType.REDACT_MANAGER, request);
|
||||
System.out.println("Finished structure analysis");
|
||||
|
||||
@ -272,7 +272,7 @@ public class DocumentIEntityInsertionIntegrationTest extends BuildDocumentIntegr
|
||||
public void assertTextBeforeAndTextAfterForHeadlineMetolachlor() {
|
||||
|
||||
Document document = buildGraph("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06");
|
||||
String searchTerm = "absorption, distribution, metabolism";
|
||||
String searchTerm = "of subject matter";
|
||||
int start = document.getTextBlock().indexOf(searchTerm);
|
||||
assert start != -1;
|
||||
start = document.getTextBlock().indexOf(searchTerm, start + 1);
|
||||
@ -282,16 +282,16 @@ public class DocumentIEntityInsertionIntegrationTest extends BuildDocumentIntegr
|
||||
TextEntity textEntity = TextEntity.initialEntityNode(textRange, "123", EntityType.ENTITY, document);
|
||||
entityCreationService.addEntityToGraph(textEntity, document);
|
||||
|
||||
assertEquals("2.6.1 Summary of ", textEntity.getTextBefore());
|
||||
assertEquals(" and excretion in", textEntity.getTextAfter());
|
||||
assertEquals("2.6.1 Summary of absorption, distribution, metabolism and excretion in mammals ",
|
||||
assertEquals("1 Statement ", textEntity.getTextBefore());
|
||||
assertEquals(" and purpose for", textEntity.getTextAfter());
|
||||
assertEquals("1 Statement of subject matter and purpose for which this report has been prepared and background information on the application ",
|
||||
textEntity.getDeepestFullyContainingNode().getHeadline().getTextBlock().getSearchText());
|
||||
assertEquals(searchTerm, textEntity.getValue());
|
||||
assertEquals(3, textEntity.getIntersectingNodes().size());
|
||||
assertEquals(4, textEntity.getDeepestFullyContainingNode().getNumberOnPage());
|
||||
assertEquals(1, textEntity.getDeepestFullyContainingNode().getNumberOnPage());
|
||||
assertTrue(textEntity.getPages()
|
||||
.stream()
|
||||
.allMatch(pageNode -> pageNode.getNumber() == 33));
|
||||
.allMatch(pageNode -> pageNode.getNumber() == 9));
|
||||
assertInstanceOf(Headline.class, textEntity.getDeepestFullyContainingNode());
|
||||
|
||||
assertSameOffsetInAllIntersectingNodes(searchTerm, textEntity);
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user