RED-7074: Design Subsection section tree structure algorithm

This commit is contained in:
Maverick Studer 2024-06-04 15:07:40 +02:00
parent b6742c1e89
commit fc06dba2ce
7 changed files with 408 additions and 5 deletions

View File

@ -43,6 +43,11 @@ public class Document extends AbstractSemanticNode {
}
/**
* Gets the sections of the document as a list.
*
* @return A list of all sections within the document.
*/
public List<Section> getAllSections() {
return streamAllSubNodesOfType(NodeType.SECTION).map(node -> (Section) node)
@ -50,6 +55,34 @@ public class Document extends AbstractSemanticNode {
}
/**
 * Collects the children of this document that have node type SECTION, cast to {@link Section}.
 *
 * @return A list containing every child of type SECTION
 * @deprecated Scheduled for removal.
 * Prefer {@link #streamChildrenOfType(NodeType)},
 * or {@link #getChildrenOfTypeSectionOrSuperSection()}, which yields children of type SECTION as well as SUPER_SECTION.
 */
@Deprecated(forRemoval = true)
public List<Section> getMainSections() {

    // Only SECTION nodes are streamed here, so the cast to Section is expected to succeed for each element.
    return streamChildrenOfType(NodeType.SECTION)
            .map(Section.class::cast)
            .collect(Collectors.toList());
}
/**
 * Returns those children of the document whose node type is either SECTION or SUPER_SECTION.
 * The elements are returned as plain {@link SemanticNode} objects; no cast is applied.
 *
 * @return A list of all children of type SECTION or SUPER_SECTION.
 */
public List<SemanticNode> getChildrenOfTypeSectionOrSuperSection() {

    return streamChildren().filter(child -> {
                NodeType childType = child.getType();
                return childType.equals(NodeType.SECTION) || childType.equals(NodeType.SUPER_SECTION);
            })
            .toList();
}
public List<Header> getHeaders() {
return streamChildrenOfType(NodeType.HEADER).map(node -> (Header) node)

View File

@ -29,7 +29,7 @@ public class TOCEnrichmentService {
TableOfContents toc = document.getTableOfContents();
Iterator<TableOfContentItem> iterator = toc.iterator();
TableOfContentItem currentTOCItem = null;
if(iterator.hasNext()) {
if (iterator.hasNext()) {
currentTOCItem = iterator.next();
}
List<AbstractPageBlock> startBlocks = new ArrayList<>();
@ -79,7 +79,7 @@ public class TOCEnrichmentService {
currentTOCItem.getSectionBlocks().add(current);
currentPageTOCItems.add(currentTOCItem);
if(iterator.hasNext()) {
if (iterator.hasNext()) {
currentTOCItem = iterator.next();
}
} else if (!foundFirstHeadline) {
@ -149,9 +149,9 @@ public class TOCEnrichmentService {
}
}
if (!image.isAppendedToSection()) {
log.debug("Image uses first paragraph");
log.debug("Image uses last found section");
if (!lastFoundTOCItems.isEmpty()) {
lastFoundTOCItems.get(0).getImages().add(image);
lastFoundTOCItems.get(lastFoundTOCItems.size() - 1).getImages().add(image);
} else {
startImages.add(image);
}

View File

@ -0,0 +1,231 @@
package com.knecon.fforesight.service.layoutparser.server;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.File;
import java.nio.file.Path;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.function.Predicate;
import org.apache.commons.lang3.StringUtils;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import lombok.SneakyThrows;
/**
 * Integration test that parses a crafted PDF and verifies that its outline is detected,
 * converted into a table of contents and finally into a tree of {@link Section} and
 * {@link SuperSection} nodes.
 * <p>
 * All assertions follow the JUnit 5 convention {@code assertEquals(expected, actual)} so that
 * failure messages report the expected and the actual value the right way round.
 */
public class OutlineDetectionTest extends AbstractTest {

    ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
    LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);

    @Autowired
    protected LayoutParsingPipeline layoutParsingPipeline;


    /**
     * Parses the crafted outline document and checks, in order: the outline object tree,
     * the table of contents derived from it, the document graph built from the classification
     * result and the parents the images were attached to. Finally writes a viewer document
     * with the layout grid to {@code /tmp}.
     */
    @Test
    @SneakyThrows
    public void testOutlinesToSections() {

        String fileName = "files/new/crafted_outline_test_doc.pdf";
        String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";

        var documentFile = new ClassPathResource(fileName).getFile();

        long start = System.currentTimeMillis();
        ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER);

        // --- outline object tree ---
        OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
        assertEquals(8, outlineObjectTree.getRootNodes().size());
        assertOutlineObjectCountOnPage(outlineObjectTree, 1, 1);
        assertOutlineObjectCountOnPage(outlineObjectTree, 3, 1);
        assertOutlineObjectCountOnPage(outlineObjectTree, 4, 1);
        assertOutlineObjectCountOnPage(outlineObjectTree, 5, 1);
        assertOutlineObjectCountOnPage(outlineObjectTree, 6, 2);
        assertOutlineObjectCountOnPage(outlineObjectTree, 7, 3);
        assertOutlineObjectCountOnPage(outlineObjectTree, 8, 2);
        assertOutlineObjectCountOnPage(outlineObjectTree, 10, 1);
        assertOutlineObjectCountOnPage(outlineObjectTree, 11, 4);
        assertOutlineObjectCountOnPage(outlineObjectTree, 12, 1);
        assertOutlineObjectCountOnPage(outlineObjectTree, 13, 2);
        assertTrue(outlineObjectTree.getOutlineObjectsPerPage().values()
                           .stream()
                           .flatMap(Collection::stream)
                           .allMatch(OutlineObject::isFound));

        // --- table of contents ---
        TableOfContents tableOfContents = classificationDocument.getTableOfContents();
        var mainSections = tableOfContents.getMainSections();

        assertEquals(9, mainSections.size());
        // The first main section is skipped: only entries 1..8 are compared against the 8 outline root nodes.
        assertEquals(sanitizedRootTitles(outlineObjectTree),
                     mainSections.subList(1, 9)
                             .stream()
                             .map(tableOfContentItem -> sanitizeString(tableOfContentItem.getHeadline().toString()))
                             .toList());

        assertEquals(6, mainSections.get(5).getChildren().size());
        assertEquals(3, mainSections.get(7).getChildren().size());
        assertEquals(3, mainSections.get(8).getChildren().size());
        assertEquals(1,
                     mainSections.get(8).getChildren()
                             .get(2).getChildren().size());
        assertEquals(3,
                     mainSections.get(8).getChildren()
                             .get(2).getChildren()
                             .get(0).getChildren().size());

        assertEquals(1, mainSections.get(0).getImages().size());
        assertEquals(1, mainSections.get(6).getImages().size());
        assertEquals(1,
                     mainSections.get(8).getChildren()
                             .get(2).getChildren()
                             .get(0).getChildren()
                             .get(2).getImages().size());

        // --- document graph ---
        Document document = buildGraph(fileName, classificationDocument);

        assertTrue(tableOfContents.getAllTableOfContentItems()
                           .stream()
                           .allMatch(tableOfContentItem -> tableOfContentItem.getSection() != null));
        // Items without children must map to a Section, items with children to a SuperSection.
        assertTrue(tableOfContents.getAllTableOfContentItems()
                           .stream()
                           .filter(tableOfContentItem -> tableOfContentItem.getChildren().isEmpty())
                           .allMatch(tableOfContentItem -> tableOfContentItem.getSection() instanceof Section));
        assertTrue(tableOfContents.getAllTableOfContentItems()
                           .stream()
                           .filter(tableOfContentItem -> !tableOfContentItem.getChildren().isEmpty())
                           .allMatch(tableOfContentItem -> tableOfContentItem.getSection() instanceof SuperSection));

        List<SemanticNode> childrenOfTypeSectionOrSuperSection = document.getChildrenOfTypeSectionOrSuperSection();

        assertEquals(9, childrenOfTypeSectionOrSuperSection.size());
        assertEquals(sanitizedRootTitles(outlineObjectTree),
                     childrenOfTypeSectionOrSuperSection.subList(1, 9)
                             .stream()
                             .map(section -> sanitizeString(section.getHeadline().getLeafTextBlock().toString()))
                             .toList());

        Predicate<SemanticNode> isSectionOrSuperSection = semanticNode -> semanticNode instanceof Section || semanticNode instanceof SuperSection;
        assertEquals(6 + 1, // 1 additional for main text of parent section
                     childrenOfTypeSectionOrSuperSection.get(5).streamChildren()
                             .filter(isSectionOrSuperSection)
                             .count());
        assertEquals(3 + 1,
                     childrenOfTypeSectionOrSuperSection.get(7).streamChildren()
                             .filter(isSectionOrSuperSection)
                             .count());
        assertEquals(3 + 1,
                     childrenOfTypeSectionOrSuperSection.get(8).streamChildren()
                             .filter(isSectionOrSuperSection)
                             .count());
        assertEquals(1 + 1,
                     childrenOfTypeSectionOrSuperSection.get(8).streamChildren()
                             .filter(isSectionOrSuperSection)
                             .toList()
                             .get(3).streamChildren()
                             .filter(isSectionOrSuperSection)
                             .count());
        assertEquals(3 + 1,
                     childrenOfTypeSectionOrSuperSection.get(8).streamChildren()
                             .filter(isSectionOrSuperSection)
                             .toList()
                             .get(3).streamChildren()
                             .filter(isSectionOrSuperSection)
                             .toList()
                             .get(1).streamChildren()
                             .filter(isSectionOrSuperSection)
                             .count());

        // --- image placement: tree id of the parent node each image was attached to ---
        List<List<Integer>> imageTreeIdList = document.streamAllImages()
                .map(image -> image.getParent().getTreeId())
                .toList();

        assertEquals(List.of(0), imageTreeIdList.get(0));
        assertEquals(List.of(6), imageTreeIdList.get(1));
        assertEquals(List.of(8, 4, 2, 4), imageTreeIdList.get(2));

        layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
        System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
    }


    /**
     * Asserts how many outline objects were registered for the given page.
     *
     * @param outlineObjectTree the tree whose per-page outline objects are inspected
     * @param page              the page key looked up in {@code getOutlineObjectsPerPage()}
     * @param expectedCount     the expected number of outline objects on that page
     */
    private static void assertOutlineObjectCountOnPage(OutlineObjectTree outlineObjectTree, int page, int expectedCount) {

        assertEquals(expectedCount,
                     outlineObjectTree.getOutlineObjectsPerPage()
                             .get(page).size(),
                     "Unexpected number of outline objects on page " + page);
    }


    /**
     * Returns the sanitized titles of the outline's root nodes, in the order the tree reports them.
     *
     * @param outlineObjectTree the tree whose root nodes are read
     * @return the sanitized root titles
     */
    private static List<String> sanitizedRootTitles(OutlineObjectTree outlineObjectTree) {

        return outlineObjectTree.getRootNodes()
                .stream()
                .map(outlineObjectTreeNode -> sanitizeString(outlineObjectTreeNode.getOutlineObject().getTitle()))
                .toList();
    }


    /**
     * Normalizes a string for comparison by removing all whitespace and lower-casing it
     * independently of the default locale.
     *
     * @param text the text to normalize
     * @return the normalized text
     */
    private static String sanitizeString(String text) {

        return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT);
    }


    /**
     * Prepares the storage with an empty table-parsing response and the image-service response
     * of the crafted outline document, then runs the layout parsing pipeline on the given
     * classpath file.
     *
     * @param filename          classpath location of the PDF to parse
     * @param layoutParsingType the parsing type passed to the pipeline
     * @return the classification document produced by the pipeline
     */
    @SneakyThrows
    protected ClassificationDocument parseLayout(String filename, LayoutParsingType layoutParsingType) {

        File fileResource = new ClassPathResource(filename).getFile();
        prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/crafted_outline_test_doc.IMAGE_INFO.json");

        return layoutParsingPipeline.parseLayout(layoutParsingType,
                                                 fileResource,
                                                 layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
                                                 new TableServiceResponse(),
                                                 new VisualLayoutParsingResponse(),
                                                 Map.of("file", filename, "debug", "true"));
    }


    /**
     * Builds the document graph.
     * <p>
     * If {@code filename} starts with {@code "/"} (and not with {@code "files"}), the file is
     * read from that path and parsed again, ignoring {@code classificationDocument}. Otherwise
     * the graph is built directly from the given {@code classificationDocument}.
     *
     * @param filename               classpath location or absolute path of the PDF
     * @param classificationDocument an already parsed document, used for classpath files
     * @return the document graph
     */
    @SneakyThrows
    protected Document buildGraph(String filename, ClassificationDocument classificationDocument) {

        if (!filename.startsWith("files") && filename.startsWith("/")) {
            LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true);
            prepareStorage(layoutParsingRequest, new File(filename));
            return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
                                                           layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
                                                                                             new File(filename),
                                                                                             layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
                                                                                                                                               .get()),
                                                                                             new TableServiceResponse(),
                                                                                             new VisualLayoutParsingResponse(),
                                                                                             layoutParsingRequest.identifier()));
        } else {
            prepareStorage(filename);
            return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
        }
    }

}

View File

@ -31,7 +31,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/syngenta/CustomerFiles/SinglePages/S4_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String fileName = "files/new/crafted_outline_test_doc.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";

View File

@ -48,6 +48,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
@SneakyThrows
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
if (!filename.startsWith("files") && filename.startsWith("/")) {
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true);
prepareStorage(layoutParsingRequest, new File(filename));

View File

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8afb731a307e1a3f827c59e902164b10bdabef96e14193b949fe081cd3aa859f
size 168878

View File

@ -0,0 +1,135 @@
{
"dossierId": "a91f19ff-11ba-4735-9f60-c650243f64a9",
"fileId": "6e8c5f114e2b71e103a32a20c5273188",
"targetFileExtension": "ORIGIN.pdf.gz",
"responseFileExtension": "IMAGE_INFO.json.gz",
"X-TENANT-ID": "redaction",
"data": [
{
"classification": {
"label": "other",
"probabilities": {
"other": 0.9126,
"formula": 0.0588,
"signature": 0.0261,
"logo": 0.0024
}
},
"representation": "70E1070C1030E081B7EF7FFFF",
"position": {
"x1": 61,
"x2": 394,
"y1": 155,
"y2": 470,
"pageNumber": 1
},
"geometry": {
"width": 333,
"height": 315
},
"alpha": false,
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.5976,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 1.0571,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"label": "other",
"probabilities": {
"other": 0.9126,
"formula": 0.0588,
"signature": 0.0261,
"logo": 0.0024
}
},
"representation": "70E1070C1030E081B7EF7FFFF",
"position": {
"x1": 61,
"x2": 394,
"y1": 202,
"y2": 517,
"pageNumber": 11
},
"geometry": {
"width": 333,
"height": 315
},
"alpha": false,
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.5976,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 1.0571,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"label": "other",
"probabilities": {
"other": 0.9126,
"formula": 0.0588,
"signature": 0.0261,
"logo": 0.0024
}
},
"representation": "70E1070C1030E081B7EF7FFFF",
"position": {
"x1": 47,
"x2": 379,
"y1": 289,
"y2": 604,
"pageNumber": 16
},
"geometry": {
"width": 332,
"height": 315
},
"alpha": false,
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.5967,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 1.054,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
}
]
}