Merge branch 'RED-7074-test' into 'main'
RED-7074: Design Subsection section tree structure algorithm See merge request fforesight/layout-parser!162
This commit is contained in:
commit
c3edeb3c7d
@ -43,6 +43,11 @@ public class Document extends AbstractSemanticNode {
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the sections of the document as a list.
|
||||
*
|
||||
* @return A list of all sections within the document.
|
||||
*/
|
||||
public List<Section> getAllSections() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.SECTION).map(node -> (Section) node)
|
||||
@ -50,6 +55,34 @@ public class Document extends AbstractSemanticNode {
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the main sections of the document as a list.
|
||||
*
|
||||
* @return A list of main sections within the document
|
||||
* @deprecated This method is marked for removal.
|
||||
* Use {@link #streamChildrenOfType(NodeType)} instead,
|
||||
* or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION.
|
||||
*/
|
||||
@Deprecated(forRemoval = true)
|
||||
public List<Section> getMainSections() {
|
||||
|
||||
return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the direct children of type SECTION or SUPER_SECTION of the document as a list of SemanticNode objects.
|
||||
*
|
||||
* @return A list of all children of type SECTION or SUPER_SECTION.
|
||||
*/
|
||||
public List<SemanticNode> getChildrenOfTypeSectionOrSuperSection() {
|
||||
|
||||
return streamChildren().filter(semanticNode -> semanticNode.getType().equals(NodeType.SECTION) || semanticNode.getType().equals(NodeType.SUPER_SECTION))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
public List<Header> getHeaders() {
|
||||
|
||||
return streamChildrenOfType(NodeType.HEADER).map(node -> (Header) node)
|
||||
|
||||
@ -29,7 +29,7 @@ public class TOCEnrichmentService {
|
||||
TableOfContents toc = document.getTableOfContents();
|
||||
Iterator<TableOfContentItem> iterator = toc.iterator();
|
||||
TableOfContentItem currentTOCItem = null;
|
||||
if(iterator.hasNext()) {
|
||||
if (iterator.hasNext()) {
|
||||
currentTOCItem = iterator.next();
|
||||
}
|
||||
List<AbstractPageBlock> startBlocks = new ArrayList<>();
|
||||
@ -79,7 +79,7 @@ public class TOCEnrichmentService {
|
||||
currentTOCItem.getSectionBlocks().add(current);
|
||||
currentPageTOCItems.add(currentTOCItem);
|
||||
|
||||
if(iterator.hasNext()) {
|
||||
if (iterator.hasNext()) {
|
||||
currentTOCItem = iterator.next();
|
||||
}
|
||||
} else if (!foundFirstHeadline) {
|
||||
@ -149,9 +149,9 @@ public class TOCEnrichmentService {
|
||||
}
|
||||
}
|
||||
if (!image.isAppendedToSection()) {
|
||||
log.debug("Image uses first paragraph");
|
||||
log.debug("Image uses last found section");
|
||||
if (!lastFoundTOCItems.isEmpty()) {
|
||||
lastFoundTOCItems.get(0).getImages().add(image);
|
||||
lastFoundTOCItems.get(lastFoundTOCItems.size() - 1).getImages().add(image);
|
||||
} else {
|
||||
startImages.add(image);
|
||||
}
|
||||
|
||||
@ -0,0 +1,231 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class OutlineDetectionTest extends AbstractTest {
|
||||
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
|
||||
@Autowired
|
||||
protected LayoutParsingPipeline layoutParsingPipeline;
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testOutlinesToSections() {
|
||||
|
||||
String fileName = "files/new/crafted_outline_test_doc.pdf";
|
||||
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
|
||||
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
|
||||
assertEquals(outlineObjectTree.getRootNodes().size(), 8);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(1).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(3).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(4).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(5).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(6).size(), 2);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(7).size(), 3);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(8).size(), 2);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(10).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(11).size(), 4);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(12).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(13).size(), 2);
|
||||
assertTrue(outlineObjectTree.getOutlineObjectsPerPage().values()
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.allMatch(OutlineObject::isFound));
|
||||
|
||||
TableOfContents tableOfContents = classificationDocument.getTableOfContents();
|
||||
|
||||
assertEquals(tableOfContents.getMainSections().size(), 9);
|
||||
assertEquals(tableOfContents.getMainSections().subList(1, 9)
|
||||
.stream()
|
||||
.map(tableOfContentItem -> sanitizeString(tableOfContentItem.getHeadline().toString()))
|
||||
.toList(),
|
||||
outlineObjectTree.getRootNodes()
|
||||
.stream()
|
||||
.map(outlineObjectTreeNode -> sanitizeString(outlineObjectTreeNode.getOutlineObject().getTitle()))
|
||||
.toList());
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(5).getChildren().size(), 6);
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(7).getChildren().size(), 3);
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(8).getChildren().size(), 3);
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(8).getChildren()
|
||||
.get(2).getChildren().size(), 1);
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(8).getChildren()
|
||||
.get(2).getChildren()
|
||||
.get(0).getChildren().size(), 3);
|
||||
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(0).getImages().size(), 1);
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(6).getImages().size(), 1);
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(8).getChildren()
|
||||
.get(2).getChildren()
|
||||
.get(0).getChildren()
|
||||
.get(2).getImages().size(), 1);
|
||||
|
||||
Document document = buildGraph(fileName, classificationDocument);
|
||||
|
||||
assertTrue(tableOfContents.getAllTableOfContentItems()
|
||||
.stream()
|
||||
.allMatch(tableOfContentItem -> tableOfContentItem.getSection() != null));
|
||||
assertTrue(tableOfContents.getAllTableOfContentItems()
|
||||
.stream()
|
||||
.filter(tableOfContentItem -> tableOfContentItem.getChildren().isEmpty())
|
||||
.allMatch(tableOfContentItem -> tableOfContentItem.getSection() instanceof Section));
|
||||
assertTrue(tableOfContents.getAllTableOfContentItems()
|
||||
.stream()
|
||||
.filter(tableOfContentItem -> !tableOfContentItem.getChildren().isEmpty())
|
||||
.allMatch(tableOfContentItem -> tableOfContentItem.getSection() instanceof SuperSection));
|
||||
|
||||
List<SemanticNode> childrenOfTypeSectionOrSuperSection = document.getChildrenOfTypeSectionOrSuperSection();
|
||||
|
||||
assertEquals(childrenOfTypeSectionOrSuperSection.size(), 9);
|
||||
assertEquals(childrenOfTypeSectionOrSuperSection.subList(1, 9)
|
||||
.stream()
|
||||
.map(section -> sanitizeString(section.getHeadline().getLeafTextBlock().toString()))
|
||||
.toList(),
|
||||
outlineObjectTree.getRootNodes()
|
||||
.stream()
|
||||
.map(outlineObjectTreeNode -> sanitizeString(outlineObjectTreeNode.getOutlineObject().getTitle()))
|
||||
.toList());
|
||||
Predicate<SemanticNode> isSectionOrSuperSection = semanticNode -> semanticNode instanceof Section || semanticNode instanceof SuperSection;
|
||||
assertEquals(childrenOfTypeSectionOrSuperSection.get(5).streamChildren()
|
||||
.filter(isSectionOrSuperSection)
|
||||
.count(), 6 + 1); // 1 additional for main text of parent section
|
||||
assertEquals(childrenOfTypeSectionOrSuperSection.get(7).streamChildren()
|
||||
.filter(isSectionOrSuperSection)
|
||||
.count(), 3 + 1);
|
||||
assertEquals(childrenOfTypeSectionOrSuperSection.get(8).streamChildren()
|
||||
.filter(isSectionOrSuperSection)
|
||||
.count(), 3 + 1);
|
||||
assertEquals(childrenOfTypeSectionOrSuperSection.get(8).streamChildren()
|
||||
.filter(isSectionOrSuperSection)
|
||||
.toList()
|
||||
.get(3).streamChildren()
|
||||
.filter(isSectionOrSuperSection)
|
||||
.count(), 1 + 1);
|
||||
assertEquals(childrenOfTypeSectionOrSuperSection.get(8).streamChildren()
|
||||
.filter(isSectionOrSuperSection)
|
||||
.toList()
|
||||
.get(3).streamChildren()
|
||||
.filter(isSectionOrSuperSection)
|
||||
.toList()
|
||||
.get(1).streamChildren()
|
||||
.filter(isSectionOrSuperSection)
|
||||
.count(), 3 + 1);
|
||||
|
||||
List<List<Integer>> imageTreeIdList = document.streamAllImages()
|
||||
.map(image -> image.getParent().getTreeId())
|
||||
.toList();
|
||||
|
||||
assertEquals(imageTreeIdList.get(0), List.of(0));
|
||||
assertEquals(imageTreeIdList.get(1), List.of(6));
|
||||
assertEquals(imageTreeIdList.get(2), List.of(8, 4, 2, 4));
|
||||
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
|
||||
|
||||
private static String sanitizeString(String text) {
|
||||
|
||||
return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected ClassificationDocument parseLayout(String filename, LayoutParsingType layoutParsingType) {
|
||||
|
||||
File fileResource = new ClassPathResource(filename).getFile();
|
||||
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/crafted_outline_test_doc.IMAGE_INFO.json");
|
||||
return layoutParsingPipeline.parseLayout(layoutParsingType,
|
||||
fileResource,
|
||||
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", filename, "debug", "true"));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected Document buildGraph(String filename, ClassificationDocument classificationDocument) {
|
||||
|
||||
if (!filename.startsWith("files") && filename.startsWith("/")) {
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true);
|
||||
prepareStorage(layoutParsingRequest, new File(filename));
|
||||
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
new File(filename),
|
||||
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
||||
.get()),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
layoutParsingRequest.identifier()));
|
||||
} else {
|
||||
prepareStorage(filename);
|
||||
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -31,7 +31,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/syngenta/CustomerFiles/SinglePages/S4_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String fileName = "files/new/crafted_outline_test_doc.pdf";
|
||||
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
|
||||
@ -48,6 +48,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
|
||||
|
||||
|
||||
if (!filename.startsWith("files") && filename.startsWith("/")) {
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true);
|
||||
prepareStorage(layoutParsingRequest, new File(filename));
|
||||
|
||||
@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:8afb731a307e1a3f827c59e902164b10bdabef96e14193b949fe081cd3aa859f
|
||||
size 168878
|
||||
@ -0,0 +1,135 @@
|
||||
{
|
||||
"dossierId": "a91f19ff-11ba-4735-9f60-c650243f64a9",
|
||||
"fileId": "6e8c5f114e2b71e103a32a20c5273188",
|
||||
"targetFileExtension": "ORIGIN.pdf.gz",
|
||||
"responseFileExtension": "IMAGE_INFO.json.gz",
|
||||
"X-TENANT-ID": "redaction",
|
||||
"data": [
|
||||
{
|
||||
"classification": {
|
||||
"label": "other",
|
||||
"probabilities": {
|
||||
"other": 0.9126,
|
||||
"formula": 0.0588,
|
||||
"signature": 0.0261,
|
||||
"logo": 0.0024
|
||||
}
|
||||
},
|
||||
"representation": "70E1070C1030E081B7EF7FFFF",
|
||||
"position": {
|
||||
"x1": 61,
|
||||
"x2": 394,
|
||||
"y1": 155,
|
||||
"y2": 470,
|
||||
"pageNumber": 1
|
||||
},
|
||||
"geometry": {
|
||||
"width": 333,
|
||||
"height": 315
|
||||
},
|
||||
"alpha": false,
|
||||
"filters": {
|
||||
"geometry": {
|
||||
"imageSize": {
|
||||
"quotient": 0.5976,
|
||||
"tooLarge": false,
|
||||
"tooSmall": false
|
||||
},
|
||||
"imageFormat": {
|
||||
"quotient": 1.0571,
|
||||
"tooTall": false,
|
||||
"tooWide": false
|
||||
}
|
||||
},
|
||||
"probability": {
|
||||
"unconfident": false
|
||||
},
|
||||
"allPassed": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"classification": {
|
||||
"label": "other",
|
||||
"probabilities": {
|
||||
"other": 0.9126,
|
||||
"formula": 0.0588,
|
||||
"signature": 0.0261,
|
||||
"logo": 0.0024
|
||||
}
|
||||
},
|
||||
"representation": "70E1070C1030E081B7EF7FFFF",
|
||||
"position": {
|
||||
"x1": 61,
|
||||
"x2": 394,
|
||||
"y1": 202,
|
||||
"y2": 517,
|
||||
"pageNumber": 11
|
||||
},
|
||||
"geometry": {
|
||||
"width": 333,
|
||||
"height": 315
|
||||
},
|
||||
"alpha": false,
|
||||
"filters": {
|
||||
"geometry": {
|
||||
"imageSize": {
|
||||
"quotient": 0.5976,
|
||||
"tooLarge": false,
|
||||
"tooSmall": false
|
||||
},
|
||||
"imageFormat": {
|
||||
"quotient": 1.0571,
|
||||
"tooTall": false,
|
||||
"tooWide": false
|
||||
}
|
||||
},
|
||||
"probability": {
|
||||
"unconfident": false
|
||||
},
|
||||
"allPassed": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"classification": {
|
||||
"label": "other",
|
||||
"probabilities": {
|
||||
"other": 0.9126,
|
||||
"formula": 0.0588,
|
||||
"signature": 0.0261,
|
||||
"logo": 0.0024
|
||||
}
|
||||
},
|
||||
"representation": "70E1070C1030E081B7EF7FFFF",
|
||||
"position": {
|
||||
"x1": 47,
|
||||
"x2": 379,
|
||||
"y1": 289,
|
||||
"y2": 604,
|
||||
"pageNumber": 16
|
||||
},
|
||||
"geometry": {
|
||||
"width": 332,
|
||||
"height": 315
|
||||
},
|
||||
"alpha": false,
|
||||
"filters": {
|
||||
"geometry": {
|
||||
"imageSize": {
|
||||
"quotient": 0.5967,
|
||||
"tooLarge": false,
|
||||
"tooSmall": false
|
||||
},
|
||||
"imageFormat": {
|
||||
"quotient": 1.054,
|
||||
"tooTall": false,
|
||||
"tooWide": false
|
||||
}
|
||||
},
|
||||
"probability": {
|
||||
"unconfident": false
|
||||
},
|
||||
"allPassed": true
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user