RED-7074: Design Subsection section tree structure algorithm

* first draft
This commit is contained in:
maverickstuder 2024-04-09 16:53:57 +02:00
parent 9bd8419770
commit 7f675b41cf
9 changed files with 300 additions and 41 deletions

View File

@ -28,6 +28,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTreeNode;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -40,10 +43,12 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.services.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
@ -85,11 +90,13 @@ public class LayoutParsingPipeline {
TableExtractionService tableExtractionService;
DocuMineBlockificationService docuMineBlockificationService;
RedactManagerBlockificationService redactManagerBlockificationService;
BlockificationPostprocessingService blockificationPostprocessingService;
DocstrumBlockificationService docstrumBlockificationService;
LayoutGridService layoutGridService;
ObservationRegistry observationRegistry;
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
ClarifyndClassificationService clarifyndClassificationService;
OutlineExtractorService outlineExtractorService;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -98,29 +105,36 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get());
if (layoutParsingRequest.visualLayoutParsingFileId()
.isPresent()) {
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
.get());
}
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
if (layoutParsingRequest.imagesFileStorageId()
.isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get());
}
TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
if (layoutParsingRequest.tablesFileStorageId()
.isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
.get());
}
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
originFile,
imageServiceResponse,
tableServiceResponse,
visualLayoutParsingResponse,
layoutParsingRequest.identifier());
originFile,
imageServiceResponse,
tableServiceResponse,
visualLayoutParsingResponse,
layoutParsingRequest.identifier());
log.info("Building document graph for {}", layoutParsingRequest.identifier());
@ -152,25 +166,25 @@ public class LayoutParsingPipeline {
.numberOfPages(documentGraph.getNumberOfPages())
.duration(System.currentTimeMillis() - start)
.message(format("""
Layout parsing has finished in %.02f s.
identifiers: %s
%s
Files have been saved with Ids:
Structure: %s
Text: %s
Positions: %s
PageData: %s
Simplified Text: %s
Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
Layout parsing has finished in %.02f s.
identifiers: %s
%s
Files have been saved with Ids:
Structure: %s
Text: %s
Positions: %s
PageData: %s
Simplified Text: %s
Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
.build();
}
@ -191,14 +205,14 @@ public class LayoutParsingPipeline {
/**
 * Builds the human-readable summary line listing how many semantic nodes of each type were parsed.
 * A node type that has no entry in the map is reported as 0.
 *
 * @param numberOfPages      number of pages, inserted as the first value of the message
 * @param semanticNodeCounts count per {@link NodeType}; entries may be missing
 * @return the formatted summary message
 */
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
}
@ -213,6 +227,9 @@ public class LayoutParsingPipeline {
PDDocument originDocument = openDocument(originFile);
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
OutlineObjectTree outlineObjectTree = outlineExtractorService.getOutlineObjectTree(originDocument);
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
@ -264,6 +281,12 @@ public class LayoutParsingPipeline {
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
};
List<OutlineObject> outlineObjects = outlineObjectTree.getOutlineObjectsPerPage()
.get(pageNumber - 1);
if(outlineObjects != null) {
blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, outlineObjects);
}
classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation);
classificationPage.setLandscape(isLandscape);

View File

@ -0,0 +1,25 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.Point2D;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * A single entry of a document outline: its title, the page number it is
 * associated with and its depth inside the outline tree.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class OutlineObject {

    /** Title text of the outline entry. */
    private String title;

    /** Page number this entry is associated with. */
    private int pageNumber;

    // TODO: the destination point on the page (a Point2D) is not stored yet.

    /** Nesting level of this entry inside the outline tree. */
    private int treeDepth;


    /**
     * Short representation that only shows the title.
     */
    @Override
    public String toString() {

        return String.format("OutlineObject{title='%s'}", title);
    }

}

View File

@ -0,0 +1,49 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.Data;
import lombok.RequiredArgsConstructor;
/**
 * Outline hierarchy of a document, kept both as a tree of root nodes and as a
 * flat lookup of all outline entries grouped by their page number.
 */
@Data
@RequiredArgsConstructor
public class OutlineObjectTree {

    /** Top-level nodes of the outline. */
    private List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();

    /** Every outline entry of the tree, keyed by the page number stored on the entry. */
    private Map<Integer, List<OutlineObject>> outlineObjectsPerPage = new HashMap<>();


    /**
     * Creates the tree from the given root nodes and fills the per-page lookup.
     *
     * @param rootNodes top-level nodes of the outline
     */
    public OutlineObjectTree(List<OutlineObjectTreeNode> rootNodes) {

        this.rootNodes = rootNodes;
        flattenNodesAndGroupByPage(rootNodes);
    }


    /**
     * Walks the given nodes depth-first (parent before its children) and files
     * each node's outline entry under its page number.
     */
    private void flattenNodesAndGroupByPage(List<OutlineObjectTreeNode> outlineObjectTreeNodes) {

        for (OutlineObjectTreeNode treeNode : outlineObjectTreeNodes) {
            OutlineObject entry = treeNode.getOutlineObject();
            outlineObjectsPerPage.computeIfAbsent(entry.getPageNumber(), page -> new ArrayList<>())
                    .add(entry);
            // Recursing into an empty child list is a no-op, so no extra emptiness check is needed.
            flattenNodesAndGroupByPage(treeNode.getChildren());
        }
    }


    /**
     * Delegates to the superclass implementation.
     */
    @Override
    public String toString() {

        return super.toString();
    }

}

View File

@ -0,0 +1,37 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.List;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
/**
 * Node of the outline tree: wraps one {@link OutlineObject} and holds the
 * nodes nested directly below it.
 */
@Data
public class OutlineObjectTreeNode {

    /** The outline entry represented by this node. */
    private OutlineObject outlineObject;

    /** Direct children of this node, in insertion order. */
    private List<OutlineObjectTreeNode> children = new ArrayList<>();


    /**
     * Creates a node without children for the given outline entry.
     *
     * @param outlineObject the outline entry to wrap
     */
    public OutlineObjectTreeNode(OutlineObject outlineObject) {

        this.outlineObject = outlineObject;
    }


    /**
     * Appends the given node to the end of this node's children.
     *
     * @param outlineObject the child node to append
     */
    public void addChild(OutlineObjectTreeNode outlineObject) {

        this.children.add(outlineObject);
    }


    /**
     * Short representation that shows the wrapped outline entry but not the children.
     */
    @Override
    public String toString() {

        return String.format("OutlineObjectTreeNode{outlineObject=%s}", outlineObject);
    }

}

View File

@ -0,0 +1,70 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTreeNode;
import lombok.SneakyThrows;
/**
 * Reads the outline (bookmarks) of a PDF document and converts it into an
 * {@link OutlineObjectTree}.
 */
@Service
public class OutlineExtractorService {

    /** Page number recorded for outline items that do not resolve to a page of the document. */
    private static final int UNKNOWN_PAGE = -1;


    /**
     * Builds the outline tree of the given document.
     *
     * @param document the opened PDF document
     * @return the outline tree; it has no nodes when the document has no outline
     */
    @SneakyThrows
    public OutlineObjectTree getOutlineObjectTree(PDDocument document) {

        PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();

        List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
        if (documentOutline == null) {
            // Documents without bookmarks have no outline dictionary; return an empty tree instead of failing.
            return new OutlineObjectTree(rootNodes);
        }

        for (PDOutlineItem child : documentOutline.children()) {
            rootNodes.add(createOutlineObjectWithChildren(child, document, 1));
        }

        return new OutlineObjectTree(rootNodes);
    }


    /**
     * Converts an outline item and, recursively, all of its descendants into tree nodes.
     *
     * @param item     the outline item to convert
     * @param document the document the item belongs to
     * @param depth    depth of the item in the outline; top-level items are passed 1
     * @return the node for the item with its children attached
     */
    @SneakyThrows
    private OutlineObjectTreeNode createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {

        OutlineObjectTreeNode outlineObject = createOutlineObject(item, document, depth);
        for (var child : item.children()) {
            outlineObject.addChild(createOutlineObjectWithChildren(child, document, depth + 1));
        }
        return outlineObject;
    }


    /**
     * Creates a childless node for a single outline item.
     * The page number is the index of the item's destination page in the document's page tree,
     * or {@link #UNKNOWN_PAGE} when the item does not point to a page.
     *
     * @param item     the outline item to convert
     * @param document the document the item belongs to
     * @param depth    depth of the item in the outline
     * @return the node wrapping the item's title, page number and depth
     */
    @SneakyThrows
    private OutlineObjectTreeNode createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {

        String title = item.getTitle();
        PDPage page = item.findDestinationPage(document);
        // Items without a page destination yield null; looking null up in the page tree would throw.
        int pageNumber = page == null ? UNKNOWN_PAGE : document.getPages().indexOf(page);

        // TODO: also extract the destination coordinates (x/y from the GoTo action's destination array)
        //  once OutlineObject stores a point.

        return new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, depth));
    }

}

View File

@ -0,0 +1,46 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
/**
 * Post-processing step that aligns the text blocks of a page with the
 * outline entries found for that page.
 */
@Service
public class BlockificationPostprocessingService {

    /**
     * Compares every text block of the page with every given outline entry.
     * A block whose text equals an outline title is classified as a headline of the
     * entry's tree depth. A block that merely contains the title is handed to
     * {@link #splitTextBlock}. Outline entries with a missing or blank title and
     * blocks without text are ignored.
     *
     * @param classificationPage the page whose text blocks are inspected
     * @param outlineObjects     the outline entries to match against the page's blocks
     */
    public void sanitizeOutlineBlocks(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {

        for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
            for (OutlineObject outlineObject : outlineObjects) {

                String blockText = textBlock.getText();
                String outlineTitle = outlineObject.getTitle();

                // A null title would make String.contains throw, and a blank title would
                // match (almost) every block, so neither can identify a headline.
                if (blockText == null || outlineTitle == null || outlineTitle.isBlank()) {
                    continue;
                }

                if (blockText.equals(outlineTitle)) {
                    textBlock.setClassification(PageBlockType.getHeadlineType(outlineObject.getTreeDepth()));
                } else if (blockText.contains(outlineTitle)) {
                    splitTextBlock(textBlock, outlineTitle, classificationPage);
                }
            }
        }
    }


    /**
     * Placeholder for splitting a block that contains an outline title together with other text.
     * Currently does nothing.
     */
    private void splitTextBlock(AbstractPageBlock textBlock, String title, ClassificationPage classificationPage) {

    }

}

View File

@ -59,6 +59,9 @@ public class DocuMineClassificationService {
Matcher matcher2 = pattern2.matcher(textBlock.toString());
Matcher matcher3 = pattern3.matcher(textBlock.toString());
if(textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.OTHER);
return;

View File

@ -48,6 +48,9 @@ public class RedactManagerClassificationService {
var bodyTextFrame = page.getBodyTextFrame();
if(textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.OTHER);
return;

View File

@ -27,7 +27,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/new/ScrambledTextAfterSorting.pdf";
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
//String fileName = "files/new/$100m Offers.pdf";
//String fileName = "files/new/kaust-official-thesis-template.pdf";
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();
@ -35,7 +38,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
long start = System.currentTimeMillis();
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}