RED-7074: Design Subsection section tree structure algorithm
* first draft
This commit is contained in:
parent
9bd8419770
commit
7f675b41cf
@ -28,6 +28,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTreeNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
@ -40,10 +43,12 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.OutlineExtractorService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||||
@ -85,11 +90,13 @@ public class LayoutParsingPipeline {
|
|||||||
TableExtractionService tableExtractionService;
|
TableExtractionService tableExtractionService;
|
||||||
DocuMineBlockificationService docuMineBlockificationService;
|
DocuMineBlockificationService docuMineBlockificationService;
|
||||||
RedactManagerBlockificationService redactManagerBlockificationService;
|
RedactManagerBlockificationService redactManagerBlockificationService;
|
||||||
|
BlockificationPostprocessingService blockificationPostprocessingService;
|
||||||
DocstrumBlockificationService docstrumBlockificationService;
|
DocstrumBlockificationService docstrumBlockificationService;
|
||||||
LayoutGridService layoutGridService;
|
LayoutGridService layoutGridService;
|
||||||
ObservationRegistry observationRegistry;
|
ObservationRegistry observationRegistry;
|
||||||
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||||
ClarifyndClassificationService clarifyndClassificationService;
|
ClarifyndClassificationService clarifyndClassificationService;
|
||||||
|
OutlineExtractorService outlineExtractorService;
|
||||||
|
|
||||||
|
|
||||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||||
@ -98,29 +105,36 @@ public class LayoutParsingPipeline {
|
|||||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||||
|
|
||||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
||||||
|
.orElse(originFile);
|
||||||
|
|
||||||
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
|
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
|
||||||
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
|
if (layoutParsingRequest.visualLayoutParsingFileId()
|
||||||
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get());
|
.isPresent()) {
|
||||||
|
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
|
||||||
|
.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
if (layoutParsingRequest.imagesFileStorageId()
|
||||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
.isPresent()) {
|
||||||
|
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
||||||
|
.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
if (layoutParsingRequest.tablesFileStorageId()
|
||||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
.isPresent()) {
|
||||||
|
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
|
||||||
|
.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
|
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
|
||||||
originFile,
|
originFile,
|
||||||
imageServiceResponse,
|
imageServiceResponse,
|
||||||
tableServiceResponse,
|
tableServiceResponse,
|
||||||
visualLayoutParsingResponse,
|
visualLayoutParsingResponse,
|
||||||
layoutParsingRequest.identifier());
|
layoutParsingRequest.identifier());
|
||||||
|
|
||||||
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
||||||
|
|
||||||
@ -152,25 +166,25 @@ public class LayoutParsingPipeline {
|
|||||||
.numberOfPages(documentGraph.getNumberOfPages())
|
.numberOfPages(documentGraph.getNumberOfPages())
|
||||||
.duration(System.currentTimeMillis() - start)
|
.duration(System.currentTimeMillis() - start)
|
||||||
.message(format("""
|
.message(format("""
|
||||||
Layout parsing has finished in %.02f s.
|
Layout parsing has finished in %.02f s.
|
||||||
identifiers: %s
|
identifiers: %s
|
||||||
%s
|
%s
|
||||||
Files have been saved with Ids:
|
Files have been saved with Ids:
|
||||||
Structure: %s
|
Structure: %s
|
||||||
Text: %s
|
Text: %s
|
||||||
Positions: %s
|
Positions: %s
|
||||||
PageData: %s
|
PageData: %s
|
||||||
Simplified Text: %s
|
Simplified Text: %s
|
||||||
Viewer Doc: %s""",
|
Viewer Doc: %s""",
|
||||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||||
layoutParsingRequest.identifier(),
|
layoutParsingRequest.identifier(),
|
||||||
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
||||||
layoutParsingRequest.structureFileStorageId(),
|
layoutParsingRequest.structureFileStorageId(),
|
||||||
layoutParsingRequest.textBlockFileStorageId(),
|
layoutParsingRequest.textBlockFileStorageId(),
|
||||||
layoutParsingRequest.positionBlockFileStorageId(),
|
layoutParsingRequest.positionBlockFileStorageId(),
|
||||||
layoutParsingRequest.pageFileStorageId(),
|
layoutParsingRequest.pageFileStorageId(),
|
||||||
layoutParsingRequest.simplifiedTextStorageId(),
|
layoutParsingRequest.simplifiedTextStorageId(),
|
||||||
layoutParsingRequest.viewerDocumentStorageId()))
|
layoutParsingRequest.viewerDocumentStorageId()))
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -191,14 +205,14 @@ public class LayoutParsingPipeline {
|
|||||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||||
|
|
||||||
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||||
numberOfPages,
|
numberOfPages,
|
||||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -213,6 +227,9 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
PDDocument originDocument = openDocument(originFile);
|
PDDocument originDocument = openDocument(originFile);
|
||||||
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
|
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
|
||||||
|
|
||||||
|
OutlineObjectTree outlineObjectTree = outlineExtractorService.getOutlineObjectTree(originDocument);
|
||||||
|
|
||||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||||
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
||||||
@ -264,6 +281,12 @@ public class LayoutParsingPipeline {
|
|||||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
|
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
List<OutlineObject> outlineObjects = outlineObjectTree.getOutlineObjectsPerPage()
|
||||||
|
.get(pageNumber - 1);
|
||||||
|
if(outlineObjects != null) {
|
||||||
|
blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, outlineObjects);
|
||||||
|
}
|
||||||
|
|
||||||
classificationPage.setCleanRulings(cleanRulings);
|
classificationPage.setCleanRulings(cleanRulings);
|
||||||
classificationPage.setRotation(rotation);
|
classificationPage.setRotation(rotation);
|
||||||
classificationPage.setLandscape(isLandscape);
|
classificationPage.setLandscape(isLandscape);
|
||||||
|
|||||||
@ -0,0 +1,25 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||||
|
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class OutlineObject {
|
||||||
|
|
||||||
|
private String title;
|
||||||
|
private int pageNumber;
|
||||||
|
//private Point2D point;
|
||||||
|
private int treeDepth;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return "OutlineObject{" + "title='" + title + '\'' + '}';
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,49 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class OutlineObjectTree {
|
||||||
|
|
||||||
|
private List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
|
||||||
|
|
||||||
|
private Map<Integer, List<OutlineObject>> outlineObjectsPerPage = new HashMap<>();
|
||||||
|
|
||||||
|
|
||||||
|
public OutlineObjectTree(List<OutlineObjectTreeNode> rootNodes) {
|
||||||
|
|
||||||
|
this.rootNodes = rootNodes;
|
||||||
|
flattenNodesAndGroupByPage(rootNodes);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void flattenNodesAndGroupByPage(List<OutlineObjectTreeNode> outlineObjectTreeNodes) {
|
||||||
|
|
||||||
|
for (OutlineObjectTreeNode node : outlineObjectTreeNodes) {
|
||||||
|
int pageNumber = node.getOutlineObject().getPageNumber();
|
||||||
|
if (!this.outlineObjectsPerPage.containsKey(pageNumber)) {
|
||||||
|
outlineObjectsPerPage.put(pageNumber, new ArrayList<>());
|
||||||
|
}
|
||||||
|
outlineObjectsPerPage.get(pageNumber).add(node.getOutlineObject());
|
||||||
|
|
||||||
|
if (!node.getChildren().isEmpty()) {
|
||||||
|
flattenNodesAndGroupByPage(node.getChildren());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return super.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,37 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||||
|
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.Getter;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
public class OutlineObjectTreeNode {
|
||||||
|
|
||||||
|
private OutlineObject outlineObject;
|
||||||
|
|
||||||
|
private List<OutlineObjectTreeNode> children = new ArrayList<>();
|
||||||
|
|
||||||
|
|
||||||
|
public OutlineObjectTreeNode(OutlineObject outlineObject) {
|
||||||
|
|
||||||
|
this.outlineObject = outlineObject;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addChild(OutlineObjectTreeNode outlineObject) {
|
||||||
|
|
||||||
|
children.add(outlineObject);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return "OutlineObjectTreeNode{" + "outlineObject=" + outlineObject + '}';
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,70 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
|
||||||
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTreeNode;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
public class OutlineExtractorService {
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public OutlineObjectTree getOutlineObjectTree(PDDocument document) {
|
||||||
|
|
||||||
|
PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();
|
||||||
|
|
||||||
|
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
|
||||||
|
for (PDOutlineItem child : documentOutline.children()) {
|
||||||
|
OutlineObjectTreeNode outlineObject = createOutlineObjectWithChildren(child, document, 1);
|
||||||
|
rootNodes.add(outlineObject);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new OutlineObjectTree(rootNodes);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private OutlineObjectTreeNode createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
|
||||||
|
|
||||||
|
OutlineObjectTreeNode outlineObject = createOutlineObject(item, document, depth);
|
||||||
|
for (var child : item.children()) {
|
||||||
|
outlineObject.addChild(createOutlineObjectWithChildren(child, document, depth + 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
return outlineObject;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private OutlineObjectTreeNode createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
|
||||||
|
|
||||||
|
String title = item.getTitle();
|
||||||
|
|
||||||
|
PDPage page = item.findDestinationPage(document);
|
||||||
|
int pageNumber = document.getPages().indexOf(page);
|
||||||
|
|
||||||
|
//float x = 0;
|
||||||
|
//float y = 0;
|
||||||
|
//COSDictionary cosObject = item.getAction().getCOSObject();
|
||||||
|
// if (cosObject.getNameAsString("S").toLowerCase(Locale.ROOT).equals("goto")) {
|
||||||
|
// COSArray cosArray = cosObject.getCOSArray(COSName.D);
|
||||||
|
// x = ((COSInteger)cosArray.get(2)).floatValue();
|
||||||
|
// y = ((COSInteger)cosArray.get(3)).floatValue();
|
||||||
|
//
|
||||||
|
// }
|
||||||
|
//return new OutlineObject(title, pageNumber, new Point2D.Float(x, y));
|
||||||
|
|
||||||
|
return new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, depth));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,46 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
public class BlockificationPostprocessingService {
|
||||||
|
|
||||||
|
public void sanitizeOutlineBlocks(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
|
||||||
|
|
||||||
|
|
||||||
|
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
|
||||||
|
for (OutlineObject outlineObject : outlineObjects) {
|
||||||
|
|
||||||
|
String blockText = textBlock.getText();
|
||||||
|
String outlineTitle = outlineObject.getTitle();
|
||||||
|
|
||||||
|
if (!blockText.contains(outlineTitle)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (blockText.equals(outlineTitle)) {
|
||||||
|
|
||||||
|
textBlock.setClassification(PageBlockType.getHeadlineType(outlineObject.getTreeDepth()));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
splitTextBlock(textBlock, outlineTitle, classificationPage);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void splitTextBlock(AbstractPageBlock textBlock, String title, ClassificationPage classificationPage) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -59,6 +59,9 @@ public class DocuMineClassificationService {
|
|||||||
Matcher matcher2 = pattern2.matcher(textBlock.toString());
|
Matcher matcher2 = pattern2.matcher(textBlock.toString());
|
||||||
Matcher matcher3 = pattern3.matcher(textBlock.toString());
|
Matcher matcher3 = pattern3.matcher(textBlock.toString());
|
||||||
|
|
||||||
|
if(textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||||
textBlock.setClassification(PageBlockType.OTHER);
|
textBlock.setClassification(PageBlockType.OTHER);
|
||||||
return;
|
return;
|
||||||
|
|||||||
@ -48,6 +48,9 @@ public class RedactManagerClassificationService {
|
|||||||
|
|
||||||
var bodyTextFrame = page.getBodyTextFrame();
|
var bodyTextFrame = page.getBodyTextFrame();
|
||||||
|
|
||||||
|
if(textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||||
textBlock.setClassification(PageBlockType.OTHER);
|
textBlock.setClassification(PageBlockType.OTHER);
|
||||||
return;
|
return;
|
||||||
|
|||||||
@ -27,7 +27,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
String fileName = "files/new/ScrambledTextAfterSorting.pdf";
|
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
|
||||||
|
//String fileName = "files/new/$100m Offers.pdf";
|
||||||
|
//String fileName = "files/new/kaust-official-thesis-template.pdf";
|
||||||
|
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
@ -35,7 +38,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
|
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user