RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
This commit is contained in:
parent
59d9d6c3e6
commit
17756f5977
@ -31,7 +31,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTreeNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
@ -44,7 +44,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||
@ -231,14 +231,15 @@ public class LayoutParsingPipeline {
|
||||
PDDocument originDocument = openDocument(originFile);
|
||||
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
|
||||
|
||||
OutlineObjectTree outlineObjectTree = outlineExtractorService.getOutlineObjectTree(originDocument);
|
||||
|
||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
|
||||
// parsing the structure elements could be useful as well
|
||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||
|
||||
long pageCount = originDocument.getNumberOfPages();
|
||||
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
@ -296,10 +297,11 @@ public class LayoutParsingPipeline {
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
|
||||
};
|
||||
|
||||
List<OutlineObject> outlineObjects = outlineObjectTree.getOutlineObjectsPerPage()
|
||||
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage()
|
||||
.get(pageNumber - 1);
|
||||
if(outlineObjects != null) {
|
||||
blockificationPostprocessingService.sanitizeOutlineBlocksWithKdTree(classificationPage, outlineObjects);
|
||||
if (outlineObjects != null) {
|
||||
classificationPage.setOutlineObjects(outlineObjects);
|
||||
blockificationPostprocessingService.sanitizeOutlineBlocksWithKdTree(classificationPage);
|
||||
}
|
||||
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
@ -361,7 +363,6 @@ public class LayoutParsingPipeline {
|
||||
.toList();
|
||||
// ???
|
||||
|
||||
|
||||
log.info("Building Sections for {}", identifier);
|
||||
|
||||
switch (layoutParsingType) {
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||
|
||||
@ -26,4 +27,6 @@ public class ClassificationDocument {
|
||||
|
||||
private long rulesVersion;
|
||||
|
||||
private OutlineObjectTree outlineObjectTree;
|
||||
|
||||
}
|
||||
|
||||
@ -8,13 +8,13 @@ import java.util.Map;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
@ -23,6 +23,10 @@ public class ClassificationPage {
|
||||
@NonNull
|
||||
private List<AbstractPageBlock> textBlocks;
|
||||
|
||||
private List<OutlineObject> outlineObjects = new ArrayList<>();
|
||||
|
||||
private List<AbstractPageBlock> headlines = new ArrayList<>();
|
||||
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
|
||||
private Rectangle bodyTextFrame;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.io.IOException;
|
||||
@ -74,6 +74,8 @@ public class OutlineExtractorService {
|
||||
}
|
||||
|
||||
|
||||
// if the structure elements are processed beforehand, another case can be handled here as well:
|
||||
// outline objects can reference structure elements (see pdf documentation)
|
||||
@SneakyThrows
|
||||
private OutlineObjectTreeNode createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
|
||||
|
||||
@ -39,11 +39,4 @@ public class OutlineObjectTree {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return super.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,210 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Service
|
||||
@Slf4j
|
||||
public class OutlineValidationService {
|
||||
|
||||
|
||||
public TableOfContents validateWithToC(List<TextPageBlock> allHeadlines, List<TextPageBlock> headlinesFromOutlines, List<TextPageBlock> newlyClassifiedHeadlines) {
|
||||
TableOfContents validatedToC = createToC(headlinesFromOutlines);
|
||||
TableOfContents currentToC = createToC(allHeadlines);
|
||||
|
||||
TableOfContentItem lastHeadlineFromOutlines = null;
|
||||
for (TableOfContentItem tocItem : currentToC.getAllTableOfContentItems()) {
|
||||
if (!containsItem(validatedToC, tocItem)) {
|
||||
addItemAtCorrectPosition(validatedToC, tocItem, lastHeadlineFromOutlines);
|
||||
} else {
|
||||
lastHeadlineFromOutlines = tocItem;
|
||||
}
|
||||
}
|
||||
return validatedToC;
|
||||
}
|
||||
|
||||
private boolean containsBlock(TableOfContents toc, TextPageBlock block) {
|
||||
for (TableOfContentItem existingItem : toc.getMainSections()) {
|
||||
if (existingItem.getTextPageBlock().equals(block) || existingItem.contains(block)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean containsItem(TableOfContents toc, TableOfContentItem tocItem) {
|
||||
for (TableOfContentItem existingItem : toc.getMainSections()) {
|
||||
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private void addItemAtCorrectPosition(TableOfContents toc, TableOfContentItem tocItem, TableOfContentItem lastHeadlineFromOutlines) {
|
||||
|
||||
if(!tocItem.getChildren().isEmpty()) {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
public TableOfContents createToC(List<TextPageBlock> headlines) {
|
||||
|
||||
List<TableOfContentItem> mainSections = new ArrayList<>();
|
||||
int parentDepth = 7; // more than 6 (h6)
|
||||
TableOfContentItem parent = null;
|
||||
for (TextPageBlock current : headlines) {
|
||||
int currentDepth = getDepth(current.getClassification());
|
||||
if(parentDepth >= currentDepth) {
|
||||
parentDepth = currentDepth;
|
||||
parent = new TableOfContentItem(current);
|
||||
mainSections.add(parent);
|
||||
} else {
|
||||
assert (parent!=null);
|
||||
while(parentDepth < currentDepth && parent.getParent() != null) {
|
||||
parent = parent.getParent();
|
||||
parentDepth = getDepth(parent.getTextPageBlock().getClassification());
|
||||
}
|
||||
parent.addChild(new TableOfContentItem(current));
|
||||
}
|
||||
}
|
||||
return new TableOfContents(mainSections);
|
||||
|
||||
}
|
||||
|
||||
public void updateOutlineObjectTree(OutlineObjectTree outlineObjectTree, List<TextPageBlock> allHeadlines, List<TextPageBlock> newlyClassifiedHeadlines) {
|
||||
|
||||
List<OutlineObject> newOutlineObjects = newlyClassifiedHeadlines.stream()
|
||||
.map(textPageBlock -> new OutlineObject(textPageBlock.getText(),
|
||||
textPageBlock.getPage(),
|
||||
new Point2D.Double(textPageBlock.getMinX(), textPageBlock.getMinY()),
|
||||
getDepth(textPageBlock.getClassification())))
|
||||
.toList();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static int getDepth(PageBlockType pageBlockType) {
|
||||
|
||||
return switch (pageBlockType) {
|
||||
case H1 -> 1;
|
||||
case H2 -> 2;
|
||||
case H3 -> 3;
|
||||
case H4 -> 4;
|
||||
case H5 -> 5;
|
||||
default -> 6;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public void validate(List<TextPageBlock> allHeadlines, List<TextPageBlock> newlyClassifiedHeadlines) {
|
||||
|
||||
if (allHeadlines.size() - newlyClassifiedHeadlines.size() > newlyClassifiedHeadlines.size()) {
|
||||
|
||||
List<Headline> headlines = allHeadlines.stream()
|
||||
.map(textPageBlock -> new Headline(textPageBlock, newlyClassifiedHeadlines.contains(textPageBlock)))
|
||||
.toList();
|
||||
for (TextPageBlock newHeadline : newlyClassifiedHeadlines) {
|
||||
int newHeadlineIndex = headlines.indexOf(newHeadline);
|
||||
List<TextPageBlock> adjacentNewlyClassified = findAdjacentNewlyClassified(newHeadline, newlyClassifiedHeadlines);
|
||||
// Find neighboring headlines from outlines
|
||||
//TextPageBlock previousOutline = findNeighboringOutline(allHeadlines.indexOf(newHeadline), -1);
|
||||
//TextPageBlock nextOutline = findNeighboringOutline(allHeadlines.indexOf(newHeadline), headlinesFromOutlines, 1);
|
||||
|
||||
// If we have neighboring outlines, perform comparison
|
||||
//if (previousOutline != null && nextOutline != null) {
|
||||
// // Compare headline orders
|
||||
// int orderComparison = compareHeadlineOrder(previousOutline, nextOutline);
|
||||
// if (orderComparison != 0) {
|
||||
// // Set classification based on comparison
|
||||
// setClassification(newHeadline, orderComparison, previousOutline, nextOutline);
|
||||
// }
|
||||
//}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> findAdjacentNewlyClassified(TextPageBlock headline, List<TextPageBlock> newlyClassifiedHeadlines) {
|
||||
// Find adjacent newly classified headlines
|
||||
List<TextPageBlock> adjacentNewlyClassified = new ArrayList<>();
|
||||
int index = newlyClassifiedHeadlines.indexOf(headline);
|
||||
if (index != -1) {
|
||||
adjacentNewlyClassified.add(headline);
|
||||
for (int i = index - 1; i >= 0; i--) {
|
||||
if (newlyClassifiedHeadlines.get(i).equals(adjacentNewlyClassified.get(0))) {
|
||||
adjacentNewlyClassified.add(0, newlyClassifiedHeadlines.get(i));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (int i = index + 1; i < newlyClassifiedHeadlines.size(); i++) {
|
||||
if (newlyClassifiedHeadlines.get(i).equals(adjacentNewlyClassified.get(adjacentNewlyClassified.size() - 1))) {
|
||||
adjacentNewlyClassified.add(newlyClassifiedHeadlines.get(i));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return adjacentNewlyClassified;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock findNeighboringOutline(TextPageBlock headline, List<TextPageBlock> headlinesFromOutlines, int direction) {
|
||||
// Find neighboring headline from outlines in the specified direction
|
||||
int index = headlinesFromOutlines.indexOf(headline);
|
||||
if (index != -1 && index + direction >= 0 && index + direction < headlinesFromOutlines.size()) {
|
||||
return headlinesFromOutlines.get(index + direction);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
private int compareHeadlineOrder(TextPageBlock headline1, TextPageBlock headline2) {
|
||||
// Compare headline orders
|
||||
// Implement your comparison logic here
|
||||
return 0; // Placeholder return, implement actual comparison logic
|
||||
}
|
||||
|
||||
|
||||
private void setClassification(TextPageBlock headline, int orderComparison, TextPageBlock previousOutline, TextPageBlock nextOutline) {
|
||||
// Set classification based on comparison with neighboring outlines
|
||||
// Implement your classification logic here
|
||||
}
|
||||
|
||||
|
||||
record Headline(TextPageBlock textPageBlock, boolean newlyClassified) {
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (obj == null || getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
Headline headline = (Headline) obj;
|
||||
return Objects.equals(textPageBlock, headline.textPageBlock);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return Objects.hash(textPageBlock);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,93 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
|
||||
@Data
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class TableOfContentItem {
|
||||
|
||||
@EqualsAndHashCode.Include
|
||||
private TextPageBlock textPageBlock;
|
||||
private List<TableOfContentItem> children = new ArrayList<>();
|
||||
private TableOfContentItem parent;
|
||||
|
||||
|
||||
public TableOfContentItem(TextPageBlock textPageBlock) {
|
||||
|
||||
this.textPageBlock = textPageBlock;
|
||||
}
|
||||
|
||||
|
||||
public void addChild(TableOfContentItem tableOfContentItem) {
|
||||
|
||||
children.add(tableOfContentItem);
|
||||
tableOfContentItem.setParent(this);
|
||||
}
|
||||
|
||||
|
||||
public TableOfContentItem getSiblingBefore() {
|
||||
|
||||
try {
|
||||
return parent.getChildren()
|
||||
.get(parent.getChildren().indexOf(this) - 1);
|
||||
} catch (IndexOutOfBoundsException indexOutOfBoundsException) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
public TableOfContentItem getSiblingAfter() {
|
||||
|
||||
try {
|
||||
return parent.getChildren()
|
||||
.get(parent.getChildren().indexOf(this) + 1);
|
||||
} catch (IndexOutOfBoundsException indexOutOfBoundsException) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(TextPageBlock block) {
|
||||
|
||||
boolean anyChildContains = false;
|
||||
if (!children.isEmpty()) {
|
||||
for (TableOfContentItem child : children) {
|
||||
if (child.getTextPageBlock().equals(block)) {
|
||||
return true;
|
||||
} else {
|
||||
anyChildContains = anyChildContains || child.contains(block);
|
||||
}
|
||||
}
|
||||
}
|
||||
return anyChildContains;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(TableOfContentItem tocItem) {
|
||||
|
||||
boolean anyChildContains = false;
|
||||
if (!children.isEmpty()) {
|
||||
for (TableOfContentItem child : children) {
|
||||
if (child.equals(tocItem)) {
|
||||
return true;
|
||||
} else {
|
||||
anyChildContains = anyChildContains || child.contains(tocItem);
|
||||
}
|
||||
}
|
||||
}
|
||||
return anyChildContains;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return "OutlineObjectTreeNode{" + "textPageBlock=" + textPageBlock + '}';
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,59 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
public class TableOfContents {
|
||||
|
||||
private List<TableOfContentItem> mainSections = new ArrayList<>();
|
||||
|
||||
|
||||
public TableOfContents(List<TableOfContentItem> mainSections) {
|
||||
|
||||
this.mainSections = mainSections;
|
||||
}
|
||||
|
||||
|
||||
public List<TextPageBlock> getAllTextPageBlocks() {
|
||||
|
||||
List<TextPageBlock> allTextPageBlocks = new ArrayList<>();
|
||||
for (TableOfContentItem item : mainSections) {
|
||||
collectTextPageBlocks(item, allTextPageBlocks);
|
||||
}
|
||||
return allTextPageBlocks;
|
||||
}
|
||||
|
||||
|
||||
private void collectTextPageBlocks(TableOfContentItem item, List<TextPageBlock> textPageBlocks) {
|
||||
|
||||
textPageBlocks.add(item.getTextPageBlock());
|
||||
for (TableOfContentItem child : item.getChildren()) {
|
||||
collectTextPageBlocks(child, textPageBlocks);
|
||||
}
|
||||
}
|
||||
|
||||
public List<TableOfContentItem> getAllTableOfContentItems() {
|
||||
List<TableOfContentItem> allItems = new ArrayList<>();
|
||||
for (TableOfContentItem item : mainSections) {
|
||||
collectTableOfContentItems(item, allItems);
|
||||
}
|
||||
return allItems;
|
||||
}
|
||||
|
||||
private void collectTableOfContentItems(TableOfContentItem item, List<TableOfContentItem> allItems) {
|
||||
allItems.add(item);
|
||||
for (TableOfContentItem child : item.getChildren()) {
|
||||
collectTableOfContentItems(child, allItems);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -36,8 +36,9 @@ public class BlockificationPostprocessingService {
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
|
||||
public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
|
||||
public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage) {
|
||||
|
||||
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
|
||||
if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
@ -244,6 +245,13 @@ public class BlockificationPostprocessingService {
|
||||
}
|
||||
|
||||
|
||||
// currently only three cases are handled here:
|
||||
// 1. equality
|
||||
// 2. outline title contains block text
|
||||
// 3. block text contains outline title
|
||||
// another possible case is an intersection, meaning a title is split up between two different blocks
|
||||
// this should not happen with how docstrum creates the blocks
|
||||
// if it is indeed necessary, a splitting has to be done with a follow-up merge
|
||||
private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) {
|
||||
|
||||
OutlineObject outlineObject = context.getOutlineObject();
|
||||
|
||||
@ -1,8 +1,13 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
@ -22,15 +27,43 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class RedactManagerClassificationService {
|
||||
|
||||
private final OutlineValidationService outlineValidationService;
|
||||
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
List<TextPageBlock> headlinesFromOutlines = document.getPages()
|
||||
.stream()
|
||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.isHeadline())
|
||||
.map(tb -> (TextPageBlock) tb))
|
||||
.toList();
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
classifyPage(page, document, headlineFontSizes);
|
||||
}
|
||||
|
||||
List<TextPageBlock> allHeadlines = document.getPages()
|
||||
.stream()
|
||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
||||
.map(tb -> (TextPageBlock) tb))
|
||||
.toList();
|
||||
|
||||
List<TextPageBlock> newlyClassifiedHeadlines = new ArrayList<>(allHeadlines);
|
||||
newlyClassifiedHeadlines.removeAll(headlinesFromOutlines);
|
||||
|
||||
TableOfContents toC = outlineValidationService.createToC(allHeadlines);
|
||||
System.out.println(toC);
|
||||
|
||||
outlineValidationService.validateWithToC(allHeadlines, headlinesFromOutlines, newlyClassifiedHeadlines);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -48,7 +81,7 @@ public class RedactManagerClassificationService {
|
||||
|
||||
var bodyTextFrame = page.getBodyTextFrame();
|
||||
|
||||
if(textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||
return;
|
||||
}
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
@ -62,33 +95,30 @@ public class RedactManagerClassificationService {
|
||||
.anyMatch(graphic -> graphic.getPosition().intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()))) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
return;
|
||||
}
|
||||
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
||||
textBlock,
|
||||
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
} if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
||||
textBlock,
|
||||
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||
.size() == 1)) {
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
}
|
||||
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
|
||||
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
|
||||
.getCountPerValue()
|
||||
.containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
|
||||
.get(0)
|
||||
.getTextPositions()
|
||||
.get(0)
|
||||
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 4.9
|
||||
&& (textBlock.getMostPopularWordStyle().equals("bold")
|
||||
|| !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
|
||||
&& textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
|
||||
&& textBlock.getSequences()
|
||||
.get(0).getTextPositions()
|
||||
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
|
||||
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||
@ -96,25 +126,31 @@ public class RedactManagerClassificationService {
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
}
|
||||
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
|
||||
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
|
||||
.get(0)
|
||||
.getTextPositions()
|
||||
.get(0)
|
||||
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
} else if (!textBlock.getText().startsWith("Figure ")
|
||||
&& PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordStyle().equals("bold")
|
||||
&& !document.getFontStyleCounter().getMostPopular().equals("bold")
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
|
||||
&& textBlock.getSequences()
|
||||
.get(0).getTextPositions()
|
||||
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||
&& textBlock.getMostPopularWordStyle().equals("bold")
|
||||
&& !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
||||
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
||||
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
|
||||
&& textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
|
||||
.getMostPopular()
|
||||
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||
&& textBlock.getMostPopularWordStyle().equals("italic")
|
||||
&& !document.getFontStyleCounter().getMostPopular().equals("italic")
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||
|
||||
@ -3,7 +3,7 @@ package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user