RED-7074: Design Subsection section tree structure algorithm

* first draft: further implementations
This commit is contained in:
maverickstuder 2024-04-19 11:31:34 +02:00
parent 77ee8dd5bd
commit 09148960cf
6 changed files with 81 additions and 52 deletions

View File

@ -28,9 +28,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -43,7 +44,6 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
@ -97,6 +97,7 @@ public class LayoutParsingPipeline {
VisualLayoutParsingAdapter visualLayoutParsingAdapter; VisualLayoutParsingAdapter visualLayoutParsingAdapter;
ClarifyndClassificationService clarifyndClassificationService; ClarifyndClassificationService clarifyndClassificationService;
OutlineExtractorService outlineExtractorService; OutlineExtractorService outlineExtractorService;
OutlineValidationService outlineValidationService;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -339,14 +340,15 @@ public class LayoutParsingPipeline {
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument); case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
} }
// compute ToC List<TextPageBlock> headlines = classificationDocument.getPages()
List<AbstractPageBlock> headlines = classificationDocument.getPages()
.stream() .stream()
.flatMap(classificationPage -> classificationPage.getTextBlocks() .flatMap(classificationPage -> classificationPage.getTextBlocks()
.stream() .stream()
.filter(tb -> tb.getClassification().isHeadline())) .filter(tb -> tb instanceof TextPageBlock && tb.getClassification().isHeadline())
.map(tb -> (TextPageBlock) tb))
.toList(); .toList();
// ??? TableOfContents tableOfContents = outlineValidationService.createToC(headlines);
classificationDocument.setTableOfContents(tableOfContents);
log.info("Building Sections for {}", identifier); log.info("Building Sections for {}", identifier);

View File

@ -4,6 +4,7 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText; import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
@ -28,5 +29,6 @@ public class ClassificationDocument {
private long rulesVersion; private long rulesVersion;
private OutlineObjectTree outlineObjectTree; private OutlineObjectTree outlineObjectTree;
private TableOfContents tableOfContents;
} }

View File

@ -2,8 +2,11 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.Point2D; import java.awt.geom.Point2D;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.TreeSet;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
@ -16,8 +19,8 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
public class OutlineValidationService { public class OutlineValidationService {
public TableOfContents validateWithToC(List<TextPageBlock> allHeadlines, List<TextPageBlock> headlinesFromOutlines, List<TextPageBlock> newlyClassifiedHeadlines) { public TableOfContents validateWithToC(List<TextPageBlock> allHeadlines, List<TextPageBlock> headlinesFromOutlines, List<TextPageBlock> newlyClassifiedHeadlines) {
TableOfContents validatedToC = createToC(headlinesFromOutlines); TableOfContents validatedToC = createToC(headlinesFromOutlines);
TableOfContents currentToC = createToC(allHeadlines); TableOfContents currentToC = createToC(allHeadlines);
@ -32,7 +35,9 @@ public class OutlineValidationService {
return validatedToC; return validatedToC;
} }
private boolean containsBlock(TableOfContents toc, TextPageBlock block) { private boolean containsBlock(TableOfContents toc, TextPageBlock block) {
for (TableOfContentItem existingItem : toc.getMainSections()) { for (TableOfContentItem existingItem : toc.getMainSections()) {
if (existingItem.getTextPageBlock().equals(block) || existingItem.contains(block)) { if (existingItem.getTextPageBlock().equals(block) || existingItem.contains(block)) {
return true; return true;
@ -41,7 +46,9 @@ public class OutlineValidationService {
return false; return false;
} }
private boolean containsItem(TableOfContents toc, TableOfContentItem tocItem) { private boolean containsItem(TableOfContents toc, TableOfContentItem tocItem) {
for (TableOfContentItem existingItem : toc.getMainSections()) { for (TableOfContentItem existingItem : toc.getMainSections()) {
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) { if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
return true; return true;
@ -50,6 +57,7 @@ public class OutlineValidationService {
return false; return false;
} }
private void addItemAtCorrectPosition(TableOfContents toc, TableOfContentItem tocItem, TableOfContentItem lastHeadlineFromOutlines) { private void addItemAtCorrectPosition(TableOfContents toc, TableOfContentItem tocItem, TableOfContentItem lastHeadlineFromOutlines) {
//if (lastHeadlineFromOutlines == null || tocItem.g) //if (lastHeadlineFromOutlines == null || tocItem.g)
@ -58,20 +66,21 @@ public class OutlineValidationService {
//} //}
} }
public TableOfContents createToC(List<TextPageBlock> headlines) {
public TableOfContents createToCOld(List<TextPageBlock> headlines) {
List<TableOfContentItem> mainSections = new ArrayList<>(); List<TableOfContentItem> mainSections = new ArrayList<>();
int parentDepth = 7; // more than 6 (h6) int parentDepth = 7; // more than 6 (h6)
TableOfContentItem parent = null; TableOfContentItem parent = null;
for (TextPageBlock current : headlines) { for (TextPageBlock current : headlines) {
int currentDepth = getDepth(current.getClassification()); int currentDepth = getDepth(current.getClassification());
if(parentDepth >= currentDepth) { if (parentDepth >= currentDepth) {
parentDepth = currentDepth; parentDepth = currentDepth;
parent = new TableOfContentItem(current); parent = new TableOfContentItem(current);
mainSections.add(parent); mainSections.add(parent);
} else { } else {
assert (parent!=null); assert (parent != null);
while(parentDepth < currentDepth && parent.getParent() != null) { while (parentDepth < currentDepth && parent.getParent() != null) {
parent = parent.getParent(); parent = parent.getParent();
parentDepth = getDepth(parent.getTextPageBlock().getClassification()); parentDepth = getDepth(parent.getTextPageBlock().getClassification());
} }
@ -82,6 +91,46 @@ public class OutlineValidationService {
} }
/**
 * Builds a table-of-contents tree from a list of headline blocks.
 * <p>
 * The headlines are processed in list order (presumably document order - confirm against callers).
 * Each headline is attached to the most recent preceding headline with a strictly smaller depth,
 * where the depth is whatever {@code getDepth} returns for the block's classification. A headline
 * for which no such predecessor exists becomes a main section.
 * <p>
 * Only the currently "open" branch is tracked: as soon as a headline of depth {@code d} is seen,
 * every remembered headline of depth {@code >= d} is forgotten. This prevents a later, deeper
 * headline from being attached to a stale item of an earlier branch (e.g. for the depth sequence
 * 1, 2, 1, 3, 4, 3 the last headline belongs to the second depth-1 headline, not to the depth-2
 * headline below the first one).
 *
 * @param headlines the headline blocks to arrange; an empty list yields a ToC without main sections
 * @return a {@link TableOfContents} holding the top-level items, with nested items added as children
 */
public TableOfContents createToC(List<TextPageBlock> headlines) {

    List<TableOfContentItem> mainSections = new ArrayList<>();
    // Last item seen per depth, restricted to the branch that is currently open.
    Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
    // Sorted view of the depths present in lastItemsPerDepth, used for the "next smaller depth" lookup.
    TreeSet<Integer> depths = new TreeSet<>();

    for (TextPageBlock current : headlines) {

        int currentDepth = getDepth(current.getClassification());
        var tocItem = new TableOfContentItem(current);

        // A headline at this depth closes every previously opened item of the same or a greater depth,
        // so those must no longer be considered as parents for anything that follows.
        depths.tailSet(currentDepth, true).clear();
        lastItemsPerDepth.keySet().removeIf(depth -> depth >= currentDepth);

        // Closest still-open ancestor: the greatest remembered depth strictly below the current one.
        Integer parentDepth = depths.lower(currentDepth);
        if (parentDepth == null) {
            mainSections.add(tocItem);
        } else {
            lastItemsPerDepth.get(parentDepth).addChild(tocItem);
        }

        lastItemsPerDepth.put(currentDepth, tocItem);
        depths.add(currentDepth);
    }

    return new TableOfContents(mainSections);
}
public void updateOutlineObjectTree(OutlineObjectTree outlineObjectTree, List<TextPageBlock> allHeadlines, List<TextPageBlock> newlyClassifiedHeadlines) { public void updateOutlineObjectTree(OutlineObjectTree outlineObjectTree, List<TextPageBlock> allHeadlines, List<TextPageBlock> newlyClassifiedHeadlines) {
List<OutlineObject> newOutlineObjects = newlyClassifiedHeadlines.stream() List<OutlineObject> newOutlineObjects = newlyClassifiedHeadlines.stream()

View File

@ -131,7 +131,7 @@ public class BlockificationPostprocessingService {
} else if (minDistance == distanceToSplitCandidate) { } else if (minDistance == distanceToSplitCandidate) {
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle()); List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle());
splitCandidate.setClassification(headlineType); splitCandidate.setClassification(headlineType);
others.forEach(other -> other.setClassification(headlineType)); others.forEach(other -> other.setClassification(null));
} else { } else {
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination); var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
merged.setClassification(headlineType); merged.setClassification(headlineType);

View File

@ -5,10 +5,8 @@ import static java.util.stream.Collectors.toSet;
import java.awt.geom.Point2D; import java.awt.geom.Point2D;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.ListIterator; import java.util.ListIterator;
import java.util.Set;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
@ -100,7 +98,7 @@ public class DocstrumBlockificationService {
while (itty.hasNext()) { while (itty.hasNext()) {
AbstractPageBlock block = itty.next(); AbstractPageBlock block = itty.next();
if (block instanceof TablePageBlock) { if (block instanceof TablePageBlock || (block.getClassification() != null && block.getClassification().isHeadline())) {
previous = new TextPageBlock(); previous = new TextPageBlock();
continue; continue;
} }
@ -230,21 +228,31 @@ public class DocstrumBlockificationService {
continue; continue;
} }
if(block.getClassification() != null && block.getClassification().isHeadline()) {
continue;
}
TextPageBlock current = (TextPageBlock) block; TextPageBlock current = (TextPageBlock) block;
for (int i = 0; i < blocks.size(); i++) { for (int i = 0; i < blocks.size(); i++) {
if(blocks.get(i) == null){ AbstractPageBlock abstractPageBlock = blocks.get(i);
if(abstractPageBlock == null){
continue; continue;
} }
if (blocks.get(i) == current) { if (abstractPageBlock == current) {
continue; continue;
} }
if (blocks.get(i) instanceof TablePageBlock) { if (abstractPageBlock instanceof TablePageBlock) {
continue; continue;
} }
TextPageBlock inner = (TextPageBlock) blocks.get(i); if(abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) {
continue;
}
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) { if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {

View File

@ -1,13 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification; package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@ -15,6 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.Data; import lombok.Data;
@ -26,8 +22,6 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor @RequiredArgsConstructor
public class RedactManagerClassificationService { public class RedactManagerClassificationService {
private final OutlineValidationService outlineValidationService;
public void classifyDocument(ClassificationDocument document) { public void classifyDocument(ClassificationDocument document) {
@ -35,36 +29,10 @@ public class RedactManagerClassificationService {
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
List<TextPageBlock> headlinesFromOutlines = document.getPages()
.stream()
.flatMap(classificationPage -> classificationPage.getTextBlocks()
.stream()
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.isHeadline())
.map(tb -> (TextPageBlock) tb))
.toList();
HeadLineClassificationContext headLineClassificationContext = new HeadLineClassificationContext(); HeadLineClassificationContext headLineClassificationContext = new HeadLineClassificationContext();
for (ClassificationPage page : document.getPages()) { for (ClassificationPage page : document.getPages()) {
classifyPage(page, document, headlineFontSizes, headLineClassificationContext); classifyPage(page, document, headlineFontSizes, headLineClassificationContext);
} }
List<TextPageBlock> allHeadlines = document.getPages()
.stream()
.flatMap(classificationPage -> classificationPage.getTextBlocks()
.stream()
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
.map(tb -> (TextPageBlock) tb))
.toList();
List<TextPageBlock> newlyClassifiedHeadlines = new ArrayList<>(allHeadlines);
newlyClassifiedHeadlines.removeAll(headlinesFromOutlines);
TableOfContents toC = outlineValidationService.createToC(allHeadlines);
System.out.println(toC);
outlineValidationService.validateWithToC(allHeadlines, headlinesFromOutlines, newlyClassifiedHeadlines);
} }