RED-7074: Design Subsection section tree structure algorithm

* first draft: further implementations
This commit is contained in:
maverickstuder 2024-04-19 11:31:34 +02:00
parent 77ee8dd5bd
commit 09148960cf
6 changed files with 81 additions and 52 deletions

View File

@ -28,9 +28,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -43,7 +44,6 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
@ -97,6 +97,7 @@ public class LayoutParsingPipeline {
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
ClarifyndClassificationService clarifyndClassificationService;
OutlineExtractorService outlineExtractorService;
OutlineValidationService outlineValidationService;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -339,14 +340,15 @@ public class LayoutParsingPipeline {
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
}
// compute ToC
List<AbstractPageBlock> headlines = classificationDocument.getPages()
List<TextPageBlock> headlines = classificationDocument.getPages()
.stream()
.flatMap(classificationPage -> classificationPage.getTextBlocks()
.stream()
.filter(tb -> tb.getClassification().isHeadline()))
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification().isHeadline())
.map(tb -> (TextPageBlock) tb))
.toList();
// build the table of contents from the collected headlines and attach it to the document
TableOfContents tableOfContents = outlineValidationService.createToC(headlines);
classificationDocument.setTableOfContents(tableOfContents);
log.info("Building Sections for {}", identifier);

View File

@ -4,6 +4,7 @@ import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
@ -28,5 +29,6 @@ public class ClassificationDocument {
private long rulesVersion;
private OutlineObjectTree outlineObjectTree;
private TableOfContents tableOfContents;
}

View File

@ -2,8 +2,11 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.TreeSet;
import org.springframework.stereotype.Service;
@ -16,8 +19,8 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
public class OutlineValidationService {
public TableOfContents validateWithToC(List<TextPageBlock> allHeadlines, List<TextPageBlock> headlinesFromOutlines, List<TextPageBlock> newlyClassifiedHeadlines) {
TableOfContents validatedToC = createToC(headlinesFromOutlines);
TableOfContents currentToC = createToC(allHeadlines);
@ -32,7 +35,9 @@ public class OutlineValidationService {
return validatedToC;
}
private boolean containsBlock(TableOfContents toc, TextPageBlock block) {
for (TableOfContentItem existingItem : toc.getMainSections()) {
if (existingItem.getTextPageBlock().equals(block) || existingItem.contains(block)) {
return true;
@ -41,7 +46,9 @@ public class OutlineValidationService {
return false;
}
private boolean containsItem(TableOfContents toc, TableOfContentItem tocItem) {
for (TableOfContentItem existingItem : toc.getMainSections()) {
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
return true;
@ -50,6 +57,7 @@ public class OutlineValidationService {
return false;
}
private void addItemAtCorrectPosition(TableOfContents toc, TableOfContentItem tocItem, TableOfContentItem lastHeadlineFromOutlines) {
//if (lastHeadlineFromOutlines == null || tocItem.g)
@ -58,20 +66,21 @@ public class OutlineValidationService {
//}
}
public TableOfContents createToC(List<TextPageBlock> headlines) {
public TableOfContents createToCOld(List<TextPageBlock> headlines) {
List<TableOfContentItem> mainSections = new ArrayList<>();
int parentDepth = 7; // more than 6 (h6)
TableOfContentItem parent = null;
for (TextPageBlock current : headlines) {
int currentDepth = getDepth(current.getClassification());
if(parentDepth >= currentDepth) {
if (parentDepth >= currentDepth) {
parentDepth = currentDepth;
parent = new TableOfContentItem(current);
mainSections.add(parent);
} else {
assert (parent!=null);
while(parentDepth < currentDepth && parent.getParent() != null) {
assert (parent != null);
while (parentDepth < currentDepth && parent.getParent() != null) {
parent = parent.getParent();
parentDepth = getDepth(parent.getTextPageBlock().getClassification());
}
@ -82,6 +91,46 @@ public class OutlineValidationService {
}
/**
 * Builds the table-of-contents tree from a flat list of headline blocks.
 * <p>
 * The headlines are processed in list order (presumably document order - confirm against callers).
 * Each headline becomes a child of the most recently seen headline whose depth, as reported by
 * {@code getDepth}, is strictly smaller than its own. Depth levels may be skipped, e.g. a depth-3
 * headline directly below a depth-1 headline. A headline without such an ancestor becomes a main section.
 * <p>
 * Only headlines on the currently open branch are eligible as parents: as soon as a headline of
 * depth {@code d} is placed, every tracked headline deeper than {@code d} is forgotten. Without this
 * pruning a later headline could be attached to a sub-section of an earlier, already closed main
 * section (e.g. H1, H2, H3, H1', H4, H5, H4b would hang H4b below the H3 of the first H1).
 *
 * @param headlines the headline blocks to arrange into a tree
 * @return a table of contents whose main sections are the headlines without an open ancestor
 */
public TableOfContents createToC(List<TextPageBlock> headlines) {

    List<TableOfContentItem> mainSections = new ArrayList<>();
    // Most recent item per depth, restricted to the branch that is currently open.
    Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
    // Sorted view of the depths present in lastItemsPerDepth, used for the "next smaller depth" lookup.
    TreeSet<Integer> depths = new TreeSet<>();

    for (TextPageBlock current : headlines) {
        int currentDepth = getDepth(current.getClassification());
        var tocItem = new TableOfContentItem(current);

        // A headline of this depth closes every deeper section that was still open,
        // so those must no longer be considered as parents for anything that follows.
        depths.tailSet(currentDepth, false).clear();
        lastItemsPerDepth.keySet().removeIf(depth -> depth > currentDepth);

        // Nearest open ancestor: the greatest tracked depth strictly below the current one.
        Integer parentDepth = depths.lower(currentDepth);
        if (parentDepth == null) {
            mainSections.add(tocItem);
        } else {
            lastItemsPerDepth.get(parentDepth).addChild(tocItem);
        }

        // The new item replaces any previous sibling as the open section of its depth.
        lastItemsPerDepth.put(currentDepth, tocItem);
        depths.add(currentDepth);
    }
    return new TableOfContents(mainSections);
}
public void updateOutlineObjectTree(OutlineObjectTree outlineObjectTree, List<TextPageBlock> allHeadlines, List<TextPageBlock> newlyClassifiedHeadlines) {
List<OutlineObject> newOutlineObjects = newlyClassifiedHeadlines.stream()

View File

@ -131,7 +131,7 @@ public class BlockificationPostprocessingService {
} else if (minDistance == distanceToSplitCandidate) {
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle());
splitCandidate.setClassification(headlineType);
others.forEach(other -> other.setClassification(headlineType));
others.forEach(other -> other.setClassification(null));
} else {
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
merged.setClassification(headlineType);

View File

@ -5,10 +5,8 @@ import static java.util.stream.Collectors.toSet;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;
import org.springframework.stereotype.Service;
@ -100,7 +98,7 @@ public class DocstrumBlockificationService {
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if (block instanceof TablePageBlock) {
if (block instanceof TablePageBlock || (block.getClassification() != null && block.getClassification().isHeadline())) {
previous = new TextPageBlock();
continue;
}
@ -230,21 +228,31 @@ public class DocstrumBlockificationService {
continue;
}
if(block.getClassification() != null && block.getClassification().isHeadline()) {
continue;
}
TextPageBlock current = (TextPageBlock) block;
for (int i = 0; i < blocks.size(); i++) {
if(blocks.get(i) == null){
AbstractPageBlock abstractPageBlock = blocks.get(i);
if(abstractPageBlock == null){
continue;
}
if (blocks.get(i) == current) {
if (abstractPageBlock == current) {
continue;
}
if (blocks.get(i) instanceof TablePageBlock) {
if (abstractPageBlock instanceof TablePageBlock) {
continue;
}
TextPageBlock inner = (TextPageBlock) blocks.get(i);
if(abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) {
continue;
}
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {

View File

@ -1,13 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@ -15,6 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.Data;
@ -26,8 +22,6 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class RedactManagerClassificationService {
private final OutlineValidationService outlineValidationService;
public void classifyDocument(ClassificationDocument document) {
@ -35,36 +29,10 @@ public class RedactManagerClassificationService {
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
List<TextPageBlock> headlinesFromOutlines = document.getPages()
.stream()
.flatMap(classificationPage -> classificationPage.getTextBlocks()
.stream()
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.isHeadline())
.map(tb -> (TextPageBlock) tb))
.toList();
HeadLineClassificationContext headLineClassificationContext = new HeadLineClassificationContext();
for (ClassificationPage page : document.getPages()) {
classifyPage(page, document, headlineFontSizes, headLineClassificationContext);
}
List<TextPageBlock> allHeadlines = document.getPages()
.stream()
.flatMap(classificationPage -> classificationPage.getTextBlocks()
.stream()
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
.map(tb -> (TextPageBlock) tb))
.toList();
List<TextPageBlock> newlyClassifiedHeadlines = new ArrayList<>(allHeadlines);
newlyClassifiedHeadlines.removeAll(headlinesFromOutlines);
TableOfContents toC = outlineValidationService.createToC(allHeadlines);
System.out.println(toC);
outlineValidationService.validateWithToC(allHeadlines, headlinesFromOutlines, newlyClassifiedHeadlines);
}