RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
This commit is contained in:
parent
77ee8dd5bd
commit
09148960cf
@ -28,9 +28,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
@ -43,7 +44,6 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||
@ -97,6 +97,7 @@ public class LayoutParsingPipeline {
|
||||
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||
ClarifyndClassificationService clarifyndClassificationService;
|
||||
OutlineExtractorService outlineExtractorService;
|
||||
OutlineValidationService outlineValidationService;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
@ -339,14 +340,15 @@ public class LayoutParsingPipeline {
|
||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
|
||||
// compute ToC
|
||||
List<AbstractPageBlock> headlines = classificationDocument.getPages()
|
||||
List<TextPageBlock> headlines = classificationDocument.getPages()
|
||||
.stream()
|
||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(tb -> tb.getClassification().isHeadline()))
|
||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification().isHeadline())
|
||||
.map(tb -> (TextPageBlock) tb))
|
||||
.toList();
|
||||
// ???
|
||||
TableOfContents tableOfContents = outlineValidationService.createToC(headlines);
|
||||
classificationDocument.setTableOfContents(tableOfContents);
|
||||
|
||||
log.info("Building Sections for {}", identifier);
|
||||
|
||||
|
||||
@ -4,6 +4,7 @@ import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||
|
||||
@ -28,5 +29,6 @@ public class ClassificationDocument {
|
||||
private long rulesVersion;
|
||||
|
||||
private OutlineObjectTree outlineObjectTree;
|
||||
private TableOfContents tableOfContents;
|
||||
|
||||
}
|
||||
|
||||
@ -2,8 +2,11 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@ -16,8 +19,8 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
public class OutlineValidationService {
|
||||
|
||||
|
||||
public TableOfContents validateWithToC(List<TextPageBlock> allHeadlines, List<TextPageBlock> headlinesFromOutlines, List<TextPageBlock> newlyClassifiedHeadlines) {
|
||||
|
||||
TableOfContents validatedToC = createToC(headlinesFromOutlines);
|
||||
TableOfContents currentToC = createToC(allHeadlines);
|
||||
|
||||
@ -32,7 +35,9 @@ public class OutlineValidationService {
|
||||
return validatedToC;
|
||||
}
|
||||
|
||||
|
||||
private boolean containsBlock(TableOfContents toc, TextPageBlock block) {
|
||||
|
||||
for (TableOfContentItem existingItem : toc.getMainSections()) {
|
||||
if (existingItem.getTextPageBlock().equals(block) || existingItem.contains(block)) {
|
||||
return true;
|
||||
@ -41,7 +46,9 @@ public class OutlineValidationService {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private boolean containsItem(TableOfContents toc, TableOfContentItem tocItem) {
|
||||
|
||||
for (TableOfContentItem existingItem : toc.getMainSections()) {
|
||||
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
|
||||
return true;
|
||||
@ -50,6 +57,7 @@ public class OutlineValidationService {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private void addItemAtCorrectPosition(TableOfContents toc, TableOfContentItem tocItem, TableOfContentItem lastHeadlineFromOutlines) {
|
||||
|
||||
//if (lastHeadlineFromOutlines == null || tocItem.g)
|
||||
@ -58,20 +66,21 @@ public class OutlineValidationService {
|
||||
//}
|
||||
}
|
||||
|
||||
public TableOfContents createToC(List<TextPageBlock> headlines) {
|
||||
|
||||
public TableOfContents createToCOld(List<TextPageBlock> headlines) {
|
||||
|
||||
List<TableOfContentItem> mainSections = new ArrayList<>();
|
||||
int parentDepth = 7; // more than 6 (h6)
|
||||
TableOfContentItem parent = null;
|
||||
for (TextPageBlock current : headlines) {
|
||||
int currentDepth = getDepth(current.getClassification());
|
||||
if(parentDepth >= currentDepth) {
|
||||
if (parentDepth >= currentDepth) {
|
||||
parentDepth = currentDepth;
|
||||
parent = new TableOfContentItem(current);
|
||||
mainSections.add(parent);
|
||||
} else {
|
||||
assert (parent!=null);
|
||||
while(parentDepth < currentDepth && parent.getParent() != null) {
|
||||
assert (parent != null);
|
||||
while (parentDepth < currentDepth && parent.getParent() != null) {
|
||||
parent = parent.getParent();
|
||||
parentDepth = getDepth(parent.getTextPageBlock().getClassification());
|
||||
}
|
||||
@ -82,6 +91,46 @@ public class OutlineValidationService {
|
||||
|
||||
}
|
||||
|
||||
|
||||
public TableOfContents createToC(List<TextPageBlock> headlines) {
|
||||
|
||||
List<TableOfContentItem> mainSections = new ArrayList<>();
|
||||
Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
|
||||
TableOfContentItem last = null;
|
||||
TreeSet<Integer> depths = new TreeSet<>();
|
||||
|
||||
for (TextPageBlock current : headlines) {
|
||||
int currentDepth = getDepth(current.getClassification());
|
||||
Integer parentDepth = depths.floor(currentDepth - 1);
|
||||
|
||||
var tocItem = new TableOfContentItem(current);
|
||||
|
||||
if (parentDepth == null) {
|
||||
mainSections.add(tocItem);
|
||||
|
||||
} else {
|
||||
assert last != null;
|
||||
int lastDepth = getDepth(last.getTextPageBlock().getClassification());
|
||||
|
||||
if (lastDepth < parentDepth) {
|
||||
parentDepth = lastDepth;
|
||||
} else if (lastDepth == currentDepth && last.getParent() != null) {
|
||||
parentDepth = getDepth(last.getParent().getTextPageBlock().getClassification());
|
||||
}
|
||||
|
||||
TableOfContentItem parent = lastItemsPerDepth.get(parentDepth);
|
||||
parent.addChild(tocItem);
|
||||
}
|
||||
|
||||
last = tocItem;
|
||||
lastItemsPerDepth.put(currentDepth, tocItem);
|
||||
depths.add(currentDepth);
|
||||
}
|
||||
|
||||
return new TableOfContents(mainSections);
|
||||
}
|
||||
|
||||
|
||||
public void updateOutlineObjectTree(OutlineObjectTree outlineObjectTree, List<TextPageBlock> allHeadlines, List<TextPageBlock> newlyClassifiedHeadlines) {
|
||||
|
||||
List<OutlineObject> newOutlineObjects = newlyClassifiedHeadlines.stream()
|
||||
|
||||
@ -131,7 +131,7 @@ public class BlockificationPostprocessingService {
|
||||
} else if (minDistance == distanceToSplitCandidate) {
|
||||
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle());
|
||||
splitCandidate.setClassification(headlineType);
|
||||
others.forEach(other -> other.setClassification(headlineType));
|
||||
others.forEach(other -> other.setClassification(null));
|
||||
} else {
|
||||
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
|
||||
merged.setClassification(headlineType);
|
||||
|
||||
@ -5,10 +5,8 @@ import static java.util.stream.Collectors.toSet;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@ -100,7 +98,7 @@ public class DocstrumBlockificationService {
|
||||
while (itty.hasNext()) {
|
||||
|
||||
AbstractPageBlock block = itty.next();
|
||||
if (block instanceof TablePageBlock) {
|
||||
if (block instanceof TablePageBlock || (block.getClassification() != null && block.getClassification().isHeadline())) {
|
||||
previous = new TextPageBlock();
|
||||
continue;
|
||||
}
|
||||
@ -230,21 +228,31 @@ public class DocstrumBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if(block.getClassification() != null && block.getClassification().isHeadline()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPageBlock current = (TextPageBlock) block;
|
||||
|
||||
for (int i = 0; i < blocks.size(); i++) {
|
||||
|
||||
if(blocks.get(i) == null){
|
||||
AbstractPageBlock abstractPageBlock = blocks.get(i);
|
||||
if(abstractPageBlock == null){
|
||||
continue;
|
||||
}
|
||||
if (blocks.get(i) == current) {
|
||||
if (abstractPageBlock == current) {
|
||||
continue;
|
||||
}
|
||||
if (blocks.get(i) instanceof TablePageBlock) {
|
||||
if (abstractPageBlock instanceof TablePageBlock) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPageBlock inner = (TextPageBlock) blocks.get(i);
|
||||
if(abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
|
||||
|
||||
|
||||
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
|
||||
|
||||
|
||||
@ -1,13 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
@ -15,6 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.Data;
|
||||
@ -26,8 +22,6 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class RedactManagerClassificationService {
|
||||
|
||||
private final OutlineValidationService outlineValidationService;
|
||||
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
@ -35,36 +29,10 @@ public class RedactManagerClassificationService {
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
List<TextPageBlock> headlinesFromOutlines = document.getPages()
|
||||
.stream()
|
||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.isHeadline())
|
||||
.map(tb -> (TextPageBlock) tb))
|
||||
.toList();
|
||||
|
||||
|
||||
HeadLineClassificationContext headLineClassificationContext = new HeadLineClassificationContext();
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
classifyPage(page, document, headlineFontSizes, headLineClassificationContext);
|
||||
}
|
||||
|
||||
List<TextPageBlock> allHeadlines = document.getPages()
|
||||
.stream()
|
||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
||||
.map(tb -> (TextPageBlock) tb))
|
||||
.toList();
|
||||
|
||||
List<TextPageBlock> newlyClassifiedHeadlines = new ArrayList<>(allHeadlines);
|
||||
newlyClassifiedHeadlines.removeAll(headlinesFromOutlines);
|
||||
|
||||
TableOfContents toC = outlineValidationService.createToC(allHeadlines);
|
||||
System.out.println(toC);
|
||||
|
||||
outlineValidationService.validateWithToC(allHeadlines, headlinesFromOutlines, newlyClassifiedHeadlines);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user