RED-7074: Design Subsection section tree structure algorithm

* added SuperSection and changed the logic so that each normal section contains only leaf nodes
* added SectionIdentifier logic for headline splitting and merging
* fixed many edge cases that left files in an error state
This commit is contained in:
maverickstuder 2024-05-14 10:51:05 +02:00
parent 4e07ba4ff1
commit 2fcaeb3d8c
10 changed files with 101 additions and 53 deletions

View File

@ -8,6 +8,7 @@ import java.util.regex.Pattern;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
@AllArgsConstructor
@ -16,13 +17,15 @@ public class SectionIdentifier {
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
// Kind of identifier held in the enclosing class's `format` field.
// NOTE(review): both a private and a public header line are shown for this enum; this looks like the removed
// and the added line of a visibility change rendered together — confirm against the real file.
private enum Format {
public enum Format {
// BlockificationPostprocessingService only uses the identifier string when the format is not EMPTY;
// presumably this means "no identifier found" — TODO confirm.
EMPTY,
// presumably an identifier matched by numericalIdentifierPattern (up to four number groups) — TODO confirm.
NUMERICAL,
// NOTE(review): meaning not visible in this view — confirm where this value is assigned.
DOCUMENT
}
@Getter
Format format;
@Getter
String identifierString;
List<Integer> identifiers;
boolean asChild;

View File

@ -20,11 +20,12 @@ import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Data
@Builder
@SuperBuilder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Section implements GenericSemanticNode {
@ -44,10 +45,6 @@ public class Section implements GenericSemanticNode {
@EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache;
@EqualsAndHashCode.Exclude
boolean isMainSection;
@Override
public NodeType getType() {

View File

@ -0,0 +1,33 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.SuperBuilder;
/**
 * A {@link Section} subtype that declares no fields or behaviour of its own.
 * <p>
 * In this change, {@code SectionNodeFactory.addSection} builds a {@code SuperSection} instead of a plain
 * {@code Section} whenever it is called with {@code isLeaf == false}.
 */
@Data
@SuperBuilder
@EqualsAndHashCode(callSuper = true)
public class SuperSection extends Section {

    /**
     * Creates a super section by passing every argument, unchanged and in the same order, to the
     * {@link Section} constructor.
     */
    public SuperSection(Set<LayoutEngine> engines, List<Integer> treeId, TextBlock textBlock, DocumentTree documentTree, Set<RedactionEntity> entities, Map<Page, Rectangle2D> bBoxCache) {

        super(engines, treeId, textBlock, documentTree, entities, bBoxCache);
    }

}

View File

@ -9,10 +9,12 @@ import java.util.ListIterator;
import java.util.Locale;
import java.util.function.Function;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -125,7 +127,7 @@ public class BlockificationPostprocessingService {
if (minDistance == distanceToDirectMatch) {
directMatch.setClassification(headlineType);
} else if (minDistance == distanceToSplitCandidate) {
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle());
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier + outlineObject.getTitle());
splitCandidate.setClassification(headlineType);
others.forEach(other -> other.setClassification(null));
} else {
@ -279,38 +281,6 @@ public class BlockificationPostprocessingService {
}
/**
 * Searches {@code textPositionSequences} for a run of consecutive sequences whose sanitized, concatenated
 * text equals the sanitized {@code text}.
 * <p>
 * The list is scanned once with a sliding window: each sequence is appended to the window and, while the
 * window is longer than the target, the oldest sequences are dropped from its front and recorded as
 * preceding the match.
 *
 * @param textPositionSequences the sequences to scan, in order
 * @param text                  the text to look for; both sides are compared after {@code sanitizeString}
 * @return on a match, the matching sequences ({@code inSequence}) together with every sequence dropped
 *         before them ({@code preSequence}); two empty lists if no window ever equals the target
 */
private static WordSequenceResultOld findWordSequenceOld(List<TextPositionSequence> textPositionSequences, String text) {

    String target = sanitizeString(text);
    List<TextPositionSequence> inSequence = new ArrayList<>();
    List<TextPositionSequence> preSequence = new ArrayList<>();
    StringBuilder currentSequence = new StringBuilder();

    for (TextPositionSequence sequence : textPositionSequences) {
        currentSequence.append(sanitizeString(sequence.toString()));
        inSequence.add(sequence);

        // Invariant: currentSequence is exactly the sanitized text of inSequence, so a window that is longer
        // than the target always contains at least one sequence and remove(0) cannot fail.
        while (currentSequence.length() > target.length()) {
            TextPositionSequence removed = inSequence.remove(0);
            // Trim by the sanitized length, because that is what was appended for this sequence. Using the
            // raw toString() length would cut into the following sequences whenever sanitizing changes the
            // length (whitespace removed, case mapping).
            currentSequence.delete(0, sanitizeString(removed.toString()).length());
            preSequence.add(removed);
        }

        if (currentSequence.toString().equals(target)) {
            return new WordSequenceResultOld(inSequence, preSequence);
        }
    }

    return new WordSequenceResultOld(new ArrayList<>(), new ArrayList<>());
}
private TextPageBlock mergeBlocks(ClassificationPage classificationPage, List<TextPageBlock> blocksToMerge) {
TextPageBlock firstBlock = blocksToMerge.get(0);
@ -405,8 +375,21 @@ public class BlockificationPostprocessingService {
context.mergeCandidates.add(pageBlock);
}
if (blockTextContainsOutlineTitle && context.splitCandidate == null) {
context.splitCandidate = pageBlock;
if (blockTextContainsOutlineTitle) {
SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText);
if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY) {
if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) {
context.directMatch = pageBlock;
return true;
} else if (context.splitCandidate == null) {
context.sectionIdentifier = sectionIdentifier.getIdentifierString();
}
}
if (context.splitCandidate == null) {
context.splitCandidate = pageBlock;
}
}
return false;
}
@ -414,14 +397,10 @@ public class BlockificationPostprocessingService {
/**
 * Normalizes text for comparison: strips whitespace and lower-cases the result with {@link Locale#ROOT}.
 *
 * @param text the text to normalize
 * @return {@code text} without whitespace, in lower case
 */
private static String sanitizeString(String text) {
// NOTE(review): two return statements are shown here; this looks like the removed and the added line of the
// change rendered together, so only one of them should exist in the real file — confirm.
// Regex-based variant: removes every character matched by \s.
return text.replaceAll("\\s", "").toLowerCase(Locale.ROOT);
// StringUtils-based variant; presumably not identical to \s for all Unicode whitespace — TODO confirm.
return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT);
}
/**
 * Result of {@code findWordSequenceOld}.
 *
 * @param inSequence  the consecutive sequences whose sanitized text matched the target; empty if nothing matched
 * @param preSequence the sequences that were dropped from the search window before the match; empty if
 *                    nothing matched
 */
private record WordSequenceResultOld(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence) {
}
@Data
private static class OutlineProcessionContext {
@ -429,6 +408,7 @@ public class BlockificationPostprocessingService {
private OutlineObject outlineObject;
private List<TextPageBlock> mergeCandidates;
private TextPageBlock splitCandidate;
private String sectionIdentifier;
public OutlineProcessionContext(OutlineObject outlineObject) {
@ -437,6 +417,7 @@ public class BlockificationPostprocessingService {
this.directMatch = null;
this.mergeCandidates = new ArrayList<>();
this.splitCandidate = null;
this.sectionIdentifier = "";
}
}

View File

@ -90,7 +90,7 @@ public class DocstrumBlockificationService {
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if (block instanceof TablePageBlock || (block.getClassification() != null && block.getClassification().isHeadline())) {
if (block instanceof TablePageBlock || previous.isHeadline()) {
previous = new TextPageBlock();
continue;
}
@ -98,7 +98,7 @@ public class DocstrumBlockificationService {
if (previous != null && !previous.getSequences().isEmpty()) {
if (current.getDir() != previous.getDir()) {
if (current.getDir() != previous.getDir() || current.isHeadline()) {
previous = current;
continue;
}
@ -162,6 +162,9 @@ public class DocstrumBlockificationService {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
previous.setToDuplicate(toDuplicate);
if(current.getClassification() != null && previous.getClassification() == null) {
previous.setClassification(current.getClassification());
}
itty.remove();
itty.previous();
itty.set(previous);

View File

@ -78,7 +78,13 @@ public class DocumentGraphFactory {
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
Optional<Section> section = SectionNodeFactory.addSection(layoutParsingType, parent, tocItem.getNonEmptySectionBlocks(), tocItem.getImages(), context, document);
Optional<Section> section = SectionNodeFactory.addSection(layoutParsingType,
parent,
tocItem.getChildren().isEmpty(),
tocItem.getNonEmptySectionBlocks(),
tocItem.getImages(),
context,
document);
tocItem.setSection(section.orElse(null));
}
}

View File

@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -30,6 +31,7 @@ public class SectionNodeFactory {
public Optional<Section> addSection(LayoutParsingType layoutParsingType,
GenericSemanticNode parentNode,
boolean isLeaf,
List<AbstractPageBlock> pageBlocks,
List<ClassifiedImage> images,
DocumentGraphFactory.Context context,
@ -50,7 +52,13 @@ public class SectionNodeFactory {
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
.collect(groupingBy(AbstractPageBlock::getPage));
Section section = Section.builder().isMainSection(parentNode == null).documentTree(context.getDocumentTree()).build();
Section section;
if (isLeaf) {
section = Section.builder().documentTree(context.getDocumentTree()).build();
} else {
section = SuperSection.builder().documentTree(context.getDocumentTree()).build();
}
context.getSections().add(section);
blocksPerPage.keySet()
@ -60,12 +68,24 @@ public class SectionNodeFactory {
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document);
if (containsTablesAndTextBlocks(pageBlocks)) {
if (pageBlocks.get(0).isHeadline()) {
pageBlocks.remove(0);
}
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
section,
true,
subSectionPageBlocks,
emptyList(),
context,
document));
} else if (!isLeaf) {
if (pageBlocks.get(0).isHeadline()) {
pageBlocks.remove(0);
}
addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document);
} else {
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
}

View File

@ -146,6 +146,7 @@ public class TableNodeFactory {
} else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(layoutParsingType,
tableCell,
true,
cell.getTextBlocks()
.stream()
.map(tb -> (AbstractPageBlock) tb)

View File

@ -179,10 +179,11 @@ public class LayoutGridService {
Map<Page, Rectangle2D> bBoxMap = semanticNode.getBBox();
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION).toList();
Page firstPage = semanticNode.getFirstPage();
String treeIdString = buildTreeIdString(semanticNode);
if (!subSections.isEmpty()) {
addPlacedText(firstPage, bBoxMap.get(firstPage), buildTreeIdString(semanticNode), layoutGrid);
addPlacedText(firstPage, bBoxMap.get(firstPage), treeIdString, layoutGrid);
} else {
bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, buildTreeIdString(semanticNode), layoutGrid)));
bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, treeIdString, layoutGrid)));
}
if (bBoxMap.values().size() == 1) {
Rectangle2D r = RectangleTransformations.pad(bBoxMap.get(firstPage), LINE_WIDTH, LINE_WIDTH);

View File

@ -83,6 +83,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
public void testViewerDocument() {
//String fileName = "files/new/UTT-Books-53.pdf";
String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
//String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf";
//String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf";
@ -95,7 +99,6 @@ public class ViewerDocumentTest extends BuildDocumentTest {
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
//String fileName = "files/new/mistitled_outlines_example.pdf";
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
String fileName = "files/new/UTT-Books-53.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();