RED-7074: Design Subsection section tree structure algorithm

* added SuperSection and changed the logic so that each normal Section contains only leaf nodes
* added SectionIdentifier logic for headline splitting and merging
* fixed many edge cases that left files in an error state
This commit is contained in:
maverickstuder 2024-05-14 10:51:05 +02:00
parent 4e07ba4ff1
commit 2fcaeb3d8c
10 changed files with 101 additions and 53 deletions

View File

@ -8,6 +8,7 @@ import java.util.regex.Pattern;
import lombok.AccessLevel; import lombok.AccessLevel;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.FieldDefaults; import lombok.experimental.FieldDefaults;
@AllArgsConstructor @AllArgsConstructor
@ -16,13 +17,15 @@ public class SectionIdentifier {
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?"); static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
private enum Format { public enum Format {
EMPTY, EMPTY,
NUMERICAL, NUMERICAL,
DOCUMENT DOCUMENT
} }
@Getter
Format format; Format format;
@Getter
String identifierString; String identifierString;
List<Integer> identifiers; List<Integer> identifiers;
boolean asChild; boolean asChild;

View File

@ -20,11 +20,12 @@ import lombok.Builder;
import lombok.Data; import lombok.Data;
import lombok.EqualsAndHashCode; import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults; import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@Data @Data
@Builder @SuperBuilder
@AllArgsConstructor @AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE) @FieldDefaults(level = AccessLevel.PRIVATE)
public class Section implements GenericSemanticNode { public class Section implements GenericSemanticNode {
@ -44,10 +45,6 @@ public class Section implements GenericSemanticNode {
@EqualsAndHashCode.Exclude @EqualsAndHashCode.Exclude
Map<Page, Rectangle2D> bBoxCache; Map<Page, Rectangle2D> bBoxCache;
@EqualsAndHashCode.Exclude
boolean isMainSection;
@Override @Override
public NodeType getType() { public NodeType getType() {

View File

@ -0,0 +1,33 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.SuperBuilder;
/**
 * {@link Section} subtype that {@code SectionNodeFactory.addSection} builds when it is
 * called with {@code isLeaf == false}. It declares no fields of its own; all state lives
 * in {@link Section}.
 *
 * <p>Because of {@code callSuper = true}, the generated {@code equals}/{@code hashCode}
 * also take the inherited {@link Section} state into account.
 */
@Data
@SuperBuilder
@EqualsAndHashCode(callSuper = true)
public class SuperSection extends Section {

    /**
     * Creates an instance by passing every argument, unchanged and in the same order,
     * to the {@link Section} constructor.
     */
    public SuperSection(
            Set<LayoutEngine> engines,
            List<Integer> treeId,
            TextBlock textBlock,
            DocumentTree documentTree,
            Set<RedactionEntity> entities,
            Map<Page, Rectangle2D> bBoxCache) {

        super(engines, treeId, textBlock, documentTree, entities, bBoxCache);
    }

}

View File

@ -9,10 +9,12 @@ import java.util.ListIterator;
import java.util.Locale; import java.util.Locale;
import java.util.function.Function; import java.util.function.Function;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -125,7 +127,7 @@ public class BlockificationPostprocessingService {
if (minDistance == distanceToDirectMatch) { if (minDistance == distanceToDirectMatch) {
directMatch.setClassification(headlineType); directMatch.setClassification(headlineType);
} else if (minDistance == distanceToSplitCandidate) { } else if (minDistance == distanceToSplitCandidate) {
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle()); List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier + outlineObject.getTitle());
splitCandidate.setClassification(headlineType); splitCandidate.setClassification(headlineType);
others.forEach(other -> other.setClassification(null)); others.forEach(other -> other.setClassification(null));
} else { } else {
@ -279,38 +281,6 @@ public class BlockificationPostprocessingService {
} }
/**
 * Searches {@code textPositionSequences} for a run of consecutive sequences whose
 * sanitized concatenation equals the sanitized {@code text}.
 *
 * <p>A sliding window is maintained: every sequence is appended to the window, and
 * sequences are dropped from the front (and collected as "pre" sequences) for as long
 * as the window is longer than the target.
 *
 * @param textPositionSequences the sequences to scan, in order
 * @param text                  the text to look for; compared after {@code sanitizeString}
 * @return the matching window together with all sequences dropped before it, or a result
 *         holding two empty lists when no window matches
 */
private static WordSequenceResultOld findWordSequenceOld(List<TextPositionSequence> textPositionSequences, String text) {

    String target = sanitizeString(text);
    List<TextPositionSequence> inSequence = new ArrayList<>();
    List<TextPositionSequence> preSequence = new ArrayList<>();
    StringBuilder currentSequence = new StringBuilder();

    for (TextPositionSequence sequence : textPositionSequences) {
        currentSequence.append(sanitizeString(sequence.toString()));
        inSequence.add(sequence);

        // Shrink the window from the front until it no longer exceeds the target length.
        while (currentSequence.length() > target.length()) {
            TextPositionSequence removed = inSequence.remove(0);
            // The buffer contains sanitized text, so the sanitized length must be removed;
            // using the raw length would cut into the following sequence whenever
            // sanitizing changes the length, desynchronising buffer and window.
            currentSequence.delete(0, sanitizeString(removed.toString()).length());
            preSequence.add(removed);
        }

        if (currentSequence.toString().equals(target)) {
            return new WordSequenceResultOld(inSequence, preSequence);
        }
    }

    return new WordSequenceResultOld(new ArrayList<>(), new ArrayList<>());
}
private TextPageBlock mergeBlocks(ClassificationPage classificationPage, List<TextPageBlock> blocksToMerge) { private TextPageBlock mergeBlocks(ClassificationPage classificationPage, List<TextPageBlock> blocksToMerge) {
TextPageBlock firstBlock = blocksToMerge.get(0); TextPageBlock firstBlock = blocksToMerge.get(0);
@ -405,8 +375,21 @@ public class BlockificationPostprocessingService {
context.mergeCandidates.add(pageBlock); context.mergeCandidates.add(pageBlock);
} }
if (blockTextContainsOutlineTitle && context.splitCandidate == null) { if (blockTextContainsOutlineTitle) {
context.splitCandidate = pageBlock; SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText);
if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY) {
if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) {
context.directMatch = pageBlock;
return true;
} else if (context.splitCandidate == null) {
context.sectionIdentifier = sectionIdentifier.getIdentifierString();
}
}
if (context.splitCandidate == null) {
context.splitCandidate = pageBlock;
}
} }
return false; return false;
} }
@ -414,14 +397,10 @@ public class BlockificationPostprocessingService {
private static String sanitizeString(String text) { private static String sanitizeString(String text) {
return text.replaceAll("\\s", "").toLowerCase(Locale.ROOT); return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT);
} }
/**
 * Result holder returned by {@code findWordSequenceOld}.
 *
 * @param inSequence  the sequences forming the matched window (empty when nothing matched)
 * @param preSequence the sequences that were dropped from the window before the match
 */
private record WordSequenceResultOld(
        List<TextPositionSequence> inSequence,
        List<TextPositionSequence> preSequence) {

}
@Data @Data
private static class OutlineProcessionContext { private static class OutlineProcessionContext {
@ -429,6 +408,7 @@ public class BlockificationPostprocessingService {
private OutlineObject outlineObject; private OutlineObject outlineObject;
private List<TextPageBlock> mergeCandidates; private List<TextPageBlock> mergeCandidates;
private TextPageBlock splitCandidate; private TextPageBlock splitCandidate;
private String sectionIdentifier;
public OutlineProcessionContext(OutlineObject outlineObject) { public OutlineProcessionContext(OutlineObject outlineObject) {
@ -437,6 +417,7 @@ public class BlockificationPostprocessingService {
this.directMatch = null; this.directMatch = null;
this.mergeCandidates = new ArrayList<>(); this.mergeCandidates = new ArrayList<>();
this.splitCandidate = null; this.splitCandidate = null;
this.sectionIdentifier = "";
} }
} }

View File

@ -90,7 +90,7 @@ public class DocstrumBlockificationService {
while (itty.hasNext()) { while (itty.hasNext()) {
AbstractPageBlock block = itty.next(); AbstractPageBlock block = itty.next();
if (block instanceof TablePageBlock || (block.getClassification() != null && block.getClassification().isHeadline())) { if (block instanceof TablePageBlock || previous.isHeadline()) {
previous = new TextPageBlock(); previous = new TextPageBlock();
continue; continue;
} }
@ -98,7 +98,7 @@ public class DocstrumBlockificationService {
if (previous != null && !previous.getSequences().isEmpty()) { if (previous != null && !previous.getSequences().isEmpty()) {
if (current.getDir() != previous.getDir()) { if (current.getDir() != previous.getDir() || current.isHeadline()) {
previous = current; previous = current;
continue; continue;
} }
@ -162,6 +162,9 @@ public class DocstrumBlockificationService {
previous.getSequences().addAll(current.getSequences()); previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0); previous = buildTextBlock(previous.getSequences(), 0);
previous.setToDuplicate(toDuplicate); previous.setToDuplicate(toDuplicate);
if(current.getClassification() != null && previous.getClassification() == null) {
previous.setClassification(current.getClassification());
}
itty.remove(); itty.remove();
itty.previous(); itty.previous();
itty.set(previous); itty.set(previous);

View File

@ -78,7 +78,13 @@ public class DocumentGraphFactory {
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) { for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection(); var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
Optional<Section> section = SectionNodeFactory.addSection(layoutParsingType, parent, tocItem.getNonEmptySectionBlocks(), tocItem.getImages(), context, document); Optional<Section> section = SectionNodeFactory.addSection(layoutParsingType,
parent,
tocItem.getChildren().isEmpty(),
tocItem.getNonEmptySectionBlocks(),
tocItem.getImages(),
context,
document);
tocItem.setSection(section.orElse(null)); tocItem.setSection(section.orElse(null));
} }
} }

View File

@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -30,6 +31,7 @@ public class SectionNodeFactory {
public Optional<Section> addSection(LayoutParsingType layoutParsingType, public Optional<Section> addSection(LayoutParsingType layoutParsingType,
GenericSemanticNode parentNode, GenericSemanticNode parentNode,
boolean isLeaf,
List<AbstractPageBlock> pageBlocks, List<AbstractPageBlock> pageBlocks,
List<ClassifiedImage> images, List<ClassifiedImage> images,
DocumentGraphFactory.Context context, DocumentGraphFactory.Context context,
@ -50,7 +52,13 @@ public class SectionNodeFactory {
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream() Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
.collect(groupingBy(AbstractPageBlock::getPage)); .collect(groupingBy(AbstractPageBlock::getPage));
Section section = Section.builder().isMainSection(parentNode == null).documentTree(context.getDocumentTree()).build();
Section section;
if (isLeaf) {
section = Section.builder().documentTree(context.getDocumentTree()).build();
} else {
section = SuperSection.builder().documentTree(context.getDocumentTree()).build();
}
context.getSections().add(section); context.getSections().add(section);
blocksPerPage.keySet() blocksPerPage.keySet()
@ -60,12 +68,24 @@ public class SectionNodeFactory {
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document); addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document);
if (containsTablesAndTextBlocks(pageBlocks)) { if (containsTablesAndTextBlocks(pageBlocks)) {
if (pageBlocks.get(0).isHeadline()) {
pageBlocks.remove(0);
}
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType, splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
section, section,
true,
subSectionPageBlocks, subSectionPageBlocks,
emptyList(), emptyList(),
context, context,
document)); document));
} else if (!isLeaf) {
if (pageBlocks.get(0).isHeadline()) {
pageBlocks.remove(0);
}
addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document);
} else { } else {
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document); addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
} }

View File

@ -146,6 +146,7 @@ public class TableNodeFactory {
} else if (firstTextBlockIsHeadline(cell)) { } else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(layoutParsingType, SectionNodeFactory.addSection(layoutParsingType,
tableCell, tableCell,
true,
cell.getTextBlocks() cell.getTextBlocks()
.stream() .stream()
.map(tb -> (AbstractPageBlock) tb) .map(tb -> (AbstractPageBlock) tb)

View File

@ -179,10 +179,11 @@ public class LayoutGridService {
Map<Page, Rectangle2D> bBoxMap = semanticNode.getBBox(); Map<Page, Rectangle2D> bBoxMap = semanticNode.getBBox();
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION).toList(); List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION).toList();
Page firstPage = semanticNode.getFirstPage(); Page firstPage = semanticNode.getFirstPage();
String treeIdString = buildTreeIdString(semanticNode);
if (!subSections.isEmpty()) { if (!subSections.isEmpty()) {
addPlacedText(firstPage, bBoxMap.get(firstPage), buildTreeIdString(semanticNode), layoutGrid); addPlacedText(firstPage, bBoxMap.get(firstPage), treeIdString, layoutGrid);
} else { } else {
bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, buildTreeIdString(semanticNode), layoutGrid))); bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, treeIdString, layoutGrid)));
} }
if (bBoxMap.values().size() == 1) { if (bBoxMap.values().size() == 1) {
Rectangle2D r = RectangleTransformations.pad(bBoxMap.get(firstPage), LINE_WIDTH, LINE_WIDTH); Rectangle2D r = RectangleTransformations.pad(bBoxMap.get(firstPage), LINE_WIDTH, LINE_WIDTH);

View File

@ -83,6 +83,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
public void testViewerDocument() { public void testViewerDocument() {
//String fileName = "files/new/UTT-Books-53.pdf";
String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf"; //String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
//String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf"; //String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf";
//String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf"; //String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf";
@ -95,7 +99,6 @@ public class ViewerDocumentTest extends BuildDocumentTest {
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf"; //String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
//String fileName = "files/new/mistitled_outlines_example.pdf"; //String fileName = "files/new/mistitled_outlines_example.pdf";
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf"; //String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
String fileName = "files/new/UTT-Books-53.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile(); var documentFile = new ClassPathResource(fileName).getFile();