RED-7074: Design Subsection section tree structure algorithm
* Added SuperSection and changed the logic so that each normal section contains only leaf nodes
* Added SectionIdentifier logic for headline splitting and merging
* Fixed many edge cases that resulted in error-state files
This commit is contained in:
parent
4e07ba4ff1
commit
2fcaeb3d8c
@ -8,6 +8,7 @@ import java.util.regex.Pattern;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@AllArgsConstructor
|
||||
@ -16,13 +17,15 @@ public class SectionIdentifier {
|
||||
|
||||
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
|
||||
|
||||
/**
 * Kind of identifier a {@code SectionIdentifier} carries.
 *
 * <p>Declared {@code public} because code outside {@code SectionIdentifier}
 * compares against {@code SectionIdentifier.Format.EMPTY} to decide whether an
 * identifier was found at all.
 */
public enum Format {

    /** No identifier; callers check for this value before using the identifier string. */
    EMPTY,

    /** Presumably an identifier matched by {@code numericalIdentifierPattern} - TODO confirm. */
    NUMERICAL,

    /** NOTE(review): meaning is not visible from here; presumably the document root - confirm. */
    DOCUMENT
}
|
||||
|
||||
@Getter
|
||||
Format format;
|
||||
@Getter
|
||||
String identifierString;
|
||||
List<Integer> identifiers;
|
||||
boolean asChild;
|
||||
|
||||
@ -20,11 +20,12 @@ import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Data
|
||||
@Builder
|
||||
@SuperBuilder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Section implements GenericSemanticNode {
|
||||
@ -44,10 +45,6 @@ public class Section implements GenericSemanticNode {
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
boolean isMainSection;
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
@ -0,0 +1,33 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
/**
 * Section subtype that adds no fields or behavior of its own.
 * SectionNodeFactory builds a SuperSection instead of a plain Section when it
 * is asked to create a non-leaf section, i.e. for a table-of-contents item
 * that has children.
 */
@Data
|
||||
@SuperBuilder
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public class SuperSection extends Section {
|
||||
|
||||
/**
 * Creates a SuperSection by forwarding every argument, unchanged and in the
 * same order, to the Section constructor.
 */
public SuperSection(Set<LayoutEngine> engines,
|
||||
List<Integer> treeId,
|
||||
TextBlock textBlock,
|
||||
DocumentTree documentTree,
|
||||
Set<RedactionEntity> entities,
|
||||
Map<Page, Rectangle2D> bBoxCache) {
|
||||
|
||||
super(engines, treeId, textBlock, documentTree, entities, bBoxCache);
|
||||
}
|
||||
|
||||
}
|
||||
@ -9,10 +9,12 @@ import java.util.ListIterator;
|
||||
import java.util.Locale;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
@ -125,7 +127,7 @@ public class BlockificationPostprocessingService {
|
||||
if (minDistance == distanceToDirectMatch) {
|
||||
directMatch.setClassification(headlineType);
|
||||
} else if (minDistance == distanceToSplitCandidate) {
|
||||
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle());
|
||||
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier + outlineObject.getTitle());
|
||||
splitCandidate.setClassification(headlineType);
|
||||
others.forEach(other -> other.setClassification(null));
|
||||
} else {
|
||||
@ -279,38 +281,6 @@ public class BlockificationPostprocessingService {
|
||||
}
|
||||
|
||||
|
||||
private static WordSequenceResultOld findWordSequenceOld(List<TextPositionSequence> textPositionSequences, String text) {
|
||||
|
||||
String target = sanitizeString(text);
|
||||
List<TextPositionSequence> inSequence = new ArrayList<>();
|
||||
List<TextPositionSequence> preSequence = new ArrayList<>();
|
||||
StringBuilder currentSequence = new StringBuilder();
|
||||
|
||||
for (TextPositionSequence sequence : textPositionSequences) {
|
||||
|
||||
currentSequence.append(sanitizeString(sequence.toString()));
|
||||
inSequence.add(sequence);
|
||||
|
||||
if (currentSequence.length() > target.length()) {
|
||||
TextPositionSequence removed = inSequence.remove(0);
|
||||
currentSequence.delete(0, removed.toString().length());
|
||||
preSequence.add(removed);
|
||||
|
||||
while (currentSequence.length() > target.length()) {
|
||||
removed = inSequence.remove(0);
|
||||
currentSequence.delete(0, removed.toString().length());
|
||||
preSequence.add(removed);
|
||||
}
|
||||
}
|
||||
|
||||
if (currentSequence.toString().equals(target)) {
|
||||
return new WordSequenceResultOld(inSequence, preSequence);
|
||||
}
|
||||
}
|
||||
return new WordSequenceResultOld(new ArrayList<>(), new ArrayList<>());
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock mergeBlocks(ClassificationPage classificationPage, List<TextPageBlock> blocksToMerge) {
|
||||
|
||||
TextPageBlock firstBlock = blocksToMerge.get(0);
|
||||
@ -405,8 +375,21 @@ public class BlockificationPostprocessingService {
|
||||
context.mergeCandidates.add(pageBlock);
|
||||
}
|
||||
|
||||
if (blockTextContainsOutlineTitle && context.splitCandidate == null) {
|
||||
context.splitCandidate = pageBlock;
|
||||
if (blockTextContainsOutlineTitle) {
|
||||
SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText);
|
||||
|
||||
if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY) {
|
||||
|
||||
if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) {
|
||||
context.directMatch = pageBlock;
|
||||
return true;
|
||||
} else if (context.splitCandidate == null) {
|
||||
context.sectionIdentifier = sectionIdentifier.getIdentifierString();
|
||||
}
|
||||
}
|
||||
if (context.splitCandidate == null) {
|
||||
context.splitCandidate = pageBlock;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -414,14 +397,10 @@ public class BlockificationPostprocessingService {
|
||||
|
||||
private static String sanitizeString(String text) {
|
||||
|
||||
return text.replaceAll("\\s", "").toLowerCase(Locale.ROOT);
|
||||
return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Result of findWordSequenceOld.
 * inSequence holds the sequences whose sanitized text matched the target;
 * preSequence holds the sequences dropped from the front of the search window
 * before the match. Both lists are empty when nothing matched.
 */
private record WordSequenceResultOld(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence) {
|
||||
|
||||
}
|
||||
|
||||
@Data
|
||||
private static class OutlineProcessionContext {
|
||||
|
||||
@ -429,6 +408,7 @@ public class BlockificationPostprocessingService {
|
||||
private OutlineObject outlineObject;
|
||||
private List<TextPageBlock> mergeCandidates;
|
||||
private TextPageBlock splitCandidate;
|
||||
private String sectionIdentifier;
|
||||
|
||||
|
||||
public OutlineProcessionContext(OutlineObject outlineObject) {
|
||||
@ -437,6 +417,7 @@ public class BlockificationPostprocessingService {
|
||||
this.directMatch = null;
|
||||
this.mergeCandidates = new ArrayList<>();
|
||||
this.splitCandidate = null;
|
||||
this.sectionIdentifier = "";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -90,7 +90,7 @@ public class DocstrumBlockificationService {
|
||||
while (itty.hasNext()) {
|
||||
|
||||
AbstractPageBlock block = itty.next();
|
||||
if (block instanceof TablePageBlock || (block.getClassification() != null && block.getClassification().isHeadline())) {
|
||||
if (block instanceof TablePageBlock || previous.isHeadline()) {
|
||||
previous = new TextPageBlock();
|
||||
continue;
|
||||
}
|
||||
@ -98,7 +98,7 @@ public class DocstrumBlockificationService {
|
||||
|
||||
if (previous != null && !previous.getSequences().isEmpty()) {
|
||||
|
||||
if (current.getDir() != previous.getDir()) {
|
||||
if (current.getDir() != previous.getDir() || current.isHeadline()) {
|
||||
previous = current;
|
||||
continue;
|
||||
}
|
||||
@ -162,6 +162,9 @@ public class DocstrumBlockificationService {
|
||||
previous.getSequences().addAll(current.getSequences());
|
||||
previous = buildTextBlock(previous.getSequences(), 0);
|
||||
previous.setToDuplicate(toDuplicate);
|
||||
if(current.getClassification() != null && previous.getClassification() == null) {
|
||||
previous.setClassification(current.getClassification());
|
||||
}
|
||||
itty.remove();
|
||||
itty.previous();
|
||||
itty.set(previous);
|
||||
|
||||
@ -78,7 +78,13 @@ public class DocumentGraphFactory {
|
||||
|
||||
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
|
||||
var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
|
||||
Optional<Section> section = SectionNodeFactory.addSection(layoutParsingType, parent, tocItem.getNonEmptySectionBlocks(), tocItem.getImages(), context, document);
|
||||
Optional<Section> section = SectionNodeFactory.addSection(layoutParsingType,
|
||||
parent,
|
||||
tocItem.getChildren().isEmpty(),
|
||||
tocItem.getNonEmptySectionBlocks(),
|
||||
tocItem.getImages(),
|
||||
context,
|
||||
document);
|
||||
tocItem.setSection(section.orElse(null));
|
||||
}
|
||||
}
|
||||
|
||||
@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
@ -30,6 +31,7 @@ public class SectionNodeFactory {
|
||||
|
||||
public Optional<Section> addSection(LayoutParsingType layoutParsingType,
|
||||
GenericSemanticNode parentNode,
|
||||
boolean isLeaf,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
List<ClassifiedImage> images,
|
||||
DocumentGraphFactory.Context context,
|
||||
@ -50,7 +52,13 @@ public class SectionNodeFactory {
|
||||
|
||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
|
||||
.collect(groupingBy(AbstractPageBlock::getPage));
|
||||
Section section = Section.builder().isMainSection(parentNode == null).documentTree(context.getDocumentTree()).build();
|
||||
|
||||
Section section;
|
||||
if (isLeaf) {
|
||||
section = Section.builder().documentTree(context.getDocumentTree()).build();
|
||||
} else {
|
||||
section = SuperSection.builder().documentTree(context.getDocumentTree()).build();
|
||||
}
|
||||
|
||||
context.getSections().add(section);
|
||||
blocksPerPage.keySet()
|
||||
@ -60,12 +68,24 @@ public class SectionNodeFactory {
|
||||
|
||||
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document);
|
||||
if (containsTablesAndTextBlocks(pageBlocks)) {
|
||||
|
||||
if (pageBlocks.get(0).isHeadline()) {
|
||||
pageBlocks.remove(0);
|
||||
}
|
||||
|
||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
|
||||
section,
|
||||
true,
|
||||
subSectionPageBlocks,
|
||||
emptyList(),
|
||||
context,
|
||||
document));
|
||||
} else if (!isLeaf) {
|
||||
|
||||
if (pageBlocks.get(0).isHeadline()) {
|
||||
pageBlocks.remove(0);
|
||||
}
|
||||
addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document);
|
||||
} else {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
|
||||
}
|
||||
|
||||
@ -146,6 +146,7 @@ public class TableNodeFactory {
|
||||
} else if (firstTextBlockIsHeadline(cell)) {
|
||||
SectionNodeFactory.addSection(layoutParsingType,
|
||||
tableCell,
|
||||
true,
|
||||
cell.getTextBlocks()
|
||||
.stream()
|
||||
.map(tb -> (AbstractPageBlock) tb)
|
||||
|
||||
@ -179,10 +179,11 @@ public class LayoutGridService {
|
||||
Map<Page, Rectangle2D> bBoxMap = semanticNode.getBBox();
|
||||
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION).toList();
|
||||
Page firstPage = semanticNode.getFirstPage();
|
||||
String treeIdString = buildTreeIdString(semanticNode);
|
||||
if (!subSections.isEmpty()) {
|
||||
addPlacedText(firstPage, bBoxMap.get(firstPage), buildTreeIdString(semanticNode), layoutGrid);
|
||||
addPlacedText(firstPage, bBoxMap.get(firstPage), treeIdString, layoutGrid);
|
||||
} else {
|
||||
bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, buildTreeIdString(semanticNode), layoutGrid)));
|
||||
bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, treeIdString, layoutGrid)));
|
||||
}
|
||||
if (bBoxMap.values().size() == 1) {
|
||||
Rectangle2D r = RectangleTransformations.pad(bBoxMap.get(firstPage), LINE_WIDTH, LINE_WIDTH);
|
||||
|
||||
@ -83,6 +83,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
public void testViewerDocument() {
|
||||
|
||||
|
||||
//String fileName = "files/new/UTT-Books-53.pdf";
|
||||
String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
|
||||
|
||||
|
||||
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
|
||||
//String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf";
|
||||
//String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf";
|
||||
@ -95,7 +99,6 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||
//String fileName = "files/new/mistitled_outlines_example.pdf";
|
||||
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
|
||||
String fileName = "files/new/UTT-Books-53.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user