RED-7074: Design Subsection section tree structure algorithm

* first draft: further implementations
This commit is contained in:
maverickstuder 2024-04-17 14:31:48 +02:00
parent 59d9d6c3e6
commit 17756f5977
11 changed files with 463 additions and 54 deletions

View File

@ -31,7 +31,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTreeNode;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -44,7 +44,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.services.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
@ -231,14 +231,15 @@ public class LayoutParsingPipeline {
PDDocument originDocument = openDocument(originFile);
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
OutlineObjectTree outlineObjectTree = outlineExtractorService.getOutlineObjectTree(originDocument);
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
ClassificationDocument classificationDocument = new ClassificationDocument();
List<ClassificationPage> classificationPages = new ArrayList<>();
// parsing the structure elements could be useful as well
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
long pageCount = originDocument.getNumberOfPages();
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
@ -296,10 +297,11 @@ public class LayoutParsingPipeline {
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
};
List<OutlineObject> outlineObjects = outlineObjectTree.getOutlineObjectsPerPage()
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage()
.get(pageNumber - 1);
if(outlineObjects != null) {
blockificationPostprocessingService.sanitizeOutlineBlocksWithKdTree(classificationPage, outlineObjects);
if (outlineObjects != null) {
classificationPage.setOutlineObjects(outlineObjects);
blockificationPostprocessingService.sanitizeOutlineBlocksWithKdTree(classificationPage);
}
classificationPage.setCleanRulings(cleanRulings);
@ -361,7 +363,6 @@ public class LayoutParsingPipeline {
.toList();
// ???
log.info("Building Sections for {}", identifier);
switch (layoutParsingType) {

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
@ -26,4 +27,6 @@ public class ClassificationDocument {
private long rulesVersion;
private OutlineObjectTree outlineObjectTree;
}

View File

@ -8,13 +8,13 @@ import java.util.Map;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
@Data
@RequiredArgsConstructor
@ -23,6 +23,10 @@ public class ClassificationPage {
@NonNull
private List<AbstractPageBlock> textBlocks;
private List<OutlineObject> outlineObjects = new ArrayList<>();
private List<AbstractPageBlock> headlines = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>();
private Rectangle bodyTextFrame;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.Point2D;
import java.io.IOException;
@ -74,6 +74,8 @@ public class OutlineExtractorService {
}
// if the structure elements are processed beforehand, another case can be handled here as well:
// outline objects can reference structure elements (see pdf documentation)
@SneakyThrows
private OutlineObjectTreeNode createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {

View File

@ -39,11 +39,4 @@ public class OutlineObjectTree {
}
}
@Override
public String toString() {
return super.toString();
}
}

View File

@ -0,0 +1,210 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.extern.slf4j.Slf4j;
@Service
@Slf4j
public class OutlineValidationService {
/**
 * Builds a table of contents from the outline headlines and checks every item of the table of
 * contents built from all headlines against it.
 * <p>
 * Items that are not found are handed to {@code addItemAtCorrectPosition} together with the most
 * recently found item; found items become that reference for the items that follow.
 *
 * @param allHeadlines             headlines used to build the table of contents that is walked
 * @param headlinesFromOutlines    headlines used to build the returned table of contents
 * @param newlyClassifiedHeadlines not read by this method
 * @return the table of contents built from {@code headlinesFromOutlines}
 */
public TableOfContents validateWithToC(List<TextPageBlock> allHeadlines, List<TextPageBlock> headlinesFromOutlines, List<TextPageBlock> newlyClassifiedHeadlines) {

    TableOfContents outlineToC = createToC(headlinesFromOutlines);
    List<TableOfContentItem> candidates = createToC(allHeadlines).getAllTableOfContentItems();

    TableOfContentItem lastOutlineAnchor = null;
    for (TableOfContentItem candidate : candidates) {
        if (containsItem(outlineToC, candidate)) {
            // already part of the outline-based ToC: remember it as reference for later insertions
            lastOutlineAnchor = candidate;
            continue;
        }
        addItemAtCorrectPosition(outlineToC, candidate, lastOutlineAnchor);
    }
    return outlineToC;
}
/**
 * Checks whether the given block is the text block of a main section or is reported as contained
 * by one of the main sections.
 *
 * @param toc   table of contents whose main sections are searched
 * @param block text block to look for
 * @return {@code true} as soon as one main section matches, {@code false} otherwise
 */
private boolean containsBlock(TableOfContents toc, TextPageBlock block) {

    return toc.getMainSections()
            .stream()
            .anyMatch(section -> section.getTextPageBlock().equals(block) || section.contains(block));
}
/**
 * Checks whether the given item equals a main section or is reported as contained by one of the
 * main sections.
 *
 * @param toc     table of contents whose main sections are searched
 * @param tocItem item to look for
 * @return {@code true} as soon as one main section matches, {@code false} otherwise
 */
private boolean containsItem(TableOfContents toc, TableOfContentItem tocItem) {

    return toc.getMainSections()
            .stream()
            .anyMatch(section -> section.equals(tocItem) || section.contains(tocItem));
}
/**
 * Placeholder for inserting {@code tocItem} into {@code toc}.
 * <p>
 * Not implemented yet: the only statement is a conditional with an empty body, so neither
 * {@code toc} nor {@code lastHeadlineFromOutlines} is touched and nothing is inserted.
 *
 * @param toc                      table of contents that is meant to receive the item
 * @param tocItem                  item that was not found in {@code toc}
 * @param lastHeadlineFromOutlines last item that was found in {@code toc}, may be {@code null}
 */
private void addItemAtCorrectPosition(TableOfContents toc, TableOfContentItem tocItem, TableOfContentItem lastHeadlineFromOutlines) {
if(!tocItem.getChildren().isEmpty()) {
// TODO: insertion logic is still missing
}
}
/**
 * Builds a table of contents tree from headlines given in reading order.
 * <p>
 * Each headline becomes a child of the closest preceding headline with a strictly smaller depth
 * (see {@code getDepth}). A headline without such a predecessor becomes a main section, so a
 * sequence H1, H2, H3, H2 yields H1[H2[H3], H2].
 *
 * @param headlines headlines in document order; every block needs a non-null classification
 * @return table of contents holding the main sections with their nested children
 */
public TableOfContents createToC(List<TextPageBlock> headlines) {

    List<TableOfContentItem> mainSections = new ArrayList<>();
    TableOfContentItem previous = null;

    for (TextPageBlock current : headlines) {
        int currentDepth = getDepth(current.getClassification());

        // walk up from the previously added item until an ancestor is shallower than the current headline
        TableOfContentItem parent = previous;
        while (parent != null && getDepth(parent.getTextPageBlock().getClassification()) >= currentDepth) {
            parent = parent.getParent();
        }

        TableOfContentItem item = new TableOfContentItem(current);
        if (parent == null) {
            mainSections.add(item);
        } else {
            parent.addChild(item);
        }
        // the new item is the attachment candidate for the next headline, which allows deeper nesting
        previous = item;
    }

    return new TableOfContents(mainSections);
}
/**
 * Converts the newly classified headlines into outline objects.
 * <p>
 * The converted list is currently not stored anywhere; {@code outlineObjectTree} and
 * {@code allHeadlines} are not read.
 *
 * @param outlineObjectTree        not read by this method
 * @param allHeadlines             not read by this method
 * @param newlyClassifiedHeadlines headlines that are converted to outline objects
 */
public void updateOutlineObjectTree(OutlineObjectTree outlineObjectTree, List<TextPageBlock> allHeadlines, List<TextPageBlock> newlyClassifiedHeadlines) {

    List<OutlineObject> newOutlineObjects = newlyClassifiedHeadlines.stream()
            .map(OutlineValidationService::toOutlineObject)
            .toList();
}

/** Creates an outline object from a headline's text, page, minimum x/y corner and depth. */
private static OutlineObject toOutlineObject(TextPageBlock headline) {

    Point2D.Double anchor = new Point2D.Double(headline.getMinX(), headline.getMinY());
    return new OutlineObject(headline.getText(), headline.getPage(), anchor, getDepth(headline.getClassification()));
}
/**
 * Maps a block type to its nesting depth: H1 to H5 map to 1 to 5, every other type maps to 6.
 *
 * @param pageBlockType classification of a block, must not be {@code null}
 * @return depth between 1 and 6
 */
private static int getDepth(PageBlockType pageBlockType) {

    switch (pageBlockType) {
        case H1:
            return 1;
        case H2:
            return 2;
        case H3:
            return 3;
        case H4:
            return 4;
        case H5:
            return 5;
        default:
            return 6;
    }
}
/**
 * Work-in-progress validation of newly classified headlines against the remaining headlines.
 * <p>
 * Only runs when the headlines that are not newly classified outnumber the newly classified ones.
 * For every newly classified headline its position among all headlines and its adjacent newly
 * classified headlines are determined; the results are not used yet and no block is modified.
 *
 * @param allHeadlines             all headlines of the document
 * @param newlyClassifiedHeadlines subset of headlines that were newly classified
 */
public void validate(List<TextPageBlock> allHeadlines, List<TextPageBlock> newlyClassifiedHeadlines) {

    if (allHeadlines.size() - newlyClassifiedHeadlines.size() <= newlyClassifiedHeadlines.size()) {
        return;
    }

    List<Headline> headlines = allHeadlines.stream()
            .map(textPageBlock -> new Headline(textPageBlock, newlyClassifiedHeadlines.contains(textPageBlock)))
            .toList();

    for (TextPageBlock newHeadline : newlyClassifiedHeadlines) {

        // Headline equality compares the wrapped block only but rejects other classes,
        // so the lookup has to be done with a Headline probe instead of the raw block.
        int newHeadlineIndex = headlines.indexOf(new Headline(newHeadline, true));
        List<TextPageBlock> adjacentNewlyClassified = findAdjacentNewlyClassified(newHeadline, newlyClassifiedHeadlines);

        // TODO: find the neighbouring outline headlines before and after newHeadline
        //  (findNeighboringOutline), compare their order (compareHeadlineOrder) and, if the
        //  comparison is non-zero, adjust the classification of newHeadline (setClassification).
    }
}
/**
 * Collects the run of entries around the first occurrence of {@code headline} in
 * {@code newlyClassifiedHeadlines}.
 * <p>
 * The result starts with {@code headline}; it is extended to the left while the preceding entry
 * equals the first collected entry, and to the right while the following entry equals the last
 * collected entry.
 * NOTE(review): neighbours are only added when they are {@code equals} to the collected entry,
 * i.e. duplicates. Confirm whether positional adjacency was intended instead.
 *
 * @param headline                 headline to start from
 * @param newlyClassifiedHeadlines list that is searched
 * @return the collected run, or an empty list if {@code headline} is not in the list
 */
private List<TextPageBlock> findAdjacentNewlyClassified(TextPageBlock headline, List<TextPageBlock> newlyClassifiedHeadlines) {

    List<TextPageBlock> run = new ArrayList<>();
    int anchor = newlyClassifiedHeadlines.indexOf(headline);
    if (anchor == -1) {
        return run;
    }
    run.add(headline);

    // extend towards the start of the list
    int left = anchor - 1;
    while (left >= 0 && newlyClassifiedHeadlines.get(left).equals(run.get(0))) {
        run.add(0, newlyClassifiedHeadlines.get(left));
        left--;
    }

    // extend towards the end of the list
    int right = anchor + 1;
    while (right < newlyClassifiedHeadlines.size() && newlyClassifiedHeadlines.get(right).equals(run.get(run.size() - 1))) {
        run.add(newlyClassifiedHeadlines.get(right));
        right++;
    }

    return run;
}
/**
 * Returns the entry located {@code direction} positions away from {@code headline} in
 * {@code headlinesFromOutlines}.
 *
 * @param headline              headline whose neighbour is requested
 * @param headlinesFromOutlines list that is searched
 * @param direction             offset relative to the headline's index, e.g. -1 or 1
 * @return the neighbouring entry, or {@code null} if the headline is not in the list or the
 *         offset leaves the list bounds
 */
private TextPageBlock findNeighboringOutline(TextPageBlock headline, List<TextPageBlock> headlinesFromOutlines, int direction) {

    int position = headlinesFromOutlines.indexOf(headline);
    if (position == -1) {
        return null;
    }
    int neighbour = position + direction;
    if (neighbour < 0 || neighbour >= headlinesFromOutlines.size()) {
        return null;
    }
    return headlinesFromOutlines.get(neighbour);
}
/**
 * Placeholder for comparing the order of two headlines.
 * <p>
 * Not implemented yet: always returns 0 and reads neither argument.
 *
 * @param headline1 first headline
 * @param headline2 second headline
 * @return currently always 0
 */
private int compareHeadlineOrder(TextPageBlock headline1, TextPageBlock headline2) {
// TODO: implement the actual comparison of the two headlines
return 0; // placeholder result until the comparison logic exists
}
/**
 * Placeholder for adjusting the classification of a headline based on its neighbouring outline
 * headlines.
 * <p>
 * Not implemented yet: the body is empty, no argument is read or modified.
 *
 * @param headline        headline whose classification is meant to be set
 * @param orderComparison result of an order comparison
 * @param previousOutline outline headline before {@code headline}
 * @param nextOutline     outline headline after {@code headline}
 */
private void setClassification(TextPageBlock headline, int orderComparison, TextPageBlock previousOutline, TextPageBlock nextOutline) {
// TODO: implement the classification logic based on the neighbouring outlines
}
/**
 * Pairs a headline block with a flag telling whether it was newly classified.
 * Equality and hash code consider the wrapped block only; the flag is ignored.
 */
record Headline(TextPageBlock textPageBlock, boolean newlyClassified) {

    @Override
    public boolean equals(Object obj) {

        // records are final, so the type test is equivalent to an exact class comparison
        return obj instanceof Headline other && Objects.equals(textPageBlock, other.textPageBlock);
    }


    @Override
    public int hashCode() {

        return Objects.hash(textPageBlock);
    }

}
}

View File

@ -0,0 +1,93 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.Data;
import lombok.EqualsAndHashCode;
/**
 * Node of a table of contents tree: wraps one headline text block and links to its parent and
 * children. Equality and hash code are based on the wrapped text block only.
 */
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class TableOfContentItem {

    @EqualsAndHashCode.Include
    private TextPageBlock textPageBlock;
    private List<TableOfContentItem> children = new ArrayList<>();
    private TableOfContentItem parent;


    public TableOfContentItem(TextPageBlock textPageBlock) {

        this.textPageBlock = textPageBlock;
    }


    /**
     * Appends the given item to the children and sets this item as its parent.
     *
     * @param tableOfContentItem item to attach
     */
    public void addChild(TableOfContentItem tableOfContentItem) {

        children.add(tableOfContentItem);
        tableOfContentItem.setParent(this);
    }


    /**
     * @return the sibling directly before this item in the parent's children, or {@code null} if
     *         this item has no parent or is the first child
     */
    public TableOfContentItem getSiblingBefore() {

        return siblingAtOffset(-1);
    }


    /**
     * @return the sibling directly after this item in the parent's children, or {@code null} if
     *         this item has no parent or is the last child
     */
    public TableOfContentItem getSiblingAfter() {

        return siblingAtOffset(1);
    }


    /**
     * Resolves a sibling relative to this item's position below its parent.
     * Returns {@code null} instead of throwing when there is no parent, this item is not found
     * among the parent's children, or the target position is out of range.
     */
    private TableOfContentItem siblingAtOffset(int offset) {

        if (parent == null) {
            return null;
        }
        List<TableOfContentItem> siblings = parent.getChildren();
        int ownIndex = siblings.indexOf(this);
        if (ownIndex < 0) {
            return null;
        }
        int targetIndex = ownIndex + offset;
        if (targetIndex < 0 || targetIndex >= siblings.size()) {
            return null;
        }
        return siblings.get(targetIndex);
    }


    /**
     * Checks whether any descendant wraps the given block; this item's own block is not checked.
     *
     * @param block text block to look for
     * @return {@code true} if a child or deeper descendant wraps an equal block
     */
    public boolean contains(TextPageBlock block) {

        for (TableOfContentItem child : children) {
            if (child.getTextPageBlock().equals(block) || child.contains(block)) {
                return true;
            }
        }
        return false;
    }


    /**
     * Checks whether any descendant equals the given item; this item itself is not checked.
     *
     * @param tocItem item to look for
     * @return {@code true} if a child or deeper descendant equals the item
     */
    public boolean contains(TableOfContentItem tocItem) {

        for (TableOfContentItem child : children) {
            if (child.equals(tocItem) || child.contains(tocItem)) {
                return true;
            }
        }
        return false;
    }


    @Override
    public String toString() {

        // restricted to the text block: printing parent and children would recurse through the tree
        return "TableOfContentItem{" + "textPageBlock=" + textPageBlock + '}';
    }

}

View File

@ -0,0 +1,59 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.Data;
import lombok.RequiredArgsConstructor;
/**
 * Table of contents made of main sections, each of which may carry nested children.
 * Offers flattened views of the tree in pre-order (a section first, then its children).
 */
@Data
@RequiredArgsConstructor
public class TableOfContents {

    private List<TableOfContentItem> mainSections = new ArrayList<>();


    public TableOfContents(List<TableOfContentItem> mainSections) {

        this.mainSections = mainSections;
    }


    /**
     * @return new list with the text blocks of all items, in the same order as
     *         {@link #getAllTableOfContentItems()}
     */
    public List<TextPageBlock> getAllTextPageBlocks() {

        List<TextPageBlock> blocks = new ArrayList<>();
        for (TableOfContentItem item : getAllTableOfContentItems()) {
            blocks.add(item.getTextPageBlock());
        }
        return blocks;
    }


    /**
     * @return new list with every item of the tree: each main section followed by its descendants
     */
    public List<TableOfContentItem> getAllTableOfContentItems() {

        List<TableOfContentItem> flattened = new ArrayList<>();
        mainSections.forEach(section -> appendSubtree(section, flattened));
        return flattened;
    }


    /** Adds the given item and then, recursively, each of its children to the target list. */
    private static void appendSubtree(TableOfContentItem root, List<TableOfContentItem> target) {

        target.add(root);
        root.getChildren().forEach(child -> appendSubtree(child, target));
    }

}

View File

@ -36,8 +36,9 @@ public class BlockificationPostprocessingService {
.collect(RectangleTransformations.collectBBox());
public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage) {
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) {
return;
}
@ -244,6 +245,13 @@ public class BlockificationPostprocessingService {
}
// currently only three cases are handled here:
// 1. equality
// 2. outline title contains block text
// 3. block text contains outline title
// another possible case is an intersection, meaning a title is split up between two different blocks
// this should not happen with how docstrum creates the blocks
// if it is indeed necessary, a splitting has to be done with a follow-up merge
private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) {
OutlineObject outlineObject = context.getOutlineObject();

View File

@ -1,8 +1,13 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@ -22,15 +27,43 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class RedactManagerClassificationService {
private final OutlineValidationService outlineValidationService;
/**
 * Classifies the text blocks of every page and afterwards hands the headlines to the outline
 * validation.
 * <p>
 * Headline blocks are collected once before and once after the per-page classification; the
 * difference between both lists is treated as the newly classified headlines.
 *
 * @param document document whose pages are classified
 */
public void classifyDocument(ClassificationDocument document) {

    List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();

    log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());

    // headlines that exist before classifyPage runs (presumably set from the PDF outline, verify against caller)
    List<TextPageBlock> headlinesFromOutlines = document.getPages()
            .stream()
            .flatMap(classificationPage -> classificationPage.getTextBlocks()
                    .stream()
                    .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.isHeadline())
                    .map(tb -> (TextPageBlock) tb))
            .toList();

    for (ClassificationPage page : document.getPages()) {
        classifyPage(page, document, headlineFontSizes);
    }

    List<TextPageBlock> allHeadlines = document.getPages()
            .stream()
            .flatMap(classificationPage -> classificationPage.getTextBlocks()
                    .stream()
                    .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
                    .map(tb -> (TextPageBlock) tb))
            .toList();

    List<TextPageBlock> newlyClassifiedHeadlines = new ArrayList<>(allHeadlines);
    newlyClassifiedHeadlines.removeAll(headlinesFromOutlines);

    TableOfContents toC = outlineValidationService.createToC(allHeadlines);
    // use the class logger instead of stdout so the output follows the configured log level
    log.debug("Table of contents built from all headlines: {}", toC);

    outlineValidationService.validateWithToC(allHeadlines, headlinesFromOutlines, newlyClassifiedHeadlines);
}
@ -48,7 +81,7 @@ public class RedactManagerClassificationService {
var bodyTextFrame = page.getBodyTextFrame();
if(textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
@ -62,33 +95,30 @@ public class RedactManagerClassificationService {
.anyMatch(graphic -> graphic.getPosition().intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()))) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
} if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
}
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
.getCountPerValue()
.containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
&& PositionUtils.getApproxLineCount(textBlock) < 4.9
&& (textBlock.getMostPopularWordStyle().equals("bold")
|| !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
&& textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
&& textBlock.getSequences()
.get(0).getTextPositions()
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
@ -96,25 +126,31 @@ public class RedactManagerClassificationService {
document.setHeadlines(true);
}
}
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
} else if (!textBlock.getText().startsWith("Figure ")
&& PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordStyle().equals("bold")
&& !document.getFontStyleCounter().getMostPopular().equals("bold")
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
&& textBlock.getSequences()
.get(0).getTextPositions()
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
&& textBlock.getMostPopularWordStyle().equals("bold")
&& !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
&& textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
.getMostPopular()
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
&& textBlock.getMostPopularWordStyle().equals("italic")
&& !document.getFontStyleCounter().getMostPopular().equals("italic")
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);

View File

@ -3,7 +3,7 @@ package com.knecon.fforesight.service.layoutparser.server.graph;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import com.knecon.fforesight.service.layoutparser.processor.services.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;