RED-7074: Design Subsection section tree structure algorithm
* added redactmanager logic for headline classification to documine and clarifynd * refactored headline classification * added supersection for non-leaf sections (containing other sections instead of only paragraphs, images, ...) * bugfix for certain edge cases in some files running into error state
This commit is contained in:
parent
1856fed640
commit
2d33615b94
@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
@ -18,7 +17,6 @@ public class ClassificationDocument {
|
||||
|
||||
private List<ClassificationPage> pages = new ArrayList<>();
|
||||
private List<ClassificationSection> sections = new ArrayList<>();
|
||||
//private Map<TextPageBlock, List<AbstractPageBlock>> sectionsMap = new HashMap<>();
|
||||
private List<ClassificationHeader> headers = new ArrayList<>();
|
||||
private List<ClassificationFooter> footers = new ArrayList<>();
|
||||
private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
|
||||
|
||||
@ -1,10 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
@ -1,16 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
@ -20,14 +9,4 @@ import lombok.experimental.SuperBuilder;
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public class SuperSection extends Section {
|
||||
|
||||
public SuperSection(Set<LayoutEngine> engines,
|
||||
List<Integer> treeId,
|
||||
TextBlock textBlock,
|
||||
DocumentTree documentTree,
|
||||
Set<RedactionEntity> entities,
|
||||
Map<Page, Rectangle2D> bBoxCache) {
|
||||
|
||||
super(engines, treeId, textBlock, documentTree, entities, bBoxCache);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -4,7 +4,6 @@ import java.awt.geom.Point2D;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
|
||||
@ -1,12 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
|
||||
@Data
|
||||
public class OutlineObjectTreeNode {
|
||||
|
||||
@ -21,12 +21,16 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class ClarifyndClassificationService {
|
||||
|
||||
private final HeadlineClassificationService headlineClassificationService;
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
headlineClassificationService.resetContext();
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
classifyPage(page, document, headlineFontSizes);
|
||||
}
|
||||
@ -47,6 +51,10 @@ public class ClarifyndClassificationService {
|
||||
|
||||
var bodyTextFrame = page.getBodyTextFrame();
|
||||
|
||||
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||
return;
|
||||
}
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
return;
|
||||
@ -79,7 +87,8 @@ public class ClarifyndClassificationService {
|
||||
|
||||
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(i));
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
}
|
||||
@ -89,7 +98,8 @@ public class ClarifyndClassificationService {
|
||||
.getTextPositions()
|
||||
.get(0)
|
||||
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
|
||||
@ -6,6 +6,7 @@ import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
@ -23,6 +24,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class DocuMineClassificationService {
|
||||
|
||||
private final HeadlineClassificationService headlineClassificationService;
|
||||
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||
@ -34,6 +36,8 @@ public class DocuMineClassificationService {
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
headlineClassificationService.resetContext();
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
classifyPage(page, document, headlineFontSizes);
|
||||
}
|
||||
@ -59,7 +63,8 @@ public class DocuMineClassificationService {
|
||||
Matcher matcher2 = pattern2.matcher(textBlock.toString());
|
||||
Matcher matcher3 = pattern3.matcher(textBlock.toString());
|
||||
|
||||
if(textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||
return;
|
||||
}
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
@ -67,46 +72,57 @@ public class DocuMineClassificationService {
|
||||
return;
|
||||
}
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
||||
) {
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
||||
) {
|
||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||
.size() == 1)) {
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
}
|
||||
} else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
|
||||
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||
} else if (textBlock.getText().length() > 5
|
||||
&& (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
|
||||
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||
|
||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
||||
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
|
||||
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
|
||||
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
||||
.contains(":")
|
||||
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString().contains(":")
|
||||
|| textBlock.toString().startsWith("APPENDIX")
|
||||
|| textBlock.toString().startsWith("FIGURE")
|
||||
|| textBlock.toString().startsWith("TABLE"))
|
||||
&& !textBlock.toString().endsWith(":")
|
||||
&& matcher2.find()) {
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(1);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
|
||||
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(2);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||
&& textBlock.getMostPopularWordStyle().equals("bold")
|
||||
&& !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
||||
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
||||
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
|
||||
&& textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
|
||||
.getMostPopular()
|
||||
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||
&& textBlock.getMostPopularWordStyle().equals("italic")
|
||||
&& !document.getFontStyleCounter().getMostPopular().equals("italic")
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||
|
||||
@ -0,0 +1,61 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
@Service
|
||||
@Getter
|
||||
@Setter
|
||||
public class HeadlineClassificationService {
|
||||
|
||||
TextPageBlock lastHeadline;
|
||||
PageBlockType originalClassifiedBlockType;
|
||||
TextPageBlock lastHeadlineFromOutline;
|
||||
|
||||
public void resetContext() {
|
||||
setLastHeadline(null);
|
||||
setOriginalClassifiedBlockType(null);
|
||||
setLastHeadlineFromOutline(null);
|
||||
}
|
||||
|
||||
|
||||
public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) {
|
||||
|
||||
this.lastHeadlineFromOutline = lastHeadlineFromOutline;
|
||||
this.setLastHeadline(lastHeadlineFromOutline);
|
||||
}
|
||||
|
||||
|
||||
public void classifyHeadline(TextPageBlock textBlock, PageBlockType headlineType) {
|
||||
|
||||
TextPageBlock lastHeadline = getLastHeadline();
|
||||
TextPageBlock lastHeadlineFromOutline = getLastHeadlineFromOutline();
|
||||
PageBlockType originalClassifiedBlockType = getOriginalClassifiedBlockType();
|
||||
|
||||
if (lastHeadline != null) {
|
||||
|
||||
if (lastHeadline.equals(lastHeadlineFromOutline)) {
|
||||
|
||||
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1);
|
||||
|
||||
} else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) {
|
||||
|
||||
PageBlockType lastHeadlineType = lastHeadline.getClassification();
|
||||
int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType);
|
||||
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + difference);
|
||||
}
|
||||
}
|
||||
|
||||
setOriginalClassifiedBlockType(headlineType);
|
||||
textBlock.setClassification(headlineType);
|
||||
setLastHeadline(textBlock);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,7 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@ -16,7 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@ -25,6 +22,8 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class RedactManagerClassificationService {
|
||||
|
||||
private final HeadlineClassificationService headlineClassificationService;
|
||||
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
@ -32,33 +31,30 @@ public class RedactManagerClassificationService {
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
HeadLineClassificationContext headLineClassificationContext = new HeadLineClassificationContext();
|
||||
headlineClassificationService.resetContext();
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
classifyPage(page, document, headlineFontSizes, headLineClassificationContext);
|
||||
classifyPage(page, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes, HeadLineClassificationContext headLineClassificationContext) {
|
||||
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
|
||||
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes, headLineClassificationContext);
|
||||
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void classifyBlock(TextPageBlock textBlock,
|
||||
ClassificationPage page,
|
||||
ClassificationDocument document,
|
||||
List<Float> headlineFontSizes,
|
||||
HeadLineClassificationContext headLineClassificationContext) {
|
||||
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
|
||||
var bodyTextFrame = page.getBodyTextFrame();
|
||||
|
||||
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||
headLineClassificationContext.setLastHeadlineFromOutline(textBlock);
|
||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||
return;
|
||||
}
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
@ -72,7 +68,8 @@ public class RedactManagerClassificationService {
|
||||
.anyMatch(graphic -> graphic.getPosition().intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()))) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
return;
|
||||
} if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
}
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
@ -100,7 +97,7 @@ public class RedactManagerClassificationService {
|
||||
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
|
||||
classifyHeadline(textBlock, headLineClassificationContext, headlineType);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
}
|
||||
@ -113,7 +110,7 @@ public class RedactManagerClassificationService {
|
||||
.get(0).getTextPositions()
|
||||
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
|
||||
classifyHeadline(textBlock, headLineClassificationContext, headlineType);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||
@ -138,55 +135,4 @@ public class RedactManagerClassificationService {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static void classifyHeadline(TextPageBlock textBlock, HeadLineClassificationContext headLineClassificationContext, PageBlockType headlineType) {
|
||||
|
||||
TextPageBlock lastHeadline = headLineClassificationContext.getLastHeadline();
|
||||
TextPageBlock lastHeadlineFromOutline = headLineClassificationContext.getLastHeadlineFromOutline();
|
||||
PageBlockType originalClassifiedBlockType = headLineClassificationContext.getOriginalClassifiedBlockType();
|
||||
|
||||
if (lastHeadline != null) {
|
||||
|
||||
if (lastHeadline.equals(lastHeadlineFromOutline)) {
|
||||
|
||||
headlineType = getNextType(lastHeadline.getClassification());
|
||||
|
||||
} else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) {
|
||||
|
||||
PageBlockType lastHeadlineType = lastHeadline.getClassification();
|
||||
int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType);
|
||||
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + difference);
|
||||
}
|
||||
}
|
||||
|
||||
headLineClassificationContext.setOriginalClassifiedBlockType(headlineType);
|
||||
textBlock.setClassification(headlineType);
|
||||
headLineClassificationContext.setLastHeadline(textBlock);
|
||||
}
|
||||
|
||||
|
||||
private static PageBlockType getNextType(PageBlockType pageBlockType) {
|
||||
|
||||
return PageBlockType.getHeadlineType(getHeadlineNumber(pageBlockType) + 1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@Data
|
||||
static class HeadLineClassificationContext {
|
||||
|
||||
TextPageBlock lastHeadline;
|
||||
PageBlockType originalClassifiedBlockType;
|
||||
TextPageBlock lastHeadlineFromOutline;
|
||||
|
||||
|
||||
public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) {
|
||||
|
||||
this.lastHeadlineFromOutline = lastHeadlineFromOutline;
|
||||
this.setLastHeadline(lastHeadlineFromOutline);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -7,7 +7,6 @@ import static java.util.stream.Collectors.toList;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -16,7 +15,6 @@ import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
|
||||
@ -1,25 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class OutlineProcessingTest extends BuildDocumentTest {
|
||||
|
||||
@Autowired
|
||||
OutlineExtractorService outlineExtractorService;
|
||||
@Autowired
|
||||
BlockificationPostprocessingService blockificationPostprocessingService;
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void test() {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,12 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@ -27,80 +23,11 @@ import lombok.SneakyThrows;
|
||||
|
||||
public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
@Disabled
|
||||
public void testViewerDocuments() {
|
||||
|
||||
String directory = "files/syngenta_190_deduplicated/";
|
||||
Path dirPath = new ClassPathResource(directory).getFile().toPath();
|
||||
|
||||
// Ensure the directory exists and is accessible
|
||||
if (!Files.exists(dirPath) || !Files.isDirectory(dirPath)) {
|
||||
throw new IllegalArgumentException("The specified path must be a directory and it must exist.");
|
||||
}
|
||||
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
|
||||
// Use try-with-resources to ensure the stream is closed after use
|
||||
try (Stream<Path> paths = Files.walk(dirPath)) {
|
||||
paths.filter(Files::isRegularFile)
|
||||
.filter(path -> path.toString().endsWith(".pdf")) // Filter to process only PDF files
|
||||
.forEach(path -> processFile(path, layoutGridService));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processFile(Path filePath, LayoutGridService layoutGridService) {
|
||||
|
||||
try {
|
||||
File documentFile = filePath.toFile();
|
||||
String tmpFileName = "/tmp/" + filePath.getFileName().toString() + "_VIEWER.pdf";
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", filePath.getFileName().toFile().toString()));
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
|
||||
|
||||
if (classificationDocument.getOutlineObjectTree().getRootNodes().size() > 1) {
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
System.out.printf("Processed %s in %.2fs%n", filePath, ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
} catch (Exception exception)
|
||||
{
|
||||
System.out.println(exception);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
//String fileName = "files/documine/21_TiltPlus_MutacaoGenicaEmCelulasBacterianas.pdf";//fail here
|
||||
|
||||
//String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf";
|
||||
//String fileName = "files/documine/Study Document 3 - Acute Eye IrritationCorrosion - Rabbits.pdf";
|
||||
//String fileName = "files/documine/VV-547521_Irritação_Ocular_in_Vivo.pdf";
|
||||
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
|
||||
//String fileName = "files/new/UTT-Books-53.pdf";
|
||||
//String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
|
||||
//String fileName = "files/documine/A16361B - Acute Dermal Irritation Toxicity Study in Rabbits.pdf";
|
||||
//String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf";
|
||||
//String fileName = "files/documine/VV-547523_LLNA.pdf";
|
||||
//String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
//String fileName = "files/syngenta_190_deduplicated/1 Abamectin_prr.pdf";
|
||||
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
|
||||
//String fileName = "files/new/kaust-official-thesis-template.pdf";
|
||||
//String fileName = "files/new/$100m Offers.pdf";
|
||||
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||
//String fileName = "files/new/mistitled_outlines_example.pdf";
|
||||
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
|
||||
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
@ -108,39 +35,12 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testViewerDocumentWithImages() {
|
||||
|
||||
String fileName = "files/new/UTT-Books-53.pdf";
|
||||
Path path = Path.of(fileName);
|
||||
String tmpFileName = "/tmp/" + path.getFileName() + "_VIEWER.pdf";
|
||||
String imageFileName = "files/images/test_outlines.IMAGE_INFO.json";
|
||||
|
||||
var mapper = ObjectMapperFactory.create();
|
||||
var imageServiceResponse = mapper.readValue(new ClassPathResource(imageFileName).getInputStream(), ImageServiceResponse.class);
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
documentFile,
|
||||
imageServiceResponse,
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", path.getFileName().toFile().toString()));
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
|
||||
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
@ -148,19 +48,18 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
String fileName = "files/cv_tables/brokenTablesOnOcr_ocred.pdf";
|
||||
String tableFileName = "files/cv_tables/brokenTablesOnOcr_ocred.TABLES.json";
|
||||
Path path = Path.of(fileName);
|
||||
String tmpFileName = "/tmp/" + path.getFileName() + "_VIEWER.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var mapper = ObjectMapperFactory.create();
|
||||
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
tableResponse,
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", path.getFileName().toFile().toString()));
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
tableResponse,
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user