RED-7074: Design Subsection section tree structure algorithm
* added redactmanager logic for headline classification to documine and clarifynd
* refactored headline classification
* added supersection for non-leaf sections (containing other sections instead of only paragraphs, images, ...)
* bugfix for certain edge cases in some files running into error state
This commit is contained in:
parent
1856fed640
commit
2d33615b94
@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
@ -18,7 +17,6 @@ public class ClassificationDocument {
|
|||||||
|
|
||||||
private List<ClassificationPage> pages = new ArrayList<>();
|
private List<ClassificationPage> pages = new ArrayList<>();
|
||||||
private List<ClassificationSection> sections = new ArrayList<>();
|
private List<ClassificationSection> sections = new ArrayList<>();
|
||||||
//private Map<TextPageBlock, List<AbstractPageBlock>> sectionsMap = new HashMap<>();
|
|
||||||
private List<ClassificationHeader> headers = new ArrayList<>();
|
private List<ClassificationHeader> headers = new ArrayList<>();
|
||||||
private List<ClassificationFooter> footers = new ArrayList<>();
|
private List<ClassificationFooter> footers = new ArrayList<>();
|
||||||
private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
|
private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();
|
||||||
|
|||||||
@ -1,10 +1,8 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
|||||||
@ -1,16 +1,5 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
|
||||||
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.EqualsAndHashCode;
|
import lombok.EqualsAndHashCode;
|
||||||
import lombok.experimental.SuperBuilder;
|
import lombok.experimental.SuperBuilder;
|
||||||
@ -20,14 +9,4 @@ import lombok.experimental.SuperBuilder;
|
|||||||
@EqualsAndHashCode(callSuper = true)
|
@EqualsAndHashCode(callSuper = true)
|
||||||
public class SuperSection extends Section {
|
public class SuperSection extends Section {
|
||||||
|
|
||||||
public SuperSection(Set<LayoutEngine> engines,
|
|
||||||
List<Integer> treeId,
|
|
||||||
TextBlock textBlock,
|
|
||||||
DocumentTree documentTree,
|
|
||||||
Set<RedactionEntity> entities,
|
|
||||||
Map<Page, Rectangle2D> bBoxCache) {
|
|
||||||
|
|
||||||
super(engines, treeId, textBlock, documentTree, entities, bBoxCache);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -4,7 +4,6 @@ import java.awt.geom.Point2D;
|
|||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.NoArgsConstructor;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
|
|||||||
@ -1,12 +1,9 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.EqualsAndHashCode;
|
|
||||||
import lombok.Getter;
|
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
public class OutlineObjectTreeNode {
|
public class OutlineObjectTreeNode {
|
||||||
|
|||||||
@ -21,12 +21,16 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class ClarifyndClassificationService {
|
public class ClarifyndClassificationService {
|
||||||
|
|
||||||
|
private final HeadlineClassificationService headlineClassificationService;
|
||||||
|
|
||||||
public void classifyDocument(ClassificationDocument document) {
|
public void classifyDocument(ClassificationDocument document) {
|
||||||
|
|
||||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||||
|
|
||||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||||
|
|
||||||
|
headlineClassificationService.resetContext();
|
||||||
|
|
||||||
for (ClassificationPage page : document.getPages()) {
|
for (ClassificationPage page : document.getPages()) {
|
||||||
classifyPage(page, document, headlineFontSizes);
|
classifyPage(page, document, headlineFontSizes);
|
||||||
}
|
}
|
||||||
@ -47,6 +51,10 @@ public class ClarifyndClassificationService {
|
|||||||
|
|
||||||
var bodyTextFrame = page.getBodyTextFrame();
|
var bodyTextFrame = page.getBodyTextFrame();
|
||||||
|
|
||||||
|
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||||
|
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
return;
|
return;
|
||||||
@ -79,7 +87,8 @@ public class ClarifyndClassificationService {
|
|||||||
|
|
||||||
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||||
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(i));
|
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
|
||||||
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -89,7 +98,8 @@ public class ClarifyndClassificationService {
|
|||||||
.getTextPositions()
|
.getTextPositions()
|
||||||
.get(0)
|
.get(0)
|
||||||
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
|
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
|
||||||
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||||
|
|||||||
@ -6,6 +6,7 @@ import java.util.regex.Matcher;
|
|||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
@ -23,6 +24,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class DocuMineClassificationService {
|
public class DocuMineClassificationService {
|
||||||
|
|
||||||
|
private final HeadlineClassificationService headlineClassificationService;
|
||||||
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||||
@ -34,6 +36,8 @@ public class DocuMineClassificationService {
|
|||||||
|
|
||||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||||
|
|
||||||
|
headlineClassificationService.resetContext();
|
||||||
|
|
||||||
for (ClassificationPage page : document.getPages()) {
|
for (ClassificationPage page : document.getPages()) {
|
||||||
classifyPage(page, document, headlineFontSizes);
|
classifyPage(page, document, headlineFontSizes);
|
||||||
}
|
}
|
||||||
@ -59,7 +63,8 @@ public class DocuMineClassificationService {
|
|||||||
Matcher matcher2 = pattern2.matcher(textBlock.toString());
|
Matcher matcher2 = pattern2.matcher(textBlock.toString());
|
||||||
Matcher matcher3 = pattern3.matcher(textBlock.toString());
|
Matcher matcher3 = pattern3.matcher(textBlock.toString());
|
||||||
|
|
||||||
if(textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||||
|
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||||
@ -67,46 +72,57 @@ public class DocuMineClassificationService {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||||
) {
|
.getMostPopular())) {
|
||||||
textBlock.setClassification(PageBlockType.HEADER);
|
textBlock.setClassification(PageBlockType.HEADER);
|
||||||
|
|
||||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||||
) {
|
.getMostPopular())) {
|
||||||
textBlock.setClassification(PageBlockType.FOOTER);
|
textBlock.setClassification(PageBlockType.FOOTER);
|
||||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||||
.size() == 1)) {
|
|
||||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||||
textBlock.setClassification(PageBlockType.TITLE);
|
textBlock.setClassification(PageBlockType.TITLE);
|
||||||
}
|
}
|
||||||
} else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
|
} else if (textBlock.getText().length() > 5
|
||||||
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
|
&& (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
|
||||||
|
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
|
||||||
|
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||||
|
|
||||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
||||||
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
|
.contains(":")
|
||||||
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
|
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString().contains(":")
|
||||||
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
|
|| textBlock.toString().startsWith("APPENDIX")
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
|| textBlock.toString().startsWith("FIGURE")
|
||||||
|
|| textBlock.toString().startsWith("TABLE"))
|
||||||
|
&& !textBlock.toString().endsWith(":")
|
||||||
|
&& matcher2.find()) {
|
||||||
|
PageBlockType headlineType = PageBlockType.getHeadlineType(1);
|
||||||
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
|
|
||||||
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
|
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
PageBlockType headlineType = PageBlockType.getHeadlineType(2);
|
||||||
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||||
|
&& textBlock.getMostPopularWordStyle().equals("bold")
|
||||||
|
&& !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
&& textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
|
||||||
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
&& textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
|
||||||
|
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
|
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||||
.getMostPopular()
|
&& textBlock.getMostPopularWordStyle().equals("italic")
|
||||||
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
&& !document.getFontStyleCounter().getMostPopular().equals("italic")
|
||||||
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||||
|
|||||||
@ -0,0 +1,61 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
public class HeadlineClassificationService {
|
||||||
|
|
||||||
|
TextPageBlock lastHeadline;
|
||||||
|
PageBlockType originalClassifiedBlockType;
|
||||||
|
TextPageBlock lastHeadlineFromOutline;
|
||||||
|
|
||||||
|
public void resetContext() {
|
||||||
|
setLastHeadline(null);
|
||||||
|
setOriginalClassifiedBlockType(null);
|
||||||
|
setLastHeadlineFromOutline(null);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) {
|
||||||
|
|
||||||
|
this.lastHeadlineFromOutline = lastHeadlineFromOutline;
|
||||||
|
this.setLastHeadline(lastHeadlineFromOutline);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void classifyHeadline(TextPageBlock textBlock, PageBlockType headlineType) {
|
||||||
|
|
||||||
|
TextPageBlock lastHeadline = getLastHeadline();
|
||||||
|
TextPageBlock lastHeadlineFromOutline = getLastHeadlineFromOutline();
|
||||||
|
PageBlockType originalClassifiedBlockType = getOriginalClassifiedBlockType();
|
||||||
|
|
||||||
|
if (lastHeadline != null) {
|
||||||
|
|
||||||
|
if (lastHeadline.equals(lastHeadlineFromOutline)) {
|
||||||
|
|
||||||
|
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1);
|
||||||
|
|
||||||
|
} else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) {
|
||||||
|
|
||||||
|
PageBlockType lastHeadlineType = lastHeadline.getClassification();
|
||||||
|
int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType);
|
||||||
|
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + difference);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
setOriginalClassifiedBlockType(headlineType);
|
||||||
|
textBlock.setClassification(headlineType);
|
||||||
|
setLastHeadline(textBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,7 +1,5 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||||
|
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
|
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
@ -16,7 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@ -25,6 +22,8 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class RedactManagerClassificationService {
|
public class RedactManagerClassificationService {
|
||||||
|
|
||||||
|
private final HeadlineClassificationService headlineClassificationService;
|
||||||
|
|
||||||
|
|
||||||
public void classifyDocument(ClassificationDocument document) {
|
public void classifyDocument(ClassificationDocument document) {
|
||||||
|
|
||||||
@ -32,33 +31,30 @@ public class RedactManagerClassificationService {
|
|||||||
|
|
||||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||||
|
|
||||||
HeadLineClassificationContext headLineClassificationContext = new HeadLineClassificationContext();
|
headlineClassificationService.resetContext();
|
||||||
|
|
||||||
for (ClassificationPage page : document.getPages()) {
|
for (ClassificationPage page : document.getPages()) {
|
||||||
classifyPage(page, document, headlineFontSizes, headLineClassificationContext);
|
classifyPage(page, document, headlineFontSizes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes, HeadLineClassificationContext headLineClassificationContext) {
|
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||||
|
|
||||||
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
||||||
if (textBlock instanceof TextPageBlock) {
|
if (textBlock instanceof TextPageBlock) {
|
||||||
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes, headLineClassificationContext);
|
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void classifyBlock(TextPageBlock textBlock,
|
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||||
ClassificationPage page,
|
|
||||||
ClassificationDocument document,
|
|
||||||
List<Float> headlineFontSizes,
|
|
||||||
HeadLineClassificationContext headLineClassificationContext) {
|
|
||||||
|
|
||||||
var bodyTextFrame = page.getBodyTextFrame();
|
var bodyTextFrame = page.getBodyTextFrame();
|
||||||
|
|
||||||
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||||
headLineClassificationContext.setLastHeadlineFromOutline(textBlock);
|
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||||
@ -72,7 +68,8 @@ public class RedactManagerClassificationService {
|
|||||||
.anyMatch(graphic -> graphic.getPosition().intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()))) {
|
.anyMatch(graphic -> graphic.getPosition().intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()))) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
return;
|
return;
|
||||||
} if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
}
|
||||||
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||||
.getMostPopular())) {
|
.getMostPopular())) {
|
||||||
@ -100,7 +97,7 @@ public class RedactManagerClassificationService {
|
|||||||
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||||
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||||
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
|
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
|
||||||
classifyHeadline(textBlock, headLineClassificationContext, headlineType);
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -113,7 +110,7 @@ public class RedactManagerClassificationService {
|
|||||||
.get(0).getTextPositions()
|
.get(0).getTextPositions()
|
||||||
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||||
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
|
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
|
||||||
classifyHeadline(textBlock, headLineClassificationContext, headlineType);
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||||
@ -138,55 +135,4 @@ public class RedactManagerClassificationService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void classifyHeadline(TextPageBlock textBlock, HeadLineClassificationContext headLineClassificationContext, PageBlockType headlineType) {
|
|
||||||
|
|
||||||
TextPageBlock lastHeadline = headLineClassificationContext.getLastHeadline();
|
|
||||||
TextPageBlock lastHeadlineFromOutline = headLineClassificationContext.getLastHeadlineFromOutline();
|
|
||||||
PageBlockType originalClassifiedBlockType = headLineClassificationContext.getOriginalClassifiedBlockType();
|
|
||||||
|
|
||||||
if (lastHeadline != null) {
|
|
||||||
|
|
||||||
if (lastHeadline.equals(lastHeadlineFromOutline)) {
|
|
||||||
|
|
||||||
headlineType = getNextType(lastHeadline.getClassification());
|
|
||||||
|
|
||||||
} else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) {
|
|
||||||
|
|
||||||
PageBlockType lastHeadlineType = lastHeadline.getClassification();
|
|
||||||
int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType);
|
|
||||||
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + difference);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
headLineClassificationContext.setOriginalClassifiedBlockType(headlineType);
|
|
||||||
textBlock.setClassification(headlineType);
|
|
||||||
headLineClassificationContext.setLastHeadline(textBlock);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static PageBlockType getNextType(PageBlockType pageBlockType) {
|
|
||||||
|
|
||||||
return PageBlockType.getHeadlineType(getHeadlineNumber(pageBlockType) + 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Data
|
|
||||||
static class HeadLineClassificationContext {
|
|
||||||
|
|
||||||
TextPageBlock lastHeadline;
|
|
||||||
PageBlockType originalClassifiedBlockType;
|
|
||||||
TextPageBlock lastHeadlineFromOutline;
|
|
||||||
|
|
||||||
|
|
||||||
public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) {
|
|
||||||
|
|
||||||
this.lastHeadlineFromOutline = lastHeadlineFromOutline;
|
|
||||||
this.setLastHeadline(lastHeadlineFromOutline);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -7,7 +7,6 @@ import static java.util.stream.Collectors.toList;
|
|||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
@ -16,7 +15,6 @@ import java.util.Optional;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
|||||||
@ -1,25 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
|
|
||||||
public class OutlineProcessingTest extends BuildDocumentTest {
|
|
||||||
|
|
||||||
@Autowired
|
|
||||||
OutlineExtractorService outlineExtractorService;
|
|
||||||
@Autowired
|
|
||||||
BlockificationPostprocessingService blockificationPostprocessingService;
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@SneakyThrows
|
|
||||||
public void test() {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,12 +1,8 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.Disabled;
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
@ -27,80 +23,11 @@ import lombok.SneakyThrows;
|
|||||||
|
|
||||||
public class ViewerDocumentTest extends BuildDocumentTest {
|
public class ViewerDocumentTest extends BuildDocumentTest {
|
||||||
|
|
||||||
@Test
|
|
||||||
@SneakyThrows
|
|
||||||
@Disabled
|
|
||||||
public void testViewerDocuments() {
|
|
||||||
|
|
||||||
String directory = "files/syngenta_190_deduplicated/";
|
|
||||||
Path dirPath = new ClassPathResource(directory).getFile().toPath();
|
|
||||||
|
|
||||||
// Ensure the directory exists and is accessible
|
|
||||||
if (!Files.exists(dirPath) || !Files.isDirectory(dirPath)) {
|
|
||||||
throw new IllegalArgumentException("The specified path must be a directory and it must exist.");
|
|
||||||
}
|
|
||||||
|
|
||||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
|
||||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
|
||||||
|
|
||||||
// Use try-with-resources to ensure the stream is closed after use
|
|
||||||
try (Stream<Path> paths = Files.walk(dirPath)) {
|
|
||||||
paths.filter(Files::isRegularFile)
|
|
||||||
.filter(path -> path.toString().endsWith(".pdf")) // Filter to process only PDF files
|
|
||||||
.forEach(path -> processFile(path, layoutGridService));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void processFile(Path filePath, LayoutGridService layoutGridService) {
|
|
||||||
|
|
||||||
try {
|
|
||||||
File documentFile = filePath.toFile();
|
|
||||||
String tmpFileName = "/tmp/" + filePath.getFileName().toString() + "_VIEWER.pdf";
|
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
|
||||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
|
||||||
documentFile,
|
|
||||||
new ImageServiceResponse(),
|
|
||||||
new TableServiceResponse(),
|
|
||||||
new VisualLayoutParsingResponse(),
|
|
||||||
Map.of("file", filePath.getFileName().toFile().toString()));
|
|
||||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
|
|
||||||
|
|
||||||
if (classificationDocument.getOutlineObjectTree().getRootNodes().size() > 1) {
|
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
|
||||||
System.out.printf("Processed %s in %.2fs%n", filePath, ((float) (System.currentTimeMillis() - start)) / 1000);
|
|
||||||
}
|
|
||||||
} catch (Exception exception)
|
|
||||||
{
|
|
||||||
System.out.println(exception);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
//String fileName = "files/documine/21_TiltPlus_MutacaoGenicaEmCelulasBacterianas.pdf";//fail here
|
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||||
|
|
||||||
//String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf";
|
|
||||||
//String fileName = "files/documine/Study Document 3 - Acute Eye IrritationCorrosion - Rabbits.pdf";
|
|
||||||
//String fileName = "files/documine/VV-547521_Irritação_Ocular_in_Vivo.pdf";
|
|
||||||
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
|
|
||||||
//String fileName = "files/new/UTT-Books-53.pdf";
|
|
||||||
//String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
|
|
||||||
//String fileName = "files/documine/A16361B - Acute Dermal Irritation Toxicity Study in Rabbits.pdf";
|
|
||||||
//String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf";
|
|
||||||
//String fileName = "files/documine/VV-547523_LLNA.pdf";
|
|
||||||
//String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
|
||||||
//String fileName = "files/syngenta_190_deduplicated/1 Abamectin_prr.pdf";
|
|
||||||
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
|
|
||||||
//String fileName = "files/new/kaust-official-thesis-template.pdf";
|
|
||||||
//String fileName = "files/new/$100m Offers.pdf";
|
|
||||||
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
|
||||||
//String fileName = "files/new/mistitled_outlines_example.pdf";
|
|
||||||
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
|
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
@ -108,39 +35,12 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@SneakyThrows
|
|
||||||
public void testViewerDocumentWithImages() {
|
|
||||||
|
|
||||||
String fileName = "files/new/UTT-Books-53.pdf";
|
|
||||||
Path path = Path.of(fileName);
|
|
||||||
String tmpFileName = "/tmp/" + path.getFileName() + "_VIEWER.pdf";
|
|
||||||
String imageFileName = "files/images/test_outlines.IMAGE_INFO.json";
|
|
||||||
|
|
||||||
var mapper = ObjectMapperFactory.create();
|
|
||||||
var imageServiceResponse = mapper.readValue(new ClassPathResource(imageFileName).getInputStream(), ImageServiceResponse.class);
|
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
|
||||||
|
|
||||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
|
||||||
documentFile,
|
|
||||||
imageServiceResponse,
|
|
||||||
new TableServiceResponse(),
|
|
||||||
new VisualLayoutParsingResponse(),
|
|
||||||
Map.of("file", path.getFileName().toFile().toString()));
|
|
||||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
|
||||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
|
||||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
|
|
||||||
|
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Disabled
|
@Disabled
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
@ -148,19 +48,18 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
|
|
||||||
String fileName = "files/cv_tables/brokenTablesOnOcr_ocred.pdf";
|
String fileName = "files/cv_tables/brokenTablesOnOcr_ocred.pdf";
|
||||||
String tableFileName = "files/cv_tables/brokenTablesOnOcr_ocred.TABLES.json";
|
String tableFileName = "files/cv_tables/brokenTablesOnOcr_ocred.TABLES.json";
|
||||||
Path path = Path.of(fileName);
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
String tmpFileName = "/tmp/" + path.getFileName() + "_VIEWER.pdf";
|
|
||||||
|
|
||||||
var mapper = ObjectMapperFactory.create();
|
var mapper = ObjectMapperFactory.create();
|
||||||
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
|
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
|
|
||||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||||
documentFile,
|
documentFile,
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
tableResponse,
|
tableResponse,
|
||||||
new VisualLayoutParsingResponse(),
|
new VisualLayoutParsingResponse(),
|
||||||
Map.of("file", path.getFileName().toFile().toString()));
|
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
|
||||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);
|
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);
|
||||||
|
|||||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user