RED-7074: Design Subsection section tree structure algorithm

* applied the RedactManager headline classification logic to the DocuMine and Clarifynd classification services
* refactored headline classification
* added supersection for non-leaf sections (containing other sections instead of only paragraphs, images, ...)
* fixed edge cases in which certain files ended up in an error state
This commit is contained in:
maverickstuder 2024-05-15 10:29:39 +02:00
parent 1856fed640
commit 2d33615b94
13 changed files with 137 additions and 261 deletions

View File

@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
@ -18,7 +17,6 @@ public class ClassificationDocument {
private List<ClassificationPage> pages = new ArrayList<>();
private List<ClassificationSection> sections = new ArrayList<>();
//private Map<TextPageBlock, List<AbstractPageBlock>> sectionsMap = new HashMap<>();
private List<ClassificationHeader> headers = new ArrayList<>();
private List<ClassificationFooter> footers = new ArrayList<>();
private List<UnclassifiedText> unclassifiedTexts = new ArrayList<>();

View File

@ -1,10 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

View File

@ -1,16 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.SuperBuilder;
@ -20,14 +9,4 @@ import lombok.experimental.SuperBuilder;
/**
 * A {@link Section} subtype that declares no members of its own in the lines shown here.
 * NOTE(review): the accompanying commit message describes it as the node type for non-leaf
 * sections (sections that contain other sections) - confirm against the graph-building code.
 */
@EqualsAndHashCode(callSuper = true)
public class SuperSection extends Section {
/**
 * Creates the node by forwarding every argument, unchanged and in the same order,
 * to the {@link Section} constructor.
 */
public SuperSection(Set<LayoutEngine> engines,
List<Integer> treeId,
TextBlock textBlock,
DocumentTree documentTree,
Set<RedactionEntity> entities,
Map<Page, Rectangle2D> bBoxCache) {
super(engines, treeId, textBlock, documentTree, entities, bBoxCache);
}
}

View File

@ -4,7 +4,6 @@ import java.awt.geom.Point2D;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.RequiredArgsConstructor;
@Data

View File

@ -1,12 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.List;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
@Data
public class OutlineObjectTreeNode {

View File

@ -21,12 +21,16 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class ClarifyndClassificationService {
private final HeadlineClassificationService headlineClassificationService;
public void classifyDocument(ClassificationDocument document) {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
headlineClassificationService.resetContext();
for (ClassificationPage page : document.getPages()) {
classifyPage(page, document, headlineFontSizes);
}
@ -47,6 +51,10 @@ public class ClarifyndClassificationService {
var bodyTextFrame = page.getBodyTextFrame();
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
@ -79,7 +87,8 @@ public class ClarifyndClassificationService {
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
textBlock.setClassification(PageBlockType.getHeadlineType(i));
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
}
}
@ -89,7 +98,8 @@ public class ClarifyndClassificationService {
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {

View File

@ -6,6 +6,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@ -23,6 +24,7 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class DocuMineClassificationService {
private final HeadlineClassificationService headlineClassificationService;
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
@ -34,6 +36,8 @@ public class DocuMineClassificationService {
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
headlineClassificationService.resetContext();
for (ClassificationPage page : document.getPages()) {
classifyPage(page, document, headlineFontSizes);
}
@ -59,7 +63,8 @@ public class DocuMineClassificationService {
Matcher matcher2 = pattern2.matcher(textBlock.toString());
Matcher matcher3 = pattern3.matcher(textBlock.toString());
if(textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
@ -67,46 +72,57 @@ public class DocuMineClassificationService {
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
) {
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
) {
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
}
} else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
} else if (textBlock.getText().length() > 5
&& (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
textBlock.setClassification(PageBlockType.getHeadlineType(1));
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
.contains(":")
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString().contains(":")
|| textBlock.toString().startsWith("APPENDIX")
|| textBlock.toString().startsWith("FIGURE")
|| textBlock.toString().startsWith("TABLE"))
&& !textBlock.toString().endsWith(":")
&& matcher2.find()) {
PageBlockType headlineType = PageBlockType.getHeadlineType(1);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
textBlock.setClassification(PageBlockType.getHeadlineType(2));
PageBlockType headlineType = PageBlockType.getHeadlineType(2);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
&& textBlock.getMostPopularWordStyle().equals("bold")
&& !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
&& textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
.getMostPopular()
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
&& textBlock.getMostPopularWordStyle().equals("italic")
&& !document.getFontStyleCounter().getMostPopular().equals("italic")
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);

View File

@ -0,0 +1,61 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.Getter;
import lombok.Setter;
/**
 * Keeps track of the most recently seen headline while the text blocks of a document are
 * classified, and adjusts the level of each newly detected headline relative to it.
 *
 * <p>Callers invoke {@link #resetContext()} before walking a document, report headlines that
 * already carry a headline classification via {@link #setLastHeadlineFromOutline(TextPageBlock)},
 * and route every newly detected headline through
 * {@link #classifyHeadline(TextPageBlock, PageBlockType)}.
 *
 * <p>NOTE(review): the tracking state lives in instance fields of a {@code @Service} bean, so all
 * users of the bean share it - confirm that documents are never classified concurrently.
 */
@Service
@Getter
@Setter
public class HeadlineClassificationService {

    // Headline seen most recently, regardless of how it was classified.
    TextPageBlock lastHeadline;
    // Type that classifyHeadline assigned to the most recent headline it processed.
    PageBlockType originalClassifiedBlockType;
    // Headline most recently reported through setLastHeadlineFromOutline.
    TextPageBlock lastHeadlineFromOutline;


    /**
     * Clears all three tracking fields so that no state carries over into the next run.
     */
    public void resetContext() {

        setOriginalClassifiedBlockType(null);
        setLastHeadlineFromOutline(null);
        setLastHeadline(null);
    }


    /**
     * Records a headline whose classification was already present and makes it the last
     * seen headline as well.
     *
     * @param lastHeadlineFromOutline the pre-classified headline block
     */
    public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) {

        this.setLastHeadline(lastHeadlineFromOutline);
        this.lastHeadlineFromOutline = lastHeadlineFromOutline;
    }


    /**
     * Assigns a headline type to {@code textBlock}, possibly adjusting the proposed level:
     * <ul>
     * <li>if the previous headline is the one last reported via
     * {@link #setLastHeadlineFromOutline(TextPageBlock)}, the new headline is placed one level
     * below that headline's classification;</li>
     * <li>otherwise, if the previous headline's current classification differs from the type
     * recorded when it was classified here, the proposed level is shifted by
     * (recorded level - current level);</li>
     * <li>otherwise the proposed type is used as is.</li>
     * </ul>
     * The resulting type is stored as the recorded type, set on {@code textBlock}, and
     * {@code textBlock} becomes the last seen headline.
     *
     * @param textBlock    the block to classify as a headline
     * @param headlineType the headline type proposed by the caller
     */
    public void classifyHeadline(TextPageBlock textBlock, PageBlockType headlineType) {

        TextPageBlock previous = getLastHeadline();
        TextPageBlock previousFromOutline = getLastHeadlineFromOutline();
        PageBlockType recordedType = getOriginalClassifiedBlockType();

        PageBlockType resolvedType = headlineType;
        if (previous != null) {
            if (previous.equals(previousFromOutline)) {
                int parentLevel = getHeadlineNumber(previous.getClassification());
                resolvedType = PageBlockType.getHeadlineType(parentLevel + 1);
            } else if (recordedType != null && previous.getClassification() != recordedType) {
                PageBlockType currentType = previous.getClassification();
                int shift = getHeadlineNumber(recordedType) - getHeadlineNumber(currentType);
                resolvedType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + shift);
            }
        }

        setOriginalClassifiedBlockType(resolvedType);
        textBlock.setClassification(resolvedType);
        setLastHeadline(textBlock);
    }

}

View File

@ -1,7 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
import java.util.List;
import java.util.regex.Pattern;
@ -16,7 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -25,6 +22,8 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class RedactManagerClassificationService {
private final HeadlineClassificationService headlineClassificationService;
public void classifyDocument(ClassificationDocument document) {
@ -32,33 +31,30 @@ public class RedactManagerClassificationService {
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
HeadLineClassificationContext headLineClassificationContext = new HeadLineClassificationContext();
headlineClassificationService.resetContext();
for (ClassificationPage page : document.getPages()) {
classifyPage(page, document, headlineFontSizes, headLineClassificationContext);
classifyPage(page, document, headlineFontSizes);
}
}
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes, HeadLineClassificationContext headLineClassificationContext) {
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes, headLineClassificationContext);
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
}
}
}
private void classifyBlock(TextPageBlock textBlock,
ClassificationPage page,
ClassificationDocument document,
List<Float> headlineFontSizes,
HeadLineClassificationContext headLineClassificationContext) {
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame();
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
headLineClassificationContext.setLastHeadlineFromOutline(textBlock);
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
return;
}
if (document.getFontSizeCounter().getMostPopular() == null) {
@ -72,7 +68,8 @@ public class RedactManagerClassificationService {
.anyMatch(graphic -> graphic.getPosition().intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()))) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
} if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
@ -100,7 +97,7 @@ public class RedactManagerClassificationService {
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
classifyHeadline(textBlock, headLineClassificationContext, headlineType);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
}
}
@ -113,7 +110,7 @@ public class RedactManagerClassificationService {
.get(0).getTextPositions()
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
classifyHeadline(textBlock, headLineClassificationContext, headlineType);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
@ -138,55 +135,4 @@ public class RedactManagerClassificationService {
}
}
/**
 * Sets a headline classification on {@code textBlock}, adjusting the proposed type against the
 * headline state held in {@code headLineClassificationContext}, then updates that context.
 *
 * @param textBlock                     the block to classify as a headline
 * @param headLineClassificationContext holder of the last headline, the last outline headline and
 *                                      the type recorded for the previously classified headline
 * @param headlineType                  the headline type proposed by the caller
 */
private static void classifyHeadline(TextPageBlock textBlock, HeadLineClassificationContext headLineClassificationContext, PageBlockType headlineType) {
TextPageBlock lastHeadline = headLineClassificationContext.getLastHeadline();
TextPageBlock lastHeadlineFromOutline = headLineClassificationContext.getLastHeadlineFromOutline();
PageBlockType originalClassifiedBlockType = headLineClassificationContext.getOriginalClassifiedBlockType();
if (lastHeadline != null) {
if (lastHeadline.equals(lastHeadlineFromOutline)) {
// previous headline is the one last reported from the outline: go one level below it
headlineType = getNextType(lastHeadline.getClassification());
} else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) {
// previous headline's classification no longer matches the recorded type:
// shift the proposed level by (recorded level - current level)
PageBlockType lastHeadlineType = lastHeadline.getClassification();
int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType);
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + difference);
}
}
// record the resolved type and make this block the last seen headline
headLineClassificationContext.setOriginalClassifiedBlockType(headlineType);
textBlock.setClassification(headlineType);
headLineClassificationContext.setLastHeadline(textBlock);
}
/**
 * Returns the headline type whose headline number is one greater than that of {@code pageBlockType}.
 *
 * @param pageBlockType the type whose headline number is read via {@code getHeadlineNumber}
 * @return the result of {@code PageBlockType.getHeadlineType} for that number plus one
 */
private static PageBlockType getNextType(PageBlockType pageBlockType) {
return PageBlockType.getHeadlineType(getHeadlineNumber(pageBlockType) + 1);
}
/**
 * Mutable holder for the headline state that classifyHeadline reads and updates.
 */
@Data
static class HeadLineClassificationContext {
// headline seen most recently
TextPageBlock lastHeadline;
// type that classifyHeadline assigned to the most recent headline it processed
PageBlockType originalClassifiedBlockType;
// headline most recently reported through setLastHeadlineFromOutline
TextPageBlock lastHeadlineFromOutline;
/**
 * Stores the given block as the last outline headline and also as the last headline.
 */
public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) {
this.lastHeadlineFromOutline = lastHeadlineFromOutline;
this.setLastHeadline(lastHeadlineFromOutline);
}
}
}

View File

@ -7,7 +7,6 @@ import static java.util.stream.Collectors.toList;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
@ -16,7 +15,6 @@ import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;

View File

@ -1,25 +0,0 @@
package com.knecon.fforesight.service.layoutparser.server.graph;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import lombok.SneakyThrows;
/**
 * Test class extending {@link BuildDocumentTest} that injects the outline extractor and the
 * blockification post-processing service. Its single test method has an empty body, so it
 * asserts nothing.
 */
public class OutlineProcessingTest extends BuildDocumentTest {
@Autowired
OutlineExtractorService outlineExtractorService;
@Autowired
BlockificationPostprocessingService blockificationPostprocessingService;
// Empty placeholder: no statements are executed and nothing is verified.
@Test
@SneakyThrows
public void test() {
}
}

View File

@ -1,12 +1,8 @@
package com.knecon.fforesight.service.layoutparser.server.graph;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
@ -27,80 +23,11 @@ import lombok.SneakyThrows;
public class ViewerDocumentTest extends BuildDocumentTest {
/**
 * Disabled bulk run: walks the classpath directory {@code files/syngenta_190_deduplicated/}
 * and passes every regular file whose path ends in {@code .pdf} to {@code processFile}.
 * Throws {@link IllegalArgumentException} if the path does not exist or is not a directory.
 */
@Test
@SneakyThrows
@Disabled
public void testViewerDocuments() {
String directory = "files/syngenta_190_deduplicated/";
Path dirPath = new ClassPathResource(directory).getFile().toPath();
// Ensure the directory exists and is accessible
if (!Files.exists(dirPath) || !Files.isDirectory(dirPath)) {
throw new IllegalArgumentException("The specified path must be a directory and it must exist.");
}
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
// Use try-with-resources to ensure the stream is closed after use
try (Stream<Path> paths = Files.walk(dirPath)) {
paths.filter(Files::isRegularFile)
.filter(path -> path.toString().endsWith(".pdf")) // Filter to process only PDF files
.forEach(path -> processFile(path, layoutGridService));
}
}
/**
 * Parses one PDF with the REDACT_MANAGER layout type, builds its document graph and, only when
 * the outline object tree has more than one root node, writes a layout-grid viewer copy to
 * {@code /tmp/<file name>_VIEWER.pdf} and prints the elapsed time.
 *
 * @param filePath          path of the PDF to process
 * @param layoutGridService service used to write the layout grid
 */
private void processFile(Path filePath, LayoutGridService layoutGridService) {
try {
File documentFile = filePath.toFile();
String tmpFileName = "/tmp/" + filePath.getFileName().toString() + "_VIEWER.pdf";
long start = System.currentTimeMillis();
// image, table and visual-layout responses are passed as freshly constructed instances
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
documentFile,
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
Map.of("file", filePath.getFileName().toFile().toString()));
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
if (classificationDocument.getOutlineObjectTree().getRootNodes().size() > 1) {
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
System.out.printf("Processed %s in %.2fs%n", filePath, ((float) (System.currentTimeMillis() - start)) / 1000);
}
} catch (Exception exception)
{
// NOTE(review): any failure is only printed, so the walk over the remaining files continues
// and the calling test cannot fail because of it - confirm this best-effort behaviour is intended.
System.out.println(exception);
}
}
@Test
@SneakyThrows
public void testViewerDocument() {
//String fileName = "files/documine/21_TiltPlus_MutacaoGenicaEmCelulasBacterianas.pdf";//fail here
//String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf";
//String fileName = "files/documine/Study Document 3 - Acute Eye IrritationCorrosion - Rabbits.pdf";
//String fileName = "files/documine/VV-547521_Irritação_Ocular_in_Vivo.pdf";
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
//String fileName = "files/new/UTT-Books-53.pdf";
//String fileName = "files/documine/A21924A - Acute Oral Toxicity - Rats.pdf";
//String fileName = "files/documine/A16361B - Acute Dermal Irritation Toxicity Study in Rabbits.pdf";
//String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf";
//String fileName = "files/documine/VV-547523_LLNA.pdf";
//String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
//String fileName = "files/syngenta_190_deduplicated/1 Abamectin_prr.pdf";
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
//String fileName = "files/new/kaust-official-thesis-template.pdf";
//String fileName = "files/new/$100m Offers.pdf";
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
//String fileName = "files/new/mistitled_outlines_example.pdf";
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();
@ -108,39 +35,12 @@ public class ViewerDocumentTest extends BuildDocumentTest {
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
long start = System.currentTimeMillis();
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}
/**
 * Parses {@code files/new/UTT-Books-53.pdf} with the REDACT_MANAGER layout type, supplying an
 * image-service response deserialized from {@code files/images/test_outlines.IMAGE_INFO.json},
 * then builds the document graph and writes a layout-grid viewer copy to
 * {@code /tmp/<file name>_VIEWER.pdf}. The method contains no assertions.
 */
@Test
@SneakyThrows
public void testViewerDocumentWithImages() {
String fileName = "files/new/UTT-Books-53.pdf";
Path path = Path.of(fileName);
String tmpFileName = "/tmp/" + path.getFileName() + "_VIEWER.pdf";
String imageFileName = "files/images/test_outlines.IMAGE_INFO.json";
var mapper = ObjectMapperFactory.create();
var imageServiceResponse = mapper.readValue(new ClassPathResource(imageFileName).getInputStream(), ImageServiceResponse.class);
var documentFile = new ClassPathResource(fileName).getFile();
// table and visual-layout responses are passed as freshly constructed instances
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
documentFile,
imageServiceResponse,
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
Map.of("file", path.getFileName().toFile().toString()));
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
}
@Test
@Disabled
@SneakyThrows
@ -148,19 +48,18 @@ public class ViewerDocumentTest extends BuildDocumentTest {
String fileName = "files/cv_tables/brokenTablesOnOcr_ocred.pdf";
String tableFileName = "files/cv_tables/brokenTablesOnOcr_ocred.TABLES.json";
Path path = Path.of(fileName);
String tmpFileName = "/tmp/" + path.getFileName() + "_VIEWER.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var mapper = ObjectMapperFactory.create();
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
var documentFile = new ClassPathResource(fileName).getFile();
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
documentFile,
new ImageServiceResponse(),
tableResponse,
new VisualLayoutParsingResponse(),
Map.of("file", path.getFileName().toFile().toString()));
documentFile,
new ImageServiceResponse(),
tableResponse,
new VisualLayoutParsingResponse(),
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);