RED-7074: Design Subsection section tree structure algorithm

* refactoring
This commit is contained in:
maverickstuder 2024-05-02 10:36:36 +02:00
parent 9bf2f5c56c
commit f7aeb9a406
12 changed files with 131 additions and 310 deletions

View File

@ -29,7 +29,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@ -55,6 +58,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.classificat
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
@ -357,7 +361,7 @@ public class LayoutParsingPipeline {
.stream()
.flatMap(classificationPage -> classificationPage.getTextBlocks()
.stream()
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification().isHeadline())
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
.map(tb -> (TextPageBlock) tb))
.toList();
TableOfContents tableOfContents = outlineValidationService.createToC(headlines);
@ -368,9 +372,6 @@ public class LayoutParsingPipeline {
switch (layoutParsingType) {
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
default -> {
sectionsBuilderService.buildSections(classificationDocument);
sectionsBuilderService.addImagesToSections(classificationDocument);
tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument);
}
}

View File

@ -31,6 +31,19 @@ public enum PageBlockType {
}
/**
 * Translates a headline block type into its numeric level.
 *
 * @param pageBlockType the block type to translate
 * @return 1 to 5 for {@code H1} to {@code H5}; 6 for every other constant
 */
public static int getHeadlineNumber(PageBlockType pageBlockType) {
    switch (pageBlockType) {
        case H1:
            return 1;
        case H2:
            return 2;
        case H3:
            return 3;
        case H4:
            return 4;
        case H5:
            return 5;
        default:
            // H6 and all non-headline types share the deepest level
            return 6;
    }
}
public boolean isHeadline() {
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);

View File

@ -26,10 +26,6 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocume
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTreeNode;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@ -53,8 +49,10 @@ public class OutlineExtractorService {
PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
for (PDOutlineItem child : documentOutline.children()) {
rootNodes.add(createOutlineObjectWithChildren(child, document, 1));
if (documentOutline != null) {
for (PDOutlineItem child : documentOutline.children()) {
rootNodes.add(createOutlineObjectWithChildren(child, document, 1));
}
}
return new OutlineObjectTree(rootNodes);

View File

@ -1,16 +1,15 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.Point2D;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.TreeSet;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.extern.slf4j.Slf4j;
@ -19,79 +18,6 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
public class OutlineValidationService {
/**
 * Builds a table of contents from the outline headlines and walks the table of contents
 * built from all headlines, handing every item that is missing from the outline-based
 * one to {@code addItemAtCorrectPosition}.
 *
 * @param allHeadlines             every headline found in the document
 * @param headlinesFromOutlines    the headlines that originate from the outline
 * @param newlyClassifiedHeadlines not read by this method
 * @return the table of contents built from {@code headlinesFromOutlines}
 */
public TableOfContents validateWithToC(List<TextPageBlock> allHeadlines, List<TextPageBlock> headlinesFromOutlines, List<TextPageBlock> newlyClassifiedHeadlines) {
    TableOfContents outlineToc = createToC(headlinesFromOutlines);
    TableOfContents completeToc = createToC(allHeadlines);
    // most recent item that is already part of the outline-based ToC
    TableOfContentItem anchor = null;
    for (TableOfContentItem candidate : completeToc.getAllTableOfContentItems()) {
        if (containsItem(outlineToc, candidate)) {
            anchor = candidate;
            continue;
        }
        addItemAtCorrectPosition(outlineToc, candidate, anchor);
    }
    return outlineToc;
}
/**
 * Checks whether a text block is the headline of a main section or is reported as
 * contained by one of the main sections.
 *
 * @param toc   the table of contents to search
 * @param block the block to look for
 * @return {@code true} as soon as one main section matches, otherwise {@code false}
 */
private boolean containsBlock(TableOfContents toc, TextPageBlock block) {
    return toc.getMainSections()
            .stream()
            .anyMatch(section -> section.getHeadline().equals(block) || section.contains(block));
}
/**
 * Checks whether an item equals one of the main sections or is reported as contained
 * by one of them.
 *
 * @param toc     the table of contents to search
 * @param tocItem the item to look for
 * @return {@code true} as soon as one main section matches, otherwise {@code false}
 */
private boolean containsItem(TableOfContents toc, TableOfContentItem tocItem) {
    return toc.getMainSections()
            .stream()
            .anyMatch(section -> section.equals(tocItem) || section.contains(tocItem));
}
/**
 * Unimplemented placeholder: the body holds only commented-out draft code, so calling
 * this method currently has no effect on {@code toc}.
 *
 * @param toc                      the table of contents the item is meant to be added to
 * @param tocItem                  the item that is missing from {@code toc}
 * @param lastHeadlineFromOutlines the last item seen that was already part of {@code toc}, may be {@code null}
 */
private void addItemAtCorrectPosition(TableOfContents toc, TableOfContentItem tocItem, TableOfContentItem lastHeadlineFromOutlines) {
// draft of the intended positioning logic, never completed:
//if (lastHeadlineFromOutlines == null || tocItem.g)
//if(!tocItem.getChildren().isEmpty()) {
//
//}
}
/**
 * Earlier variant of the table-of-contents construction.
 * A headline whose depth is less than or equal to the current parent depth opens a new
 * main section; any deeper headline is attached as a child of the current parent.
 *
 * @param headlines the headlines to arrange, processed in list order
 * @return a table of contents holding the collected main sections
 */
public TableOfContents createToCOld(List<TextPageBlock> headlines) {
List<TableOfContentItem> mainSections = new ArrayList<>();
int parentDepth = 7; // more than 6 (h6), so the first headline always opens a main section
TableOfContentItem parent = null;
for (TextPageBlock current : headlines) {
int currentDepth = getDepth(current.getClassification());
if (parentDepth >= currentDepth) {
// same level or higher than the current parent: start a new main section
parentDepth = currentDepth;
parent = new TableOfContentItem(current);
mainSections.add(parent);
} else {
// the first iteration always takes the branch above, so a parent exists here
assert (parent != null);
// NOTE(review): parent is only ever assigned a freshly created main-section item in this
// method; if such an item has no parent of its own, this loop never runs - confirm
while (parentDepth < currentDepth && parent.getParent() != null) {
parent = parent.getParent();
parentDepth = getDepth(parent.getHeadline().getClassification());
}
parent.addChild(new TableOfContentItem(current));
}
}
return new TableOfContents(mainSections);
}
public TableOfContents createToC(List<TextPageBlock> headlines) {
List<TableOfContentItem> mainSections = new ArrayList<>();
@ -100,7 +26,7 @@ public class OutlineValidationService {
TreeSet<Integer> depths = new TreeSet<>();
for (TextPageBlock current : headlines) {
int currentDepth = getDepth(current.getClassification());
int currentDepth = getHeadlineNumber(current.getClassification());
Integer parentDepth = depths.floor(currentDepth - 1);
var tocItem = new TableOfContentItem(current);
@ -110,12 +36,12 @@ public class OutlineValidationService {
} else {
assert last != null;
int lastDepth = getDepth(last.getHeadline().getClassification());
int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
if (lastDepth < parentDepth) {
parentDepth = lastDepth;
} else if (lastDepth == currentDepth && last.getParent() != null) {
parentDepth = getDepth(last.getParent().getHeadline().getClassification());
parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
}
TableOfContentItem parent = lastItemsPerDepth.get(parentDepth);
@ -130,131 +56,4 @@ public class OutlineValidationService {
return new TableOfContents(mainSections);
}
/**
 * Converts every newly classified headline into an {@link OutlineObject} built from its
 * text, page, minimum x/y position and depth. The resulting list is not stored or
 * returned, and the other two parameters are not read.
 *
 * @param outlineObjectTree        not read by this method
 * @param allHeadlines             not read by this method
 * @param newlyClassifiedHeadlines the headlines to convert
 */
public void updateOutlineObjectTree(OutlineObjectTree outlineObjectTree, List<TextPageBlock> allHeadlines, List<TextPageBlock> newlyClassifiedHeadlines) {
    List<OutlineObject> newOutlineObjects = new ArrayList<>();
    for (TextPageBlock headline : newlyClassifiedHeadlines) {
        newOutlineObjects.add(new OutlineObject(headline.getText(),
                headline.getPage(),
                new Point2D.Double(headline.getMinX(), headline.getMinY()),
                getDepth(headline.getClassification())));
    }
}
/**
 * Translates a headline block type into its nesting depth.
 *
 * @param pageBlockType the block type to translate
 * @return 1 to 5 for {@code H1} to {@code H5}; 6 for every other constant
 */
private static int getDepth(PageBlockType pageBlockType) {
    switch (pageBlockType) {
        case H1:
            return 1;
        case H2:
            return 2;
        case H3:
            return 3;
        case H4:
            return 4;
        case H5:
            return 5;
        default:
            // H6 and all non-headline types share the deepest level
            return 6;
    }
}
/**
 * Work-in-progress validation of newly classified headlines. The method only does
 * anything when the headlines that are not newly classified outnumber the newly
 * classified ones. Even then, every value it computes is a local that is never used;
 * the comparison logic itself is still commented out.
 *
 * @param allHeadlines             every headline of the document
 * @param newlyClassifiedHeadlines the subset of headlines that was newly classified
 */
public void validate(List<TextPageBlock> allHeadlines, List<TextPageBlock> newlyClassifiedHeadlines) {
// proceed only if the remaining headlines are in the majority
if (allHeadlines.size() - newlyClassifiedHeadlines.size() > newlyClassifiedHeadlines.size()) {
List<Headline> headlines = allHeadlines.stream()
.map(textPageBlock -> new Headline(textPageBlock, newlyClassifiedHeadlines.contains(textPageBlock)))
.toList();
for (TextPageBlock newHeadline : newlyClassifiedHeadlines) {
// NOTE(review): this looks up a TextPageBlock in a List<Headline>; the result is unused - confirm intent
int newHeadlineIndex = headlines.indexOf(newHeadline);
List<TextPageBlock> adjacentNewlyClassified = findAdjacentNewlyClassified(newHeadline, newlyClassifiedHeadlines);
// Find neighboring headlines from outlines
//TextPageBlock previousOutline = findNeighboringOutline(allHeadlines.indexOf(newHeadline), -1);
//TextPageBlock nextOutline = findNeighboringOutline(allHeadlines.indexOf(newHeadline), headlinesFromOutlines, 1);
// If we have neighboring outlines, perform comparison
//if (previousOutline != null && nextOutline != null) {
// // Compare headline orders
// int orderComparison = compareHeadlineOrder(previousOutline, nextOutline);
// if (orderComparison != 0) {
// // Set classification based on comparison
// setClassification(newHeadline, orderComparison, previousOutline, nextOutline);
// }
//}
}
}
}
/**
 * Collects the given headline together with the directly neighbouring list entries that
 * are equal to the current edge of the collected run.
 *
 * @param headline                 the headline to start from
 * @param newlyClassifiedHeadlines the list that is searched
 * @return the collected run in list order; empty if {@code headline} is not in the list
 */
private List<TextPageBlock> findAdjacentNewlyClassified(TextPageBlock headline, List<TextPageBlock> newlyClassifiedHeadlines) {
    List<TextPageBlock> run = new ArrayList<>();
    int position = newlyClassifiedHeadlines.indexOf(headline);
    if (position == -1) {
        return run;
    }
    run.add(headline);

    // extend towards the start of the list while entries equal the first element of the run
    int before = position - 1;
    while (before >= 0 && newlyClassifiedHeadlines.get(before).equals(run.get(0))) {
        run.add(0, newlyClassifiedHeadlines.get(before));
        before--;
    }

    // extend towards the end of the list while entries equal the last element of the run
    int after = position + 1;
    while (after < newlyClassifiedHeadlines.size() && newlyClassifiedHeadlines.get(after).equals(run.get(run.size() - 1))) {
        run.add(newlyClassifiedHeadlines.get(after));
        after++;
    }
    return run;
}
/**
 * Returns the entry that lies {@code direction} positions away from the given headline
 * in the outline headline list.
 *
 * @param headline              the headline whose neighbour is wanted
 * @param headlinesFromOutlines the list that is searched
 * @param direction             the offset to apply to the headline's index
 * @return the neighbouring entry, or {@code null} if the headline is not in the list or
 *         the offset position is out of bounds
 */
private TextPageBlock findNeighboringOutline(TextPageBlock headline, List<TextPageBlock> headlinesFromOutlines, int direction) {
    int position = headlinesFromOutlines.indexOf(headline);
    if (position == -1) {
        return null;
    }
    int neighbour = position + direction;
    if (neighbour < 0 || neighbour >= headlinesFromOutlines.size()) {
        return null;
    }
    return headlinesFromOutlines.get(neighbour);
}
/**
 * Placeholder for comparing the order of two headlines; no comparison is implemented yet.
 *
 * @param headline1 the first headline (currently not read)
 * @param headline2 the second headline (currently not read)
 * @return always 0
 */
private int compareHeadlineOrder(TextPageBlock headline1, TextPageBlock headline2) {
// Compare headline orders
// Implement your comparison logic here
return 0; // Placeholder return, implement actual comparison logic
}
/**
 * Placeholder for reclassifying a headline based on its neighbouring outline headlines;
 * the body is empty, so calling it has no effect.
 *
 * @param headline        the headline to reclassify (currently not read)
 * @param orderComparison the result of the order comparison (currently not read)
 * @param previousOutline the preceding outline headline (currently not read)
 * @param nextOutline     the following outline headline (currently not read)
 */
private void setClassification(TextPageBlock headline, int orderComparison, TextPageBlock previousOutline, TextPageBlock nextOutline) {
// Set classification based on comparison with neighboring outlines
// Implement your classification logic here
}
/**
 * Pairs a text block with a flag telling whether it was newly classified.
 * Equality and hash code consider the text block only; the flag is ignored.
 */
record Headline(TextPageBlock textPageBlock, boolean newlyClassified) {

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        // records are final, so an instanceof test is equivalent to comparing classes
        return obj instanceof Headline other && Objects.equals(textPageBlock, other.textPageBlock);
    }

    @Override
    public int hashCode() {
        return Objects.hash(textPageBlock);
    }
}
}

View File

@ -28,14 +28,9 @@ public class TOCEnrichmentService {
TableOfContents toc = document.getTableOfContents();
List<AbstractPageBlock> startBlocks = new ArrayList<>();
List<ClassifiedImage> startImages = new ArrayList<>();
//Map<TableOfContentItem, List<AbstractPageBlock>> sectionsMap = new HashMap<>();
TableOfContentItem currentSection = null;
boolean foundFirstHeadline = false;
//for (TableOfContentItem item : toc.getAllTableOfContentItems()) {
// sectionsMap.put(item, new ArrayList<>());
//}
List<ClassificationHeader> headers = new ArrayList<>();
List<ClassificationFooter> footers = new ArrayList<>();
TablePageBlock previousTable = null;
@ -90,7 +85,6 @@ public class TOCEnrichmentService {
startBlocks.add(current);
} else {
currentSection.getSectionBlocks().add(current);
//sectionsMap.get(currentSection).add(current);
}
}
}
@ -179,7 +173,6 @@ public class TOCEnrichmentService {
unassigned.setImages(startImages);
document.getTableOfContents().getMainSections().add(0, unassigned);
}
//document.setSectionsMap(sectionsMap);
document.setHeaders(headers);
document.setFooters(footers);
}

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;

View File

@ -43,7 +43,9 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
}
}
public List<TableOfContentItem> getAllTableOfContentItems() {
List<TableOfContentItem> allItems = new ArrayList<>();
for (TableOfContentItem item : mainSections) {
collectTableOfContentItems(item, allItems);
@ -51,7 +53,9 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
return allItems;
}
private void collectTableOfContentItems(TableOfContentItem item, List<TableOfContentItem> allItems) {
allItems.add(item);
for (TableOfContentItem child : item.getChildren()) {
collectTableOfContentItems(child, allItems);
@ -59,39 +63,74 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
}
/**
 * Checks whether a text block is the headline of a main section or is reported as
 * contained by one of the main sections.
 *
 * @param block the block to look for
 * @return {@code true} as soon as one main section matches, otherwise {@code false}
 */
private boolean containsBlock(TextPageBlock block) {
    return this.getMainSections()
            .stream()
            .anyMatch(section -> section.getHeadline().equals(block) || section.contains(block));
}
/**
 * Checks whether an item equals one of the main sections or is reported as contained
 * by one of them.
 *
 * @param tocItem the item to look for
 * @return {@code true} as soon as one main section matches, otherwise {@code false}
 */
private boolean containsItem(TableOfContentItem tocItem) {
    return this.getMainSections()
            .stream()
            .anyMatch(section -> section.equals(tocItem) || section.contains(tocItem));
}
/**
 * Creates a new {@code TableOfContentItemIterator} that starts at the main sections.
 *
 * @return a fresh iterator; each call returns an independent instance
 */
@Override
public @NonNull Iterator<TableOfContentItem> iterator() {
return new TableOfContentItemIterator(mainSections);
}
/**
 * Depth-first, pre-order iterator over table-of-contents items: an item is returned
 * before its children, and its children before its next sibling.
 * A stack holds one iterator per level that is currently being walked.
 */
private static class TableOfContentItemIterator implements Iterator<TableOfContentItem> {

    private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();


    /**
     * @param mainSections the top-level items the traversal starts with
     */
    public TableOfContentItemIterator(List<TableOfContentItem> mainSections) {

        stack.push(mainSections.iterator());
    }


    @Override
    public boolean hasNext() {

        ensureStackTopIsCurrent();
        return !stack.isEmpty() && stack.peek().hasNext();
    }


    /**
     * @return the next item in pre-order
     * @throws java.util.NoSuchElementException if the traversal is exhausted
     */
    @Override
    public TableOfContentItem next() {

        ensureStackTopIsCurrent();
        if (stack.isEmpty()) {
            // honour the Iterator contract instead of leaking EmptyStackException
            throw new java.util.NoSuchElementException();
        }
        TableOfContentItem currentItem = stack.peek().next();
        if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) {
            // descend exactly once; pushing the children twice would visit every subtree twice
            stack.push(currentItem.getChildren()
                               .iterator());
        }
        return currentItem;
    }


    /** Drops exhausted level iterators so that the top of the stack, if any, has a next element. */
    private void ensureStackTopIsCurrent() {

        while (!stack.isEmpty() && !stack.peek().hasNext()) {
            stack.pop();
        }
    }

}
}

View File

@ -27,6 +27,7 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@Deprecated
public class SectionsBuilderService {

View File

@ -4,16 +4,12 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.bloc
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.function.Function;
import org.springframework.stereotype.Service;
import org.tinspin.index.Index;
import org.tinspin.index.kdtree.KDIterator;
import org.tinspin.index.kdtree.KDTree;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
@ -262,13 +258,6 @@ public class BlockificationPostprocessingService {
}
/**
 * Appends the values of a 4-nearest-neighbour query to the merge candidates. The query
 * point is built from the minimum x and maximum y of the candidate's bounding box.
 *
 * @param kdTree             the tree that is queried
 * @param mergeCandidate     the block whose neighbours are wanted
 * @param allMergeCandidates the list the neighbours are appended to
 */
private static void addNeighborsOfCandidate(KDTree<TextPageBlock> kdTree, TextPageBlock mergeCandidate, List<TextPageBlock> allMergeCandidates) {
    var bounds = blockToBoundingBox.apply(mergeCandidate);
    double[] queryPoint = {bounds.getMinX(), bounds.getMaxY()};
    Index.PointIteratorKnn<TextPageBlock> nearest = kdTree.queryKnn(queryPoint, 4);
    while (nearest.hasNext()) {
        allMergeCandidates.add(nearest.next().value());
    }
}
// currently only three cases are handled here:
// 1. equality
@ -335,58 +324,4 @@ public class BlockificationPostprocessingService {
}
@Deprecated
public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage) {
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) {
return;
}
KDTree<TextPageBlock> kdTree = createKdTree(classificationPage);
float pageHeight = classificationPage.getPageHeight();
for (OutlineObject outlineObject : outlineObjects) {
// kd tree contains yx coordinates
KDIterator<TextPageBlock> successorIterator = kdTree.query(new double[]{ //
pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD, 0, //
//
}, //
new double[]{Double.MAX_VALUE, Double.MAX_VALUE});
OutlineProcessionContext context = new OutlineProcessionContext(outlineObject);
boolean earlyStop = false;
while (successorIterator.hasNext() && !earlyStop) {
TextPageBlock pageBlock = successorIterator.next().value();
earlyStop = processOutlineForTextBlock(pageBlock, context);
processOutlineForTextBlock(pageBlock, context);
}
selectMatch(classificationPage, context);
}
}
@Deprecated
private static KDTree<TextPageBlock> createKdTree(ClassificationPage classificationPage) {
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
.stream()
.filter(block -> block instanceof TextPageBlock)
.toList()
.stream()
.map(block -> (TextPageBlock) block)
.toList();
KDTree<TextPageBlock> kdTree = KDTree.create(2);
// insert y first then x, use pdf max y so that the page height is subtracted so that the order is inverted
textBlocks.forEach(block -> {
//var boundingBox = blockToBoundingBox.apply(block);
kdTree.insert(new double[]{block.getMinY(), block.getMinX()}, block);
});
return kdTree;
}
}

View File

@ -1,5 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
import java.util.List;
import java.util.regex.Pattern;
@ -169,17 +171,6 @@ public class RedactManagerClassificationService {
}
/**
 * Translates a headline block type into its numeric level.
 *
 * @param pageBlockType the block type to translate
 * @return 1 to 5 for {@code H1} to {@code H5}; 6 for every other constant
 */
private static int getHeadlineNumber(PageBlockType pageBlockType) {
    switch (pageBlockType) {
        case H1:
            return 1;
        case H2:
            return 2;
        case H3:
            return 3;
        case H4:
            return 4;
        case H5:
            return 5;
        default:
            // H6 and all non-headline types share the deepest level
            return 6;
    }
}
@Data

View File

@ -76,9 +76,6 @@ public class DocumentGraphFactory {
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
//classificationDocument.getSections()
// .forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
Optional<Section> section = SectionNodeFactory.addSection(layoutParsingType, parent, tocItem.getNonEmptySectionBlocks(), tocItem.getImages(), context, document);

View File

@ -1,10 +1,12 @@
package com.knecon.fforesight.service.layoutparser.server.graph;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
@ -25,14 +27,66 @@ import lombok.SneakyThrows;
public class ViewerDocumentTest extends BuildDocumentTest {
/**
 * Walks the {@code files/syngenta_190_deduplicated/} resource directory and hands every
 * regular file whose path ends in {@code .pdf} to {@code processFile}.
 */
@Test
@SneakyThrows
public void testViewerDocuments() {
    String directory = "files/syngenta_190_deduplicated/";
    Path dirPath = new ClassPathResource(directory).getFile().toPath();

    // the resource must exist and must be a directory
    if (!(Files.exists(dirPath) && Files.isDirectory(dirPath))) {
        throw new IllegalArgumentException("The specified path must be a directory and it must exist.");
    }

    ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
    LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);

    // Files.walk holds open directory handles, so the stream is closed via try-with-resources
    try (Stream<Path> candidates = Files.walk(dirPath)) {
        candidates.filter(candidate -> Files.isRegularFile(candidate))
                .filter(candidate -> candidate.toString().endsWith(".pdf"))
                .forEach(pdf -> processFile(pdf, layoutGridService));
    }
}
/**
 * Parses one PDF, builds its document graph and, if the outline tree has more than one
 * root node, writes a viewer document with the layout grid to {@code /tmp}.
 * Failures are reported together with the offending file and do not abort the caller's
 * batch, so one broken PDF cannot stop the remaining files from being processed.
 *
 * @param filePath          the PDF to process
 * @param layoutGridService the service that renders the layout grid
 */
private void processFile(Path filePath, LayoutGridService layoutGridService) {
    try {
        File documentFile = filePath.toFile();
        String tmpFileName = "/tmp/" + filePath.getFileName().toString() + "_VIEWER.pdf";
        long start = System.currentTimeMillis();
        var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
                documentFile,
                new ImageServiceResponse(),
                new TableServiceResponse(),
                new VisualLayoutParsingResponse(),
                Map.of("file", filePath.getFileName().toFile().toString()));
        Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
        // only documents with more than one outline root are rendered
        if (classificationDocument.getOutlineObjectTree().getRootNodes().size() > 1) {
            layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
            System.out.printf("Processed %s in %.2fs%n", filePath, ((float) (System.currentTimeMillis() - start)) / 1000);
        }
    } catch (Exception exception) {
        // deliberately best effort, but name the file so the failure can be traced
        System.err.printf("Failed to process %s: %s%n", filePath, exception);
    }
}
@Test
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
//String fileName = "files/syngenta_190_deduplicated/1 Abamectin_prr.pdf";
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
//String fileName = "files/new/kaust-official-thesis-template.pdf";
//String fileName = "files/new/$100m Offers.pdf";
String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
//String fileName = "files/new/mistitled_outlines_example.pdf";
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
//String fileName = "files/new/UTT-Books-53.pdf";
@ -48,6 +102,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}
@Test
@SneakyThrows
public void testViewerDocumentWithImages() {
@ -90,11 +145,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
var documentFile = new ClassPathResource(fileName).getFile();
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
documentFile,
new ImageServiceResponse(),
tableResponse,
new VisualLayoutParsingResponse(),
Map.of("file", path.getFileName().toFile().toString()));
documentFile,
new ImageServiceResponse(),
tableResponse,
new VisualLayoutParsingResponse(),
Map.of("file", path.getFileName().toFile().toString()));
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);