update to redaction-service state

This commit is contained in:
Kilian Schuettler 2023-07-25 16:10:57 +02:00
parent 143ebee25e
commit 65ab5eca22
19 changed files with 135 additions and 73 deletions

View File

@ -13,8 +13,8 @@ import lombok.experimental.FieldDefaults;
public class DocumentData {
DocumentPage[] documentPages;
DocumentText[] documentTexts;
DocumentPositions[] documentPositions;
DocumentTextData[] documentTextData;
DocumentPositionData[] documentPositions;
DocumentStructure documentStructure;

View File

@ -10,7 +10,7 @@ import lombok.experimental.FieldDefaults;
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class DocumentPositions {
public class DocumentPositionData {
Long id;
int[] stringIdxToPositionIdx;

View File

@ -12,7 +12,7 @@ import lombok.experimental.FieldDefaults;
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class DocumentText {
public class DocumentTextData {
Long id;
Long page;

View File

@ -15,8 +15,8 @@ import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
@ -72,7 +72,7 @@ public class LayoutParsingStorageService {
/**
 * Hands each part of the given {@link DocumentData} (structure, text data, positions, pages)
 * to {@code storageService.storeJSONObject}, using the current tenant id and the matching
 * storage id taken from the request.
 *
 * @param layoutParsingRequest supplies the storage ids for the structure, text block, position block and page files
 * @param documentData         the document data whose parts are stored
 */
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentStructure());
// NOTE(review): this listing is a diff rendering without +/- markers; the getDocumentTexts() line
// looks like the removed predecessor of the getDocumentTextData() line that follows it - confirm against the commit.
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getDocumentTexts());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getDocumentTextData());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getDocumentPositions());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getDocumentPages());
}
@ -93,18 +93,18 @@ public class LayoutParsingStorageService {
/**
 * Reads the page, text, position and structure parts via {@code storageService.readJSONObject}
 * (current tenant id, storage ids from the request) and assembles them into a {@link DocumentData}
 * through its builder.
 *
 * @param layoutParsingRequest supplies the storage ids of the four files to read
 * @return a DocumentData built from the four parts that were read
 * @throws IOException declared by this method; presumably raised by the storage reads - TODO confirm
 */
// NOTE(review): this listing is a diff rendering without +/- markers; the lines mentioning
// DocumentText / DocumentPositions / documentTexts look like the removed predecessors of the
// DocumentTextData / DocumentPositionData / documentTextData lines next to them - confirm against the commit.
public DocumentData readDocumentData(LayoutParsingRequest layoutParsingRequest) throws IOException {
DocumentPage[] documentPageData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), DocumentPage[].class);
DocumentText[] documentTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
DocumentTextData[] documentTextDataBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
layoutParsingRequest.textBlockFileStorageId(),
DocumentText[].class);
DocumentPositions[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
DocumentTextData[].class);
DocumentPositionData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
layoutParsingRequest.positionBlockFileStorageId(),
DocumentPositions[].class);
DocumentPositionData[].class);
DocumentStructure tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), DocumentStructure.class);
return DocumentData.builder()
.documentStructure(tableOfContentsData)
.documentPositions(atomicPositionBlockData)
.documentTexts(documentTextBlockData)
.documentTextData(documentTextDataBlockData)
.documentPages(documentPageData)
.build();
}

View File

@ -11,8 +11,8 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
@ -109,20 +109,20 @@ public class AtomicTextBlock implements TextBlock {
}
/**
 * Builds an {@link AtomicTextBlock} from its serialized text part and position part.
 * Id, number on page, search text, boundary (start/end) and line breaks come from the text data;
 * the string-index-to-position-index mapping and the positions come from the position data.
 * The primitive arrays are boxed into lists, and the positions are converted by
 * {@code toRectangle2DList}.
 *
 * @param parent the semantic node set as the block's parent
 * @param page   the page set on the block
 * @return the assembled AtomicTextBlock
 */
// NOTE(review): this listing is a diff rendering without +/- markers; the lines using
// documentText / documentPositions look like the removed predecessors of the lines using
// documentTextData / documentPositionData - confirm against the commit.
public static AtomicTextBlock fromAtomicTextBlockData(DocumentText documentText,
DocumentPositions documentPositions,
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData,
DocumentPositionData documentPositionData,
SemanticNode parent,
Page page) {
return AtomicTextBlock.builder()
.id(documentText.getId())
.numberOnPage(documentText.getNumberOnPage())
.id(documentTextData.getId())
.numberOnPage(documentTextData.getNumberOnPage())
.page(page)
.boundary(new Boundary(documentText.getStart(), documentText.getEnd()))
.searchText(documentText.getSearchText())
.lineBreaks(Arrays.stream(documentText.getLineBreaks()).boxed().toList())
.stringIdxToPositionIdx(Arrays.stream(documentPositions.getStringIdxToPositionIdx()).boxed().toList())
.positions(toRectangle2DList(documentPositions.getPositions()))
.boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd()))
.searchText(documentTextData.getSearchText())
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList())
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList())
.positions(toRectangle2DList(documentPositionData.getPositions()))
.parent(parent)
.build();
}

View File

@ -4,12 +4,14 @@ import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image;
@ -26,23 +28,25 @@ public class DocumentDataMapper {
public DocumentData toDocumentData(Document document) {
List<DocumentText> documentTextBlockData = document.streamTerminalTextBlocksInOrder()
List<DocumentTextData> documentTextData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
.distinct()
.map(DocumentDataMapper::toAtomicTextBlockData)
.toList();
List<DocumentPositions> atomicPositionBlockData = document.streamTerminalTextBlocksInOrder()
List<DocumentPositionData> atomicPositionBlockData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
.distinct()
.map(DocumentDataMapper::toAtomicPositionBlockData)
.toList();
Set<Long> nonEmptyTextBlocks = documentTextData.stream().mapToLong(DocumentTextData::getId).boxed().collect(Collectors.toSet());
List<DocumentPage> documentPageData = document.getPages().stream().map(DocumentDataMapper::toPageData).toList();
DocumentStructure tableOfContentsData = toDocumentTreeData(document.getDocumentTree());
return DocumentData.builder()
.documentTexts(documentTextBlockData.toArray(new DocumentText[0]))
.documentPositions(atomicPositionBlockData.toArray(new DocumentPositions[0]))
.documentTextData(documentTextData.toArray(new DocumentTextData[0]))
.documentPositions(atomicPositionBlockData.toArray(new DocumentPositionData[0]))
.documentPages(documentPageData.toArray(new DocumentPage[0]))
.documentStructure(tableOfContentsData)
.build();
@ -95,9 +99,9 @@ public class DocumentDataMapper {
}
private DocumentText toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) {
private DocumentTextData toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) {
return DocumentText.builder()
return DocumentTextData.builder()
.id(atomicTextBlock.getId())
.page(atomicTextBlock.getPage().getNumber().longValue())
.searchText(atomicTextBlock.getSearchText())
@ -109,9 +113,9 @@ public class DocumentDataMapper {
}
private DocumentPositions toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) {
private DocumentPositionData toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) {
return DocumentPositions.builder()
return DocumentPositionData.builder()
.id(atomicTextBlock.getId())
.positions(toPrimitiveFloatMatrix(atomicTextBlock.getPositions()))
.stringIdxToPositionIdx(toPrimitiveIntArray(atomicTextBlock.getStringIdxToPositionIdx()))

View File

@ -7,8 +7,8 @@ import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
@ -154,10 +154,10 @@ public class DocumentGraphMapper {
/**
 * Rebuilds the AtomicTextBlock with the given id from the context's text and position lists.
 * The id is converted with {@link Math#toIntExact(long)} and used directly as the list index
 * into both lists; the page is resolved via {@code getPage} from the text entry's page number.
 */
// NOTE(review): assumes every id equals the entry's position in both lists - verify against the writer side.
// NOTE(review): this listing is a diff rendering without +/- markers; the lines using
// context.documentTextBlockData look like the removed predecessors of the documentTextDataBlockData lines.
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextBlockData.get(Math.toIntExact(atomicTextBlockId)),
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)),
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
parent,
getPage(context.documentTextBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
getPage(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
}
@ -180,15 +180,15 @@ public class DocumentGraphMapper {
private final DocumentTree documentTree;
private final List<Page> pages;
private final List<DocumentText> documentTextBlockData;
private final List<DocumentPositions> atomicPositionBlockData;
private final List<DocumentTextData> documentTextDataBlockData;
private final List<DocumentPositionData> atomicPositionBlockData;
/**
 * Creates the mapping context: keeps the given document tree, starts with an empty page list,
 * and copies the text data and position data arrays of the DocumentData into lists.
 */
Context(DocumentData documentData, DocumentTree documentTree) {
this.documentTree = documentTree;
this.pages = new LinkedList<>();
// NOTE(review): this listing is a diff rendering without +/- markers; the documentTextBlockData
// assignment looks like the removed predecessor of the documentTextDataBlockData assignment below it.
this.documentTextBlockData = Arrays.stream(documentData.getDocumentTexts()).toList();
this.documentTextDataBlockData = Arrays.stream(documentData.getDocumentTextData()).toList();
this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions()).toList();
}

View File

@ -149,7 +149,7 @@ public class RectangleTransformations {
/**
 * Returns the function that folds one Rectangle2D into the running BBox.
 * Presumably the accumulator of a java.util.stream.Collector implementation - the enclosing
 * class header is not part of this hunk.
 */
// NOTE(review): this listing is a diff rendering without +/- markers; the lambda return looks like
// the removed predecessor of the BBox::addRectangle method reference - confirm against the commit.
@Override
public BiConsumer<BBox, Rectangle2D> accumulator() {
return (bb, rect) -> bb.addRectangle(rect.getMinX(), rect.getMinY(), rect.getMaxX(), rect.getMaxY());
return BBox::addRectangle;
}
@ -166,7 +166,7 @@ public class RectangleTransformations {
/**
 * Returns the function that converts the accumulated BBox into the resulting Rectangle2D.
 */
// NOTE(review): this listing is a diff rendering without +/- markers; the lambda return looks like
// the removed predecessor of the BBox::toRectangle2D method reference - confirm against the commit.
@Override
public Function<BBox, Rectangle2D> finisher() {
return bb -> new Rectangle2D.Double(bb.lowerLeftX, bb.lowerLeftY, bb.upperRightX - bb.lowerLeftX, bb.upperRightY - bb.lowerLeftY);
return BBox::toRectangle2D;
}
@ -187,7 +187,21 @@ public class RectangleTransformations {
Double upperRightY;
public void addRectangle(double lowerLeftX, double lowerLeftY, double upperRightX, double upperRightY) {
/**
 * Converts this bounding box into a Rectangle2D anchored at the lower-left corner, with width and
 * height computed as the differences to the upper-right corner.
 *
 * @return the bounding rectangle, or a zero-sized rectangle at the origin when any of the four
 *         coordinates is still unset (null)
 */
public Rectangle2D toRectangle2D() {
// Guard against unboxing a null coordinate when one of the corner values was never set.
if (lowerLeftX == null || lowerLeftY == null || upperRightX == null || upperRightY == null) {
return new Rectangle2D.Double(0, 0, 0, 0);
}
return new Rectangle2D.Double(lowerLeftX, lowerLeftY, upperRightX - lowerLeftX, upperRightY - lowerLeftY);
}
public void addRectangle(Rectangle2D rectangle2D) {
double lowerLeftX = Math.min(rectangle2D.getMinX(), rectangle2D.getMaxX());
double lowerLeftY = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY());
double upperRightX = Math.max(rectangle2D.getMinX(), rectangle2D.getMaxX());
double upperRightY = Math.max(rectangle2D.getMinY(), rectangle2D.getMaxY());
if (this.lowerLeftX == null) {
this.lowerLeftX = lowerLeftX;

View File

@ -1,20 +1,27 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.pdfbox.util.QuickSort;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
/**
 * Static helper for merging and ordering the text position sequences of text page blocks.
 */
public class TextPositionOperations {
// Single shared comparator instance used for every sort call.
private static final TextPositionSequenceComparator comparator = new TextPositionSequenceComparator();
/**
 * Flattens the sequences of all given text blocks into one list, sorts that list in place with
 * PDFBox's QuickSort using the shared TextPositionSequenceComparator, and returns it.
 *
 * @param textBlocks the blocks whose sequences are merged
 * @return the merged, sorted list of sequences
 */
// NOTE(review): the method name says "ByYThenX", but the order is now defined by
// TextPositionSequenceComparator, which is not visible here - verify that the name still fits.
// NOTE(review): this listing is a diff rendering without +/- markers; the stream pipeline ending in
// .toList() looks like the removed implementation, replaced by the collect + QuickSort version below it.
public static List<TextPositionSequence> mergeAndSortTextPositionSequenceByYThenX(List<TextPageBlock> textBlocks) {
return textBlocks.stream()//
.flatMap(tb -> tb.getSequences().stream())//
.sorted(Comparator.comparingDouble(TextPositionSequence::getMaxYDirAdj)//
.thenComparing(TextPositionSequence::getMaxXDirAdj))//
.toList();
var sequence = textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
// because the TextPositionSequenceComparator is not transitive, but
// JDK7+ enforces transitivity on comparators, we need to use
// a custom quicksort implementation (which is slower, unfortunately).
QuickSort.sort(sequence, comparator);
return sequence;
}
}

View File

@ -46,7 +46,7 @@ public class BdrJsonBuildTest extends BaseTest {
try (InputStream inputStream = new FileInputStream(filename)) {
PDDocument pdDocument = Loader.loadPDF(inputStream);
return layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.REDACT_MANAGER, pdDocument, new ImageServiceResponse(), new TableServiceResponse());
return layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.TAAS, pdDocument, new ImageServiceResponse(), new TableServiceResponse());
}
}

View File

@ -22,7 +22,7 @@ import lombok.SneakyThrows;
public class BuildDocumentGraphTest extends BaseTest {
@Autowired
private LayoutParsingPipeline layoutParsingPipeline;
protected LayoutParsingPipeline layoutParsingPipeline;
@Test
@Disabled
@ -55,4 +55,5 @@ public class BuildDocumentGraphTest extends BaseTest {
}
}
}

View File

@ -1,14 +1,22 @@
package com.knecon.fforesight.service.layoutparser.server.graph;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.pdfbox.Loader;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper;
@ -17,22 +25,50 @@ import lombok.SneakyThrows;
/**
 * Manual (disabled) tests that parse PDFs with the layout parsing pipeline and write the four parts
 * of the resulting DocumentData (structure, text, positions, pages) as JSON files.
 */
// NOTE(review): this listing is a diff rendering without +/- markers; removed and added lines are
// interleaved (e.g. writeJsons(String) vs. writeJsons(Path), File.createTempFile(...) vs. new File(tmpFilePath ...)).
public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest {
// Walks /tmp/test_files, prints every regular file whose name ends with ".pdf" and writes its JSON files.
// NOTE(review): the stream returned by Files.walk is never closed here; the JDK documentation asks for
// try-with-resources around it - consider wrapping the pipeline.
@Test
@SneakyThrows
@Disabled
public void writeJsonForAllFilesTest() {
Path path = Path.of("/tmp/test_files");
Files.walk(path)
.map(Path::toFile)
.filter(File::isFile)
.filter(file -> file.getName().endsWith(".pdf"))
.peek(System.out::println)
.map(File::toPath)
.forEach(this::writeJsons);
}
// Writes the JSON files for a single PDF resolved from the classpath.
@Test
@SneakyThrows
@Disabled
public void writeJsonForFileTest() {
writeJsons("files/216");
var resource = new ClassPathResource("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
writeJsons(resource.getFile().toPath());
}
// Parses the given PDF with LayoutParsingType.REDACT_MANAGER and empty image/table service responses,
// maps the document graph to DocumentData and writes each part to
// /tmp/DocumentGraphJsonWritingTest/<file name>_{structure,text,positions,pages}.json.
// NOTE(review): the FileInputStream, the document returned by Loader.loadPDF and the four
// FileOutputStreams are never closed in the visible code - consider try-with-resources.
// NOTE(review): the boolean result of mkdirs() is ignored, so a failed directory creation only
// surfaces later when a FileOutputStream is opened.
@SneakyThrows
private void writeJsons(String filename) {
Document documentGraph = buildGraph(filename);
@SneakyThrows
private void writeJsons(Path filename) {
Document documentGraph = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Loader.loadPDF(new FileInputStream(filename.toFile())),
new ImageServiceResponse(),
new TableServiceResponse());
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
ObjectMapper mapper = ObjectMapperFactory.create();
mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_structure", ".json")), documentData.getDocumentStructure());
mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_text", ".json")), documentData.getDocumentTexts());
mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_positions", ".json")), documentData.getDocumentPositions());
mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_pages", ".json")), documentData.getDocumentPages());
var stem = Path.of("/tmp/DocumentGraphJsonWritingTest");
stem.toFile().mkdirs();
var tmpFilePath = stem.resolve(filename.getFileName());
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_structure" + ".json")), documentData.getDocumentStructure());
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_text" + ".json")), documentData.getDocumentTextData());
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_positions" + ".json")), documentData.getDocumentPositions());
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_pages" + ".json")), documentData.getDocumentPages());
}
}

View File

@ -7,9 +7,9 @@ import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
@ -32,21 +32,21 @@ public class DocumentGraphMappingTest extends BuildDocumentGraphTest {
DocumentData documentData = DocumentDataMapper.toDocumentData(document);
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_PAGES" + ".json", documentData.getDocumentPages());
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", documentData.getDocumentTexts());
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", documentData.getDocumentTextData());
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_POSITIONS" + ".json", documentData.getDocumentPositions());
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".json", documentData.getDocumentStructure());
DocumentPage[] pageData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_PAGES" + ".json", DocumentPage[].class);
DocumentText[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", DocumentText[].class);
DocumentPositions[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
DocumentTextData[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", DocumentTextData[].class);
DocumentPositionData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
filename + "_POSITIONS" + ".json",
DocumentPositions[].class);
DocumentPositionData[].class);
DocumentStructure documentTreeData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".json", DocumentStructure.class);
DocumentData documentData2 = DocumentData.builder()
.documentPages(pageData)
.documentStructure(documentTreeData)
.documentTexts(atomicTextBlockData)
.documentTextData(atomicTextBlockData)
.documentPositions(atomicPositionBlockData)
.build();
Document newDocument = DocumentGraphMapper.toDocumentGraph(documentData2);

View File

@ -26,7 +26,7 @@ class GapAcrossLinesDetectionServiceTest {
@SneakyThrows
public void testGapBasedColumnDetection() {
String filename = "files/211.pdf";
String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis();
@ -52,7 +52,7 @@ class GapAcrossLinesDetectionServiceTest {
@SneakyThrows
public void testColumnDetection() {
String filename = "files/211.pdf";
String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis();

View File

@ -27,7 +27,7 @@ class InvisibleTableDetectionServiceTest {
@SneakyThrows
public void detectInvisibleTableTest() {
String fileName = "files/211.pdf";
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
List<PageInformation> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());

View File

@ -18,7 +18,7 @@ class MainBodyTextFrameExtractionServiceTest {
@SneakyThrows
public void testMainBodyDetection() {
String fileName = "files/211.pdf";
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString();
List<PageContents> sortedTextPositionSequence = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);

View File

@ -20,7 +20,7 @@ class PageInformationServiceTest {
@SneakyThrows
public void testGapDetection() {
String filename = "files/211.pdf";
String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis();
@ -43,7 +43,7 @@ class PageInformationServiceTest {
@SneakyThrows
public void testLineDetection() {
String filename = "files/211.pdf";
String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis();

View File

@ -21,7 +21,7 @@ class TextPositionSequenceSorterTest {
@SneakyThrows
public void testTextPositionSequenceExtraction() {
String fileName = "files/211.pdf";
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
List<PageContents> textPositionPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);