update to redaction-service state

This commit is contained in:
Kilian Schuettler 2023-07-25 16:10:57 +02:00
parent 143ebee25e
commit 65ab5eca22
19 changed files with 135 additions and 73 deletions

View File

@ -13,8 +13,8 @@ import lombok.experimental.FieldDefaults;
public class DocumentData { public class DocumentData {
DocumentPage[] documentPages; DocumentPage[] documentPages;
DocumentText[] documentTexts; DocumentTextData[] documentTextData;
DocumentPositions[] documentPositions; DocumentPositionData[] documentPositions;
DocumentStructure documentStructure; DocumentStructure documentStructure;

View File

@ -10,7 +10,7 @@ import lombok.experimental.FieldDefaults;
@Builder @Builder
@AllArgsConstructor @AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class DocumentPositions { public class DocumentPositionData {
Long id; Long id;
int[] stringIdxToPositionIdx; int[] stringIdxToPositionIdx;

View File

@ -12,7 +12,7 @@ import lombok.experimental.FieldDefaults;
@Builder @Builder
@AllArgsConstructor @AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class DocumentText { public class DocumentTextData {
Long id; Long id;
Long page; Long page;

View File

@ -15,8 +15,8 @@ import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
import com.iqser.red.storage.commons.service.StorageService; import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
@ -72,7 +72,7 @@ public class LayoutParsingStorageService {
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) { public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentStructure()); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentStructure());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getDocumentTexts()); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getDocumentTextData());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getDocumentPositions()); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getDocumentPositions());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getDocumentPages()); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getDocumentPages());
} }
@ -93,18 +93,18 @@ public class LayoutParsingStorageService {
public DocumentData readDocumentData(LayoutParsingRequest layoutParsingRequest) throws IOException { public DocumentData readDocumentData(LayoutParsingRequest layoutParsingRequest) throws IOException {
DocumentPage[] documentPageData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), DocumentPage[].class); DocumentPage[] documentPageData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), DocumentPage[].class);
DocumentText[] documentTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(), DocumentTextData[] documentTextDataBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
layoutParsingRequest.textBlockFileStorageId(), layoutParsingRequest.textBlockFileStorageId(),
DocumentText[].class); DocumentTextData[].class);
DocumentPositions[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(), DocumentPositionData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
layoutParsingRequest.positionBlockFileStorageId(), layoutParsingRequest.positionBlockFileStorageId(),
DocumentPositions[].class); DocumentPositionData[].class);
DocumentStructure tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), DocumentStructure.class); DocumentStructure tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), DocumentStructure.class);
return DocumentData.builder() return DocumentData.builder()
.documentStructure(tableOfContentsData) .documentStructure(tableOfContentsData)
.documentPositions(atomicPositionBlockData) .documentPositions(atomicPositionBlockData)
.documentTexts(documentTextBlockData) .documentTextData(documentTextDataBlockData)
.documentPages(documentPageData) .documentPages(documentPageData)
.build(); .build();
} }

View File

@ -11,8 +11,8 @@ import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
@ -109,20 +109,20 @@ public class AtomicTextBlock implements TextBlock {
} }
public static AtomicTextBlock fromAtomicTextBlockData(DocumentText documentText, public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData,
DocumentPositions documentPositions, DocumentPositionData documentPositionData,
SemanticNode parent, SemanticNode parent,
Page page) { Page page) {
return AtomicTextBlock.builder() return AtomicTextBlock.builder()
.id(documentText.getId()) .id(documentTextData.getId())
.numberOnPage(documentText.getNumberOnPage()) .numberOnPage(documentTextData.getNumberOnPage())
.page(page) .page(page)
.boundary(new Boundary(documentText.getStart(), documentText.getEnd())) .boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd()))
.searchText(documentText.getSearchText()) .searchText(documentTextData.getSearchText())
.lineBreaks(Arrays.stream(documentText.getLineBreaks()).boxed().toList()) .lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList())
.stringIdxToPositionIdx(Arrays.stream(documentPositions.getStringIdxToPositionIdx()).boxed().toList()) .stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList())
.positions(toRectangle2DList(documentPositions.getPositions())) .positions(toRectangle2DList(documentPositionData.getPositions()))
.parent(parent) .parent(parent)
.build(); .build();
} }

View File

@ -4,12 +4,14 @@ import java.awt.geom.Rectangle2D;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image;
@ -26,23 +28,25 @@ public class DocumentDataMapper {
public DocumentData toDocumentData(Document document) { public DocumentData toDocumentData(Document document) {
List<DocumentText> documentTextBlockData = document.streamTerminalTextBlocksInOrder() List<DocumentTextData> documentTextData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream()) .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
.distinct() .distinct()
.map(DocumentDataMapper::toAtomicTextBlockData) .map(DocumentDataMapper::toAtomicTextBlockData)
.toList(); .toList();
List<DocumentPositions> atomicPositionBlockData = document.streamTerminalTextBlocksInOrder() List<DocumentPositionData> atomicPositionBlockData = document.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream()) .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
.distinct() .distinct()
.map(DocumentDataMapper::toAtomicPositionBlockData) .map(DocumentDataMapper::toAtomicPositionBlockData)
.toList(); .toList();
Set<Long> nonEmptyTextBlocks = documentTextData.stream().mapToLong(DocumentTextData::getId).boxed().collect(Collectors.toSet());
List<DocumentPage> documentPageData = document.getPages().stream().map(DocumentDataMapper::toPageData).toList(); List<DocumentPage> documentPageData = document.getPages().stream().map(DocumentDataMapper::toPageData).toList();
DocumentStructure tableOfContentsData = toDocumentTreeData(document.getDocumentTree()); DocumentStructure tableOfContentsData = toDocumentTreeData(document.getDocumentTree());
return DocumentData.builder() return DocumentData.builder()
.documentTexts(documentTextBlockData.toArray(new DocumentText[0])) .documentTextData(documentTextData.toArray(new DocumentTextData[0]))
.documentPositions(atomicPositionBlockData.toArray(new DocumentPositions[0])) .documentPositions(atomicPositionBlockData.toArray(new DocumentPositionData[0]))
.documentPages(documentPageData.toArray(new DocumentPage[0])) .documentPages(documentPageData.toArray(new DocumentPage[0]))
.documentStructure(tableOfContentsData) .documentStructure(tableOfContentsData)
.build(); .build();
@ -95,9 +99,9 @@ public class DocumentDataMapper {
} }
private DocumentText toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) { private DocumentTextData toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) {
return DocumentText.builder() return DocumentTextData.builder()
.id(atomicTextBlock.getId()) .id(atomicTextBlock.getId())
.page(atomicTextBlock.getPage().getNumber().longValue()) .page(atomicTextBlock.getPage().getNumber().longValue())
.searchText(atomicTextBlock.getSearchText()) .searchText(atomicTextBlock.getSearchText())
@ -109,9 +113,9 @@ public class DocumentDataMapper {
} }
private DocumentPositions toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) { private DocumentPositionData toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) {
return DocumentPositions.builder() return DocumentPositionData.builder()
.id(atomicTextBlock.getId()) .id(atomicTextBlock.getId())
.positions(toPrimitiveFloatMatrix(atomicTextBlock.getPositions())) .positions(toPrimitiveFloatMatrix(atomicTextBlock.getPositions()))
.stringIdxToPositionIdx(toPrimitiveIntArray(atomicTextBlock.getStringIdxToPositionIdx())) .stringIdxToPositionIdx(toPrimitiveIntArray(atomicTextBlock.getStringIdxToPositionIdx()))

View File

@ -7,8 +7,8 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.NoSuchElementException; import java.util.NoSuchElementException;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
@ -154,10 +154,10 @@ public class DocumentGraphMapper {
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) { private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextBlockData.get(Math.toIntExact(atomicTextBlockId)), return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)),
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)), context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
parent, parent,
getPage(context.documentTextBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context)); getPage(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
} }
@ -180,15 +180,15 @@ public class DocumentGraphMapper {
private final DocumentTree documentTree; private final DocumentTree documentTree;
private final List<Page> pages; private final List<Page> pages;
private final List<DocumentText> documentTextBlockData; private final List<DocumentTextData> documentTextDataBlockData;
private final List<DocumentPositions> atomicPositionBlockData; private final List<DocumentPositionData> atomicPositionBlockData;
Context(DocumentData documentData, DocumentTree documentTree) { Context(DocumentData documentData, DocumentTree documentTree) {
this.documentTree = documentTree; this.documentTree = documentTree;
this.pages = new LinkedList<>(); this.pages = new LinkedList<>();
this.documentTextBlockData = Arrays.stream(documentData.getDocumentTexts()).toList(); this.documentTextDataBlockData = Arrays.stream(documentData.getDocumentTextData()).toList();
this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions()).toList(); this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions()).toList();
} }

View File

@ -149,7 +149,7 @@ public class RectangleTransformations {
@Override @Override
public BiConsumer<BBox, Rectangle2D> accumulator() { public BiConsumer<BBox, Rectangle2D> accumulator() {
return (bb, rect) -> bb.addRectangle(rect.getMinX(), rect.getMinY(), rect.getMaxX(), rect.getMaxY()); return BBox::addRectangle;
} }
@ -166,7 +166,7 @@ public class RectangleTransformations {
@Override @Override
public Function<BBox, Rectangle2D> finisher() { public Function<BBox, Rectangle2D> finisher() {
return bb -> new Rectangle2D.Double(bb.lowerLeftX, bb.lowerLeftY, bb.upperRightX - bb.lowerLeftX, bb.upperRightY - bb.lowerLeftY); return BBox::toRectangle2D;
} }
@ -187,7 +187,21 @@ public class RectangleTransformations {
Double upperRightY; Double upperRightY;
public void addRectangle(double lowerLeftX, double lowerLeftY, double upperRightX, double upperRightY) { public Rectangle2D toRectangle2D() {
if (lowerLeftX == null || lowerLeftY == null || upperRightX == null || upperRightY == null) {
return new Rectangle2D.Double(0, 0, 0, 0);
}
return new Rectangle2D.Double(lowerLeftX, lowerLeftY, upperRightX - lowerLeftX, upperRightY - lowerLeftY);
}
public void addRectangle(Rectangle2D rectangle2D) {
double lowerLeftX = Math.min(rectangle2D.getMinX(), rectangle2D.getMaxX());
double lowerLeftY = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY());
double upperRightX = Math.max(rectangle2D.getMinX(), rectangle2D.getMaxX());
double upperRightY = Math.max(rectangle2D.getMinY(), rectangle2D.getMaxY());
if (this.lowerLeftX == null) { if (this.lowerLeftX == null) {
this.lowerLeftX = lowerLeftX; this.lowerLeftX = lowerLeftX;

View File

@ -1,20 +1,27 @@
package com.knecon.fforesight.service.layoutparser.processor.utils; package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.stream.Collectors;
import org.apache.pdfbox.util.QuickSort;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
public class TextPositionOperations { public class TextPositionOperations {
private static final TextPositionSequenceComparator comparator = new TextPositionSequenceComparator();
public static List<TextPositionSequence> mergeAndSortTextPositionSequenceByYThenX(List<TextPageBlock> textBlocks) { public static List<TextPositionSequence> mergeAndSortTextPositionSequenceByYThenX(List<TextPageBlock> textBlocks) {
return textBlocks.stream()// var sequence = textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
.flatMap(tb -> tb.getSequences().stream())//
.sorted(Comparator.comparingDouble(TextPositionSequence::getMaxYDirAdj)// // because the TextPositionSequenceComparator is not transitive, but
.thenComparing(TextPositionSequence::getMaxXDirAdj))// // JDK7+ enforces transitivity on comparators, we need to use
.toList(); // a custom quicksort implementation (which is slower, unfortunately).
QuickSort.sort(sequence, comparator);
return sequence;
} }
} }

View File

@ -46,7 +46,7 @@ public class BdrJsonBuildTest extends BaseTest {
try (InputStream inputStream = new FileInputStream(filename)) { try (InputStream inputStream = new FileInputStream(filename)) {
PDDocument pdDocument = Loader.loadPDF(inputStream); PDDocument pdDocument = Loader.loadPDF(inputStream);
return layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.REDACT_MANAGER, pdDocument, new ImageServiceResponse(), new TableServiceResponse()); return layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.TAAS, pdDocument, new ImageServiceResponse(), new TableServiceResponse());
} }
} }

View File

@ -22,7 +22,7 @@ import lombok.SneakyThrows;
public class BuildDocumentGraphTest extends BaseTest { public class BuildDocumentGraphTest extends BaseTest {
@Autowired @Autowired
private LayoutParsingPipeline layoutParsingPipeline; protected LayoutParsingPipeline layoutParsingPipeline;
@Test @Test
@Disabled @Disabled
@ -55,4 +55,5 @@ public class BuildDocumentGraphTest extends BaseTest {
} }
} }
} }

View File

@ -1,14 +1,22 @@
package com.knecon.fforesight.service.layoutparser.server.graph; package com.knecon.fforesight.service.layoutparser.server.graph;
import java.io.File; import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.pdfbox.Loader;
import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.commons.jackson.ObjectMapperFactory; import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper;
@ -17,22 +25,50 @@ import lombok.SneakyThrows;
public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest { public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest {
@Test @Test
@SneakyThrows
@Disabled
public void writeJsonForAllFilesTest() {

    // Manual (disabled) helper: runs writeJsons for every regular file below
    // /tmp/test_files whose name ends with ".pdf", printing each file before it is processed.
    Path path = Path.of("/tmp/test_files");

    // Files.walk keeps directory handles open until the returned stream is closed,
    // so the stream is scoped with try-with-resources instead of being left dangling.
    try (var paths = Files.walk(path)) {
        paths.map(Path::toFile)
                .filter(File::isFile)
                .filter(file -> file.getName().endsWith(".pdf"))
                .peek(System.out::println)
                .map(File::toPath)
                .forEach(this::writeJsons);
    }
}
@Test
@SneakyThrows
@Disabled @Disabled
public void writeJsonForFileTest() { public void writeJsonForFileTest() {
writeJsons("files/216"); var resource = new ClassPathResource("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
writeJsons(resource.getFile().toPath());
} }
@SneakyThrows
private void writeJsons(String filename) {
Document documentGraph = buildGraph(filename); @SneakyThrows
private void writeJsons(Path filename) {
Document documentGraph = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Loader.loadPDF(new FileInputStream(filename.toFile())),
new ImageServiceResponse(),
new TableServiceResponse());
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph); DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
ObjectMapper mapper = ObjectMapperFactory.create(); ObjectMapper mapper = ObjectMapperFactory.create();
mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_structure", ".json")), documentData.getDocumentStructure());
mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_text", ".json")), documentData.getDocumentTexts()); var stem = Path.of("/tmp/DocumentGraphJsonWritingTest");
mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_positions", ".json")), documentData.getDocumentPositions()); stem.toFile().mkdirs();
mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_pages", ".json")), documentData.getDocumentPages()); var tmpFilePath = stem.resolve(filename.getFileName());
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_structure" + ".json")), documentData.getDocumentStructure());
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_text" + ".json")), documentData.getDocumentTextData());
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_positions" + ".json")), documentData.getDocumentPositions());
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_pages" + ".json")), documentData.getDocumentPages());
} }
} }

View File

@ -7,9 +7,9 @@ import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
@ -32,21 +32,21 @@ public class DocumentGraphMappingTest extends BuildDocumentGraphTest {
DocumentData documentData = DocumentDataMapper.toDocumentData(document); DocumentData documentData = DocumentDataMapper.toDocumentData(document);
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_PAGES" + ".json", documentData.getDocumentPages()); storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_PAGES" + ".json", documentData.getDocumentPages());
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", documentData.getDocumentTexts()); storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", documentData.getDocumentTextData());
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_POSITIONS" + ".json", documentData.getDocumentPositions()); storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_POSITIONS" + ".json", documentData.getDocumentPositions());
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".json", documentData.getDocumentStructure()); storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".json", documentData.getDocumentStructure());
DocumentPage[] pageData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_PAGES" + ".json", DocumentPage[].class); DocumentPage[] pageData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_PAGES" + ".json", DocumentPage[].class);
DocumentText[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", DocumentText[].class); DocumentTextData[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", DocumentTextData[].class);
DocumentPositions[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(), DocumentPositionData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
filename + "_POSITIONS" + ".json", filename + "_POSITIONS" + ".json",
DocumentPositions[].class); DocumentPositionData[].class);
DocumentStructure documentTreeData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".json", DocumentStructure.class); DocumentStructure documentTreeData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".json", DocumentStructure.class);
DocumentData documentData2 = DocumentData.builder() DocumentData documentData2 = DocumentData.builder()
.documentPages(pageData) .documentPages(pageData)
.documentStructure(documentTreeData) .documentStructure(documentTreeData)
.documentTexts(atomicTextBlockData) .documentTextData(atomicTextBlockData)
.documentPositions(atomicPositionBlockData) .documentPositions(atomicPositionBlockData)
.build(); .build();
Document newDocument = DocumentGraphMapper.toDocumentGraph(documentData2); Document newDocument = DocumentGraphMapper.toDocumentGraph(documentData2);

View File

@ -26,7 +26,7 @@ class GapAcrossLinesDetectionServiceTest {
@SneakyThrows @SneakyThrows
public void testGapBasedColumnDetection() { public void testGapBasedColumnDetection() {
String filename = "files/211.pdf"; String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf"; var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
System.out.println("start TextPosition extraction"); System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
@ -52,7 +52,7 @@ class GapAcrossLinesDetectionServiceTest {
@SneakyThrows @SneakyThrows
public void testColumnDetection() { public void testColumnDetection() {
String filename = "files/211.pdf"; String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf"; var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
System.out.println("start TextPosition extraction"); System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();

View File

@ -27,7 +27,7 @@ class InvisibleTableDetectionServiceTest {
@SneakyThrows @SneakyThrows
public void detectInvisibleTableTest() { public void detectInvisibleTableTest() {
String fileName = "files/211.pdf"; String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString(); var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
List<PageInformation> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName).stream().map(PageInformationService::build).collect(Collectors.toList()); List<PageInformation> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());

View File

@ -18,7 +18,7 @@ class MainBodyTextFrameExtractionServiceTest {
@SneakyThrows @SneakyThrows
public void testMainBodyDetection() { public void testMainBodyDetection() {
String fileName = "files/211.pdf"; String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString(); String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString();
List<PageContents> sortedTextPositionSequence = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName); List<PageContents> sortedTextPositionSequence = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);

View File

@ -20,7 +20,7 @@ class PageInformationServiceTest {
@SneakyThrows @SneakyThrows
public void testGapDetection() { public void testGapDetection() {
String filename = "files/211.pdf"; String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf"; var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
System.out.println("start TextPosition extraction"); System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
@ -43,7 +43,7 @@ class PageInformationServiceTest {
@SneakyThrows @SneakyThrows
public void testLineDetection() { public void testLineDetection() {
String filename = "files/211.pdf"; String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf"; var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
System.out.println("start TextPosition extraction"); System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();

View File

@ -21,7 +21,7 @@ class TextPositionSequenceSorterTest {
@SneakyThrows @SneakyThrows
public void testTextPositionSequenceExtraction() { public void testTextPositionSequenceExtraction() {
String fileName = "files/211.pdf"; String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString(); var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
List<PageContents> textPositionPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName); List<PageContents> textPositionPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);