update to redaction-service state
This commit is contained in:
parent
143ebee25e
commit
65ab5eca22
@ -13,8 +13,8 @@ import lombok.experimental.FieldDefaults;
|
||||
public class DocumentData {
|
||||
|
||||
DocumentPage[] documentPages;
|
||||
DocumentText[] documentTexts;
|
||||
DocumentPositions[] documentPositions;
|
||||
DocumentTextData[] documentTextData;
|
||||
DocumentPositionData[] documentPositions;
|
||||
DocumentStructure documentStructure;
|
||||
|
||||
|
||||
|
||||
@ -10,7 +10,7 @@ import lombok.experimental.FieldDefaults;
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class DocumentPositions {
|
||||
public class DocumentPositionData {
|
||||
|
||||
Long id;
|
||||
int[] stringIdxToPositionIdx;
|
||||
@ -12,7 +12,7 @@ import lombok.experimental.FieldDefaults;
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class DocumentText {
|
||||
public class DocumentTextData {
|
||||
|
||||
Long id;
|
||||
Long page;
|
||||
@ -15,8 +15,8 @@ import org.springframework.stereotype.Service;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
||||
@ -72,7 +72,7 @@ public class LayoutParsingStorageService {
|
||||
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentStructure());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getDocumentTexts());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getDocumentTextData());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getDocumentPositions());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getDocumentPages());
|
||||
}
|
||||
@ -93,18 +93,18 @@ public class LayoutParsingStorageService {
|
||||
public DocumentData readDocumentData(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
|
||||
DocumentPage[] documentPageData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), DocumentPage[].class);
|
||||
DocumentText[] documentTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
|
||||
DocumentTextData[] documentTextDataBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
DocumentText[].class);
|
||||
DocumentPositions[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
|
||||
DocumentTextData[].class);
|
||||
DocumentPositionData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
DocumentPositions[].class);
|
||||
DocumentPositionData[].class);
|
||||
DocumentStructure tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), DocumentStructure.class);
|
||||
|
||||
return DocumentData.builder()
|
||||
.documentStructure(tableOfContentsData)
|
||||
.documentPositions(atomicPositionBlockData)
|
||||
.documentTexts(documentTextBlockData)
|
||||
.documentTextData(documentTextDataBlockData)
|
||||
.documentPages(documentPageData)
|
||||
.build();
|
||||
}
|
||||
|
||||
@ -11,8 +11,8 @@ import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||
@ -109,20 +109,20 @@ public class AtomicTextBlock implements TextBlock {
|
||||
}
|
||||
|
||||
|
||||
public static AtomicTextBlock fromAtomicTextBlockData(DocumentText documentText,
|
||||
DocumentPositions documentPositions,
|
||||
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData,
|
||||
DocumentPositionData documentPositionData,
|
||||
SemanticNode parent,
|
||||
Page page) {
|
||||
|
||||
return AtomicTextBlock.builder()
|
||||
.id(documentText.getId())
|
||||
.numberOnPage(documentText.getNumberOnPage())
|
||||
.id(documentTextData.getId())
|
||||
.numberOnPage(documentTextData.getNumberOnPage())
|
||||
.page(page)
|
||||
.boundary(new Boundary(documentText.getStart(), documentText.getEnd()))
|
||||
.searchText(documentText.getSearchText())
|
||||
.lineBreaks(Arrays.stream(documentText.getLineBreaks()).boxed().toList())
|
||||
.stringIdxToPositionIdx(Arrays.stream(documentPositions.getStringIdxToPositionIdx()).boxed().toList())
|
||||
.positions(toRectangle2DList(documentPositions.getPositions()))
|
||||
.boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd()))
|
||||
.searchText(documentTextData.getSearchText())
|
||||
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList())
|
||||
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList())
|
||||
.positions(toRectangle2DList(documentPositionData.getPositions()))
|
||||
.parent(parent)
|
||||
.build();
|
||||
}
|
||||
|
||||
@ -4,12 +4,14 @@ import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image;
|
||||
@ -26,23 +28,25 @@ public class DocumentDataMapper {
|
||||
|
||||
public DocumentData toDocumentData(Document document) {
|
||||
|
||||
List<DocumentText> documentTextBlockData = document.streamTerminalTextBlocksInOrder()
|
||||
List<DocumentTextData> documentTextData = document.streamTerminalTextBlocksInOrder()
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
|
||||
.distinct()
|
||||
.map(DocumentDataMapper::toAtomicTextBlockData)
|
||||
.toList();
|
||||
|
||||
List<DocumentPositions> atomicPositionBlockData = document.streamTerminalTextBlocksInOrder()
|
||||
List<DocumentPositionData> atomicPositionBlockData = document.streamTerminalTextBlocksInOrder()
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
|
||||
.distinct()
|
||||
.map(DocumentDataMapper::toAtomicPositionBlockData)
|
||||
.toList();
|
||||
|
||||
Set<Long> nonEmptyTextBlocks = documentTextData.stream().mapToLong(DocumentTextData::getId).boxed().collect(Collectors.toSet());
|
||||
|
||||
List<DocumentPage> documentPageData = document.getPages().stream().map(DocumentDataMapper::toPageData).toList();
|
||||
DocumentStructure tableOfContentsData = toDocumentTreeData(document.getDocumentTree());
|
||||
return DocumentData.builder()
|
||||
.documentTexts(documentTextBlockData.toArray(new DocumentText[0]))
|
||||
.documentPositions(atomicPositionBlockData.toArray(new DocumentPositions[0]))
|
||||
.documentTextData(documentTextData.toArray(new DocumentTextData[0]))
|
||||
.documentPositions(atomicPositionBlockData.toArray(new DocumentPositionData[0]))
|
||||
.documentPages(documentPageData.toArray(new DocumentPage[0]))
|
||||
.documentStructure(tableOfContentsData)
|
||||
.build();
|
||||
@ -95,9 +99,9 @@ public class DocumentDataMapper {
|
||||
}
|
||||
|
||||
|
||||
private DocumentText toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) {
|
||||
private DocumentTextData toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) {
|
||||
|
||||
return DocumentText.builder()
|
||||
return DocumentTextData.builder()
|
||||
.id(atomicTextBlock.getId())
|
||||
.page(atomicTextBlock.getPage().getNumber().longValue())
|
||||
.searchText(atomicTextBlock.getSearchText())
|
||||
@ -109,9 +113,9 @@ public class DocumentDataMapper {
|
||||
}
|
||||
|
||||
|
||||
private DocumentPositions toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) {
|
||||
private DocumentPositionData toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) {
|
||||
|
||||
return DocumentPositions.builder()
|
||||
return DocumentPositionData.builder()
|
||||
.id(atomicTextBlock.getId())
|
||||
.positions(toPrimitiveFloatMatrix(atomicTextBlock.getPositions()))
|
||||
.stringIdxToPositionIdx(toPrimitiveIntArray(atomicTextBlock.getStringIdxToPositionIdx()))
|
||||
|
||||
@ -7,8 +7,8 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
||||
@ -154,10 +154,10 @@ public class DocumentGraphMapper {
|
||||
|
||||
private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) {
|
||||
|
||||
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||
return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||
parent,
|
||||
getPage(context.documentTextBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
|
||||
getPage(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context));
|
||||
}
|
||||
|
||||
|
||||
@ -180,15 +180,15 @@ public class DocumentGraphMapper {
|
||||
|
||||
private final DocumentTree documentTree;
|
||||
private final List<Page> pages;
|
||||
private final List<DocumentText> documentTextBlockData;
|
||||
private final List<DocumentPositions> atomicPositionBlockData;
|
||||
private final List<DocumentTextData> documentTextDataBlockData;
|
||||
private final List<DocumentPositionData> atomicPositionBlockData;
|
||||
|
||||
|
||||
Context(DocumentData documentData, DocumentTree documentTree) {
|
||||
|
||||
this.documentTree = documentTree;
|
||||
this.pages = new LinkedList<>();
|
||||
this.documentTextBlockData = Arrays.stream(documentData.getDocumentTexts()).toList();
|
||||
this.documentTextDataBlockData = Arrays.stream(documentData.getDocumentTextData()).toList();
|
||||
this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions()).toList();
|
||||
|
||||
}
|
||||
|
||||
@ -149,7 +149,7 @@ public class RectangleTransformations {
|
||||
@Override
|
||||
public BiConsumer<BBox, Rectangle2D> accumulator() {
|
||||
|
||||
return (bb, rect) -> bb.addRectangle(rect.getMinX(), rect.getMinY(), rect.getMaxX(), rect.getMaxY());
|
||||
return BBox::addRectangle;
|
||||
}
|
||||
|
||||
|
||||
@ -166,7 +166,7 @@ public class RectangleTransformations {
|
||||
@Override
|
||||
public Function<BBox, Rectangle2D> finisher() {
|
||||
|
||||
return bb -> new Rectangle2D.Double(bb.lowerLeftX, bb.lowerLeftY, bb.upperRightX - bb.lowerLeftX, bb.upperRightY - bb.lowerLeftY);
|
||||
return BBox::toRectangle2D;
|
||||
}
|
||||
|
||||
|
||||
@ -187,7 +187,21 @@ public class RectangleTransformations {
|
||||
Double upperRightY;
|
||||
|
||||
|
||||
public void addRectangle(double lowerLeftX, double lowerLeftY, double upperRightX, double upperRightY) {
|
||||
public Rectangle2D toRectangle2D() {
|
||||
|
||||
if (lowerLeftX == null || lowerLeftY == null || upperRightX == null || upperRightY == null) {
|
||||
return new Rectangle2D.Double(0, 0, 0, 0);
|
||||
}
|
||||
return new Rectangle2D.Double(lowerLeftX, lowerLeftY, upperRightX - lowerLeftX, upperRightY - lowerLeftY);
|
||||
}
|
||||
|
||||
|
||||
public void addRectangle(Rectangle2D rectangle2D) {
|
||||
|
||||
double lowerLeftX = Math.min(rectangle2D.getMinX(), rectangle2D.getMaxX());
|
||||
double lowerLeftY = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY());
|
||||
double upperRightX = Math.max(rectangle2D.getMinX(), rectangle2D.getMaxX());
|
||||
double upperRightY = Math.max(rectangle2D.getMinY(), rectangle2D.getMaxY());
|
||||
|
||||
if (this.lowerLeftX == null) {
|
||||
this.lowerLeftX = lowerLeftX;
|
||||
|
||||
@ -1,20 +1,27 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.util.QuickSort;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
public class TextPositionOperations {
|
||||
|
||||
private static final TextPositionSequenceComparator comparator = new TextPositionSequenceComparator();
|
||||
|
||||
|
||||
public static List<TextPositionSequence> mergeAndSortTextPositionSequenceByYThenX(List<TextPageBlock> textBlocks) {
|
||||
|
||||
return textBlocks.stream()//
|
||||
.flatMap(tb -> tb.getSequences().stream())//
|
||||
.sorted(Comparator.comparingDouble(TextPositionSequence::getMaxYDirAdj)//
|
||||
.thenComparing(TextPositionSequence::getMaxXDirAdj))//
|
||||
.toList();
|
||||
var sequence = textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
|
||||
|
||||
// because the TextPositionSequenceComparator is not transitive, but
|
||||
// JDK7+ enforces transitivity on comparators, we need to use
|
||||
// a custom quicksort implementation (which is slower, unfortunately).
|
||||
QuickSort.sort(sequence, comparator);
|
||||
return sequence;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -46,7 +46,7 @@ public class BdrJsonBuildTest extends BaseTest {
|
||||
|
||||
try (InputStream inputStream = new FileInputStream(filename)) {
|
||||
PDDocument pdDocument = Loader.loadPDF(inputStream);
|
||||
return layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.REDACT_MANAGER, pdDocument, new ImageServiceResponse(), new TableServiceResponse());
|
||||
return layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.TAAS, pdDocument, new ImageServiceResponse(), new TableServiceResponse());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -22,7 +22,7 @@ import lombok.SneakyThrows;
|
||||
public class BuildDocumentGraphTest extends BaseTest {
|
||||
|
||||
@Autowired
|
||||
private LayoutParsingPipeline layoutParsingPipeline;
|
||||
protected LayoutParsingPipeline layoutParsingPipeline;
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@ -55,4 +55,5 @@ public class BuildDocumentGraphTest extends BaseTest {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -1,14 +1,22 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper;
|
||||
|
||||
@ -17,22 +25,50 @@ import lombok.SneakyThrows;
|
||||
public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
@Disabled
|
||||
public void writeJsonForAllFilesTest() {
|
||||
|
||||
Path path = Path.of("/tmp/test_files");
|
||||
|
||||
Files.walk(path)
|
||||
.map(Path::toFile)
|
||||
.filter(File::isFile)
|
||||
.filter(file -> file.getName().endsWith(".pdf"))
|
||||
.peek(System.out::println)
|
||||
.map(File::toPath)
|
||||
.forEach(this::writeJsons);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
@Disabled
|
||||
public void writeJsonForFileTest() {
|
||||
|
||||
writeJsons("files/216");
|
||||
var resource = new ClassPathResource("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
writeJsons(resource.getFile().toPath());
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void writeJsons(String filename) {
|
||||
|
||||
Document documentGraph = buildGraph(filename);
|
||||
@SneakyThrows
|
||||
private void writeJsons(Path filename) {
|
||||
|
||||
Document documentGraph = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(new FileInputStream(filename.toFile())),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse());
|
||||
|
||||
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
||||
ObjectMapper mapper = ObjectMapperFactory.create();
|
||||
mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_structure", ".json")), documentData.getDocumentStructure());
|
||||
mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_text", ".json")), documentData.getDocumentTexts());
|
||||
mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_positions", ".json")), documentData.getDocumentPositions());
|
||||
mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_pages", ".json")), documentData.getDocumentPages());
|
||||
|
||||
var stem = Path.of("/tmp/DocumentGraphJsonWritingTest");
|
||||
stem.toFile().mkdirs();
|
||||
var tmpFilePath = stem.resolve(filename.getFileName());
|
||||
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_structure" + ".json")), documentData.getDocumentStructure());
|
||||
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_text" + ".json")), documentData.getDocumentTextData());
|
||||
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_positions" + ".json")), documentData.getDocumentPositions());
|
||||
mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_pages" + ".json")), documentData.getDocumentPages());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -7,9 +7,9 @@ import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
|
||||
@ -32,21 +32,21 @@ public class DocumentGraphMappingTest extends BuildDocumentGraphTest {
|
||||
DocumentData documentData = DocumentDataMapper.toDocumentData(document);
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_PAGES" + ".json", documentData.getDocumentPages());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", documentData.getDocumentTexts());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", documentData.getDocumentTextData());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_POSITIONS" + ".json", documentData.getDocumentPositions());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".json", documentData.getDocumentStructure());
|
||||
|
||||
DocumentPage[] pageData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_PAGES" + ".json", DocumentPage[].class);
|
||||
DocumentText[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", DocumentText[].class);
|
||||
DocumentPositions[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
|
||||
DocumentTextData[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", DocumentTextData[].class);
|
||||
DocumentPositionData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
|
||||
filename + "_POSITIONS" + ".json",
|
||||
DocumentPositions[].class);
|
||||
DocumentPositionData[].class);
|
||||
DocumentStructure documentTreeData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".json", DocumentStructure.class);
|
||||
|
||||
DocumentData documentData2 = DocumentData.builder()
|
||||
.documentPages(pageData)
|
||||
.documentStructure(documentTreeData)
|
||||
.documentTexts(atomicTextBlockData)
|
||||
.documentTextData(atomicTextBlockData)
|
||||
.documentPositions(atomicPositionBlockData)
|
||||
.build();
|
||||
Document newDocument = DocumentGraphMapper.toDocumentGraph(documentData2);
|
||||
|
||||
@ -26,7 +26,7 @@ class GapAcrossLinesDetectionServiceTest {
|
||||
@SneakyThrows
|
||||
public void testGapBasedColumnDetection() {
|
||||
|
||||
String filename = "files/211.pdf";
|
||||
String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
@ -52,7 +52,7 @@ class GapAcrossLinesDetectionServiceTest {
|
||||
@SneakyThrows
|
||||
public void testColumnDetection() {
|
||||
|
||||
String filename = "files/211.pdf";
|
||||
String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
@ -27,7 +27,7 @@ class InvisibleTableDetectionServiceTest {
|
||||
@SneakyThrows
|
||||
public void detectInvisibleTableTest() {
|
||||
|
||||
String fileName = "files/211.pdf";
|
||||
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
|
||||
List<PageInformation> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
|
||||
|
||||
|
||||
@ -18,7 +18,7 @@ class MainBodyTextFrameExtractionServiceTest {
|
||||
@SneakyThrows
|
||||
public void testMainBodyDetection() {
|
||||
|
||||
String fileName = "files/211.pdf";
|
||||
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString();
|
||||
List<PageContents> sortedTextPositionSequence = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
|
||||
|
||||
|
||||
@ -20,7 +20,7 @@ class PageInformationServiceTest {
|
||||
@SneakyThrows
|
||||
public void testGapDetection() {
|
||||
|
||||
String filename = "files/211.pdf";
|
||||
String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
@ -43,7 +43,7 @@ class PageInformationServiceTest {
|
||||
@SneakyThrows
|
||||
public void testLineDetection() {
|
||||
|
||||
String filename = "files/211.pdf";
|
||||
String filename = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
@ -21,7 +21,7 @@ class TextPositionSequenceSorterTest {
|
||||
@SneakyThrows
|
||||
public void testTextPositionSequenceExtraction() {
|
||||
|
||||
String fileName = "files/211.pdf";
|
||||
String fileName = "files/invisible_tables/test-two-pages_ocred.pdf";
|
||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
|
||||
|
||||
List<PageContents> textPositionPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user