diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentData.java index 4d135db..3aec1ec 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentData.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentData.java @@ -13,8 +13,8 @@ import lombok.experimental.FieldDefaults; public class DocumentData { DocumentPage[] documentPages; - DocumentText[] documentTexts; - DocumentPositions[] documentPositions; + DocumentTextData[] documentTextData; + DocumentPositionData[] documentPositions; DocumentStructure documentStructure; diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentPositions.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentPositionData.java similarity index 91% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentPositions.java rename to layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentPositionData.java index 14cbfd3..4a21644 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentPositions.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentPositionData.java @@ -10,7 +10,7 @@ import lombok.experimental.FieldDefaults; @Builder @AllArgsConstructor @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -public class DocumentPositions { +public class DocumentPositionData { Long id; int[] stringIdxToPositionIdx; diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentText.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentTextData.java similarity index 93% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentText.java rename to layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentTextData.java index a420391..09b90d8 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentText.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentTextData.java @@ -12,7 +12,7 @@ import lombok.experimental.FieldDefaults; @Builder @AllArgsConstructor @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -public class DocumentText { +public class DocumentTextData { Long id; Long page; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java index 4174dfe..64d2f10 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -15,8 +15,8 @@ import org.springframework.stereotype.Service; import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid; import com.iqser.red.storage.commons.service.StorageService; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; @@ -72,7 +72,7 @@ public class LayoutParsingStorageService { public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) { storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentStructure()); - storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getDocumentTexts()); + storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getDocumentTextData()); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getDocumentPositions()); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getDocumentPages()); } @@ -93,18 +93,18 @@ public class LayoutParsingStorageService { public DocumentData readDocumentData(LayoutParsingRequest layoutParsingRequest) throws IOException { DocumentPage[] documentPageData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), DocumentPage[].class); - DocumentText[] documentTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(), + DocumentTextData[] documentTextDataBlockData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), - DocumentText[].class); - DocumentPositions[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(), + DocumentTextData[].class); + DocumentPositionData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), - DocumentPositions[].class); + DocumentPositionData[].class); DocumentStructure tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), DocumentStructure.class); return DocumentData.builder() .documentStructure(tableOfContentsData) .documentPositions(atomicPositionBlockData) - .documentTexts(documentTextBlockData) + .documentTextData(documentTextDataBlockData) .documentPages(documentPageData) .build(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java index 3589743..85719de 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java @@ -11,8 +11,8 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; @@ -109,20 +109,20 @@ public class AtomicTextBlock implements TextBlock { } - public static AtomicTextBlock fromAtomicTextBlockData(DocumentText documentText, - DocumentPositions documentPositions, + public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, + DocumentPositionData documentPositionData, SemanticNode parent, Page page) { return AtomicTextBlock.builder() - .id(documentText.getId()) - .numberOnPage(documentText.getNumberOnPage()) + .id(documentTextData.getId()) + .numberOnPage(documentTextData.getNumberOnPage()) .page(page) - .boundary(new Boundary(documentText.getStart(), documentText.getEnd())) - .searchText(documentText.getSearchText()) - .lineBreaks(Arrays.stream(documentText.getLineBreaks()).boxed().toList()) - .stringIdxToPositionIdx(Arrays.stream(documentPositions.getStringIdxToPositionIdx()).boxed().toList()) - .positions(toRectangle2DList(documentPositions.getPositions())) + .boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd())) + .searchText(documentTextData.getSearchText()) + .lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList()) + .stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList()) + .positions(toRectangle2DList(documentPositionData.getPositions())) .parent(parent) .build(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentDataMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentDataMapper.java index ff5d455..c9559ee 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentDataMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentDataMapper.java @@ -4,12 +4,14 @@ import java.awt.geom.Rectangle2D; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image; @@ -26,23 +28,25 @@ public class DocumentDataMapper { public DocumentData toDocumentData(Document document) { - List documentTextBlockData = document.streamTerminalTextBlocksInOrder() + List documentTextData = document.streamTerminalTextBlocksInOrder() .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream()) .distinct() .map(DocumentDataMapper::toAtomicTextBlockData) .toList(); - List atomicPositionBlockData = document.streamTerminalTextBlocksInOrder() + List atomicPositionBlockData = document.streamTerminalTextBlocksInOrder() .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream()) .distinct() .map(DocumentDataMapper::toAtomicPositionBlockData) .toList(); + Set nonEmptyTextBlocks = documentTextData.stream().mapToLong(DocumentTextData::getId).boxed().collect(Collectors.toSet()); + List documentPageData = document.getPages().stream().map(DocumentDataMapper::toPageData).toList(); DocumentStructure tableOfContentsData = toDocumentTreeData(document.getDocumentTree()); return DocumentData.builder() - .documentTexts(documentTextBlockData.toArray(new DocumentText[0])) - .documentPositions(atomicPositionBlockData.toArray(new DocumentPositions[0])) + .documentTextData(documentTextData.toArray(new DocumentTextData[0])) + .documentPositions(atomicPositionBlockData.toArray(new DocumentPositionData[0])) .documentPages(documentPageData.toArray(new DocumentPage[0])) .documentStructure(tableOfContentsData) .build(); @@ -95,9 +99,9 @@ public class DocumentDataMapper { } - private DocumentText toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) { + private DocumentTextData toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) { - return DocumentText.builder() + return DocumentTextData.builder() .id(atomicTextBlock.getId()) .page(atomicTextBlock.getPage().getNumber().longValue()) .searchText(atomicTextBlock.getSearchText()) @@ -109,9 +113,9 @@ public class DocumentDataMapper { } - private DocumentPositions toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) { + private DocumentPositionData toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) { - return DocumentPositions.builder() + return DocumentPositionData.builder() .id(atomicTextBlock.getId()) .positions(toPrimitiveFloatMatrix(atomicTextBlock.getPositions())) .stringIdxToPositionIdx(toPrimitiveIntArray(atomicTextBlock.getStringIdxToPositionIdx())) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentGraphMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentGraphMapper.java index cc3b441..b2b3472 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentGraphMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentGraphMapper.java @@ -7,8 +7,8 @@ import java.util.List; import java.util.Map; import java.util.NoSuchElementException; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; @@ -154,10 +154,10 @@ public class DocumentGraphMapper { private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) { - return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextBlockData.get(Math.toIntExact(atomicTextBlockId)), + return AtomicTextBlock.fromAtomicTextBlockData(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)), context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)), parent, - getPage(context.documentTextBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context)); + getPage(context.documentTextDataBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context)); } @@ -180,15 +180,15 @@ public class DocumentGraphMapper { private final DocumentTree documentTree; private final List pages; - private final List documentTextBlockData; - private final List atomicPositionBlockData; + private final List documentTextDataBlockData; + private final List atomicPositionBlockData; Context(DocumentData documentData, DocumentTree documentTree) { this.documentTree = documentTree; this.pages = new LinkedList<>(); - this.documentTextBlockData = Arrays.stream(documentData.getDocumentTexts()).toList(); + this.documentTextDataBlockData = Arrays.stream(documentData.getDocumentTextData()).toList(); this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions()).toList(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java index 8dc23ad..2a7b4eb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java @@ -149,7 +149,7 @@ public class RectangleTransformations { @Override public BiConsumer accumulator() { - return (bb, rect) -> bb.addRectangle(rect.getMinX(), rect.getMinY(), rect.getMaxX(), rect.getMaxY()); + return BBox::addRectangle; } @@ -166,7 +166,7 @@ public class RectangleTransformations { @Override public Function finisher() { - return bb -> new Rectangle2D.Double(bb.lowerLeftX, bb.lowerLeftY, bb.upperRightX - bb.lowerLeftX, bb.upperRightY - bb.lowerLeftY); + return BBox::toRectangle2D; } @@ -187,7 +187,21 @@ public class RectangleTransformations { Double upperRightY; - public void addRectangle(double lowerLeftX, double lowerLeftY, double upperRightX, double upperRightY) { + public Rectangle2D toRectangle2D() { + + if (lowerLeftX == null || lowerLeftY == null || upperRightX == null || upperRightY == null) { + return new Rectangle2D.Double(0, 0, 0, 0); + } + return new Rectangle2D.Double(lowerLeftX, lowerLeftY, upperRightX - lowerLeftX, upperRightY - lowerLeftY); + } + + + public void addRectangle(Rectangle2D rectangle2D) { + + double lowerLeftX = Math.min(rectangle2D.getMinX(), rectangle2D.getMaxX()); + double lowerLeftY = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()); + double upperRightX = Math.max(rectangle2D.getMinX(), rectangle2D.getMaxX()); + double upperRightY = Math.max(rectangle2D.getMinY(), rectangle2D.getMaxY()); if (this.lowerLeftX == null) { this.lowerLeftX = lowerLeftX; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index fbd57c4..6e8dd5a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -1,20 +1,27 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; -import java.util.Comparator; import java.util.List; +import java.util.stream.Collectors; + +import org.apache.pdfbox.util.QuickSort; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; public class TextPositionOperations { + private static final TextPositionSequenceComparator comparator = new TextPositionSequenceComparator(); + + public static List mergeAndSortTextPositionSequenceByYThenX(List textBlocks) { - return textBlocks.stream()// - .flatMap(tb -> tb.getSequences().stream())// - .sorted(Comparator.comparingDouble(TextPositionSequence::getMaxYDirAdj)// - .thenComparing(TextPositionSequence::getMaxXDirAdj))// - .toList(); + var sequence = textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList()); + + // because the TextPositionSequenceComparator is not transitive, but + // JDK7+ enforces transitivity on comparators, we need to use + // a custom quicksort implementation (which is slower, unfortunately). + QuickSort.sort(sequence, comparator); + return sequence; } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java index 86b2a66..34a5958 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java @@ -46,7 +46,7 @@ public class BdrJsonBuildTest extends BaseTest { try (InputStream inputStream = new FileInputStream(filename)) { PDDocument pdDocument = Loader.loadPDF(inputStream); - return layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.REDACT_MANAGER, pdDocument, new ImageServiceResponse(), new TableServiceResponse()); + return layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.TAAS, pdDocument, new ImageServiceResponse(), new TableServiceResponse()); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java index 094cb5c..71f9ff5 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java @@ -22,7 +22,7 @@ import lombok.SneakyThrows; public class BuildDocumentGraphTest extends BaseTest { @Autowired - private LayoutParsingPipeline layoutParsingPipeline; + protected LayoutParsingPipeline layoutParsingPipeline; @Test @Disabled @@ -55,4 +55,5 @@ public class BuildDocumentGraphTest extends BaseTest { } } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java index 1527a54..4d258d6 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java @@ -1,14 +1,22 @@ package com.knecon.fforesight.service.layoutparser.server.graph; import java.io.File; +import java.io.FileInputStream; import java.io.FileOutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import org.apache.pdfbox.Loader; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.springframework.core.io.ClassPathResource; import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.commons.jackson.ObjectMapperFactory; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper; @@ -17,22 +25,50 @@ import lombok.SneakyThrows; public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest { @Test + @SneakyThrows + @Disabled + public void writeJsonForAllFilesTest() { + + Path path = Path.of("/tmp/test_files"); + + Files.walk(path) + .map(Path::toFile) + .filter(File::isFile) + .filter(file -> file.getName().endsWith(".pdf")) + .peek(System.out::println) + .map(File::toPath) + .forEach(this::writeJsons); + } + + + @Test + @SneakyThrows @Disabled public void writeJsonForFileTest() { - writeJsons("files/216"); + var resource = new ClassPathResource("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); + writeJsons(resource.getFile().toPath()); } - @SneakyThrows - private void writeJsons(String filename) { - Document documentGraph = buildGraph(filename); + @SneakyThrows + private void writeJsons(Path filename) { + + Document documentGraph = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + Loader.loadPDF(new FileInputStream(filename.toFile())), + new ImageServiceResponse(), + new TableServiceResponse()); + DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph); ObjectMapper mapper = ObjectMapperFactory.create(); - mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_structure", ".json")), documentData.getDocumentStructure()); - mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_text", ".json")), documentData.getDocumentTexts()); - mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_positions", ".json")), documentData.getDocumentPositions()); - mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_pages", ".json")), documentData.getDocumentPages()); + + var stem = Path.of("/tmp/DocumentGraphJsonWritingTest"); + stem.toFile().mkdirs(); + var tmpFilePath = stem.resolve(filename.getFileName()); + mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_structure" + ".json")), documentData.getDocumentStructure()); + mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_text" + ".json")), documentData.getDocumentTextData()); + mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_positions" + ".json")), documentData.getDocumentPositions()); + mapper.writeValue(new FileOutputStream(new File(tmpFilePath + "_pages" + ".json")), documentData.getDocumentPages()); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java index 6c59c1c..63a8c58 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java @@ -7,9 +7,9 @@ import org.junit.jupiter.api.Test; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositions; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentText; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table; @@ -32,21 +32,21 @@ public class DocumentGraphMappingTest extends BuildDocumentGraphTest { DocumentData documentData = DocumentDataMapper.toDocumentData(document); storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_PAGES" + ".json", documentData.getDocumentPages()); - storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", documentData.getDocumentTexts()); + storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", documentData.getDocumentTextData()); storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_POSITIONS" + ".json", documentData.getDocumentPositions()); storageService.storeJSONObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".json", documentData.getDocumentStructure()); DocumentPage[] pageData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_PAGES" + ".json", DocumentPage[].class); - DocumentText[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", DocumentText[].class); - DocumentPositions[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(), + DocumentTextData[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_TEXT" + ".json", DocumentTextData[].class); + DocumentPositionData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_POSITIONS" + ".json", - DocumentPositions[].class); + DocumentPositionData[].class); DocumentStructure documentTreeData = storageService.readJSONObject(TenantContext.getTenantId(), filename + "_STRUCTURE" + ".json", DocumentStructure.class); DocumentData documentData2 = DocumentData.builder() .documentPages(pageData) .documentStructure(documentTreeData) - .documentTexts(atomicTextBlockData) + .documentTextData(atomicTextBlockData) .documentPositions(atomicPositionBlockData) .build(); Document newDocument = DocumentGraphMapper.toDocumentGraph(documentData2); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java index 56d0126..4add2a3 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java @@ -26,7 +26,7 @@ class GapAcrossLinesDetectionServiceTest { @SneakyThrows public void testGapBasedColumnDetection() { - String filename = "files/211.pdf"; + String filename = "files/invisible_tables/test-two-pages_ocred.pdf"; var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf"; System.out.println("start TextPosition extraction"); long start = System.currentTimeMillis(); @@ -52,7 +52,7 @@ class GapAcrossLinesDetectionServiceTest { @SneakyThrows public void testColumnDetection() { - String filename = "files/211.pdf"; + String filename = "files/invisible_tables/test-two-pages_ocred.pdf"; var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf"; System.out.println("start TextPosition extraction"); long start = System.currentTimeMillis(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java index 628a66f..b1bdeaa 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java @@ -27,7 +27,7 @@ class InvisibleTableDetectionServiceTest { @SneakyThrows public void detectInvisibleTableTest() { - String fileName = "files/211.pdf"; + String fileName = "files/invisible_tables/test-two-pages_ocred.pdf"; var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString(); List pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName).stream().map(PageInformationService::build).collect(Collectors.toList()); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java index cb50c0a..b9f04bc 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java @@ -18,7 +18,7 @@ class MainBodyTextFrameExtractionServiceTest { @SneakyThrows public void testMainBodyDetection() { - String fileName = "files/211.pdf"; + String fileName = "files/invisible_tables/test-two-pages_ocred.pdf"; String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString(); List sortedTextPositionSequence = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java index f256e66..6a5582f 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java @@ -20,7 +20,7 @@ class PageInformationServiceTest { @SneakyThrows public void testGapDetection() { - String filename = "files/211.pdf"; + String filename = "files/invisible_tables/test-two-pages_ocred.pdf"; var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf"; System.out.println("start TextPosition extraction"); long start = System.currentTimeMillis(); @@ -43,7 +43,7 @@ class PageInformationServiceTest { @SneakyThrows public void testLineDetection() { - String filename = "files/211.pdf"; + String filename = "files/invisible_tables/test-two-pages_ocred.pdf"; var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf"; System.out.println("start TextPosition extraction"); long start = System.currentTimeMillis(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextPositionSequenceSorterTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextPositionSequenceSorterTest.java index 5f3858c..a4b2442 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextPositionSequenceSorterTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextPositionSequenceSorterTest.java @@ -21,7 +21,7 @@ class TextPositionSequenceSorterTest { @SneakyThrows public void testTextPositionSequenceExtraction() { - String fileName = "files/211.pdf"; + String fileName = "files/invisible_tables/test-two-pages_ocred.pdf"; var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString(); List textPositionPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName); diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/invisible_tables/test-two-pages_ocred.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/invisible_tables/test-two-pages_ocred.pdf new file mode 100644 index 0000000..018e348 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/invisible_tables/test-two-pages_ocred.pdf differ