diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ParagraphData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ParagraphData.java index 8995ca8..d4c6251 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ParagraphData.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ParagraphData.java @@ -12,6 +12,7 @@ public class ParagraphData { private String text; List boldTextBoundaries; List italicTextBoundaries; + List linebreaks; private String classification; private String orientation; diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/RowData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/RowData.java index 6d021d4..388275b 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/RowData.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/RowData.java @@ -10,6 +10,6 @@ import lombok.Data; public class RowData { boolean header; - List cellText; + List cellText; float[] bBox; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/taas/TaasDocumentDataMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/taas/TaasDocumentDataMapper.java index c0f40ca..de18ba2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/taas/TaasDocumentDataMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/taas/TaasDocumentDataMapper.java @@ -13,6 +13,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Researc import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData; import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject; import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData; +import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; @@ -23,6 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.Text public class TaasDocumentDataMapper { public static ResearchDocumentData fromDocument(Document document) { + AtomicInteger structureObjectNumber = new AtomicInteger(); List structureObjects = document.streamAllSubNodes() .filter(node -> !node.getType().equals(NodeType.TABLE_CELL)) @@ -38,24 +40,32 @@ public class TaasDocumentDataMapper { return ResearchDocumentData.builder().structureObjects(structureObjects).build(); } + public static ParagraphData fromTextBlock(String classification, TextBlock textBlock) { return ParagraphData.builder() .boldTextBoundaries(textBlock.getBoldTextBoundaries().stream().map(b -> new Range(b.start(), b.end())).toList()) .italicTextBoundaries(textBlock.getItalicTextBoundaries().stream().map(b -> new Range(b.start(), b.end())).toList()) .text(textBlock.getSearchText()) + .linebreaks(textBlock.getLineBreaks()) .classification(classification) .orientation(textBlock.getOrientation()) .textDirection(textBlock.getTextDirection()) .build(); } + public static TableData fromTable(Table table) { - List rowData = IntStream.range(0, table.getNumberOfRows()).boxed().map(rowIdx -> table.streamRow(rowIdx).toList()).map(TaasDocumentDataMapper::fromTableCells).toList(); + List rowData = IntStream.range(0, table.getNumberOfRows()) + .boxed() + .map(rowIdx -> table.streamRow(rowIdx).toList()) + .map(TaasDocumentDataMapper::fromTableCells) + .toList(); return new TableData(rowData, table.getNumberOfCols(), table.getNumberOfRows()); } + public static RowData fromTableCells(List tableCells) { if (tableCells.isEmpty()) { @@ -67,8 +77,20 @@ public class TaasDocumentDataMapper { map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D())); return map2; }).orElseThrow().get(firstPage); - List cellText = tableCells.stream().map(TableCell::getTextBlock).map(TextBlock::getSearchText).toList(); - return new RowData(header, cellText, toFloatArray(bBox)); + List textBlocks = tableCells.stream().map(TableCell::getTextBlock).toList(); + return new RowData(header, textBlocks.stream().map(textBlock -> TaasDocumentDataMapper.fromTextBlock("table_cell", textBlock)).toList(), toFloatArray(bBox)); + } + + + private static Range toRange(Boundary boundary) { + + return new Range(boundary.start(), boundary.end()); + } + + + private static List toRange(List boundary) { + + return boundary.stream().map(TaasDocumentDataMapper::toRange).toList(); } @@ -101,8 +123,10 @@ public class TaasDocumentDataMapper { .build(); } + private static float[] toFloatArray(Rectangle2D bBox) { return new float[]{(float) bBox.getX(), (float) bBox.getY(), (float) bBox.getWidth(), (float) bBox.getHeight()}; } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java new file mode 100644 index 0000000..69f98ff --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java @@ -0,0 +1,5 @@ +package com.knecon.fforesight.service.layoutparser.server.graph; + +public class DocumentDataTests { + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/RotateTestFileWithImages.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/RotateTestFileWithImages.pdf deleted file mode 100644 index 2b009d1..0000000 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/RotateTestFileWithImages.pdf and /dev/null differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf deleted file mode 100644 index a145741..0000000 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf and /dev/null differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/crafted document.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/crafted document.pdf deleted file mode 100644 index be18a14..0000000 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/crafted document.pdf and /dev/null differ