diff --git a/config/proto/DocumentPage.proto b/config/proto/DocumentPage.proto new file mode 100644 index 0000000..a3d518b --- /dev/null +++ b/config/proto/DocumentPage.proto @@ -0,0 +1,21 @@ +syntax = "proto3"; + +message AllDocumentPages { + + repeated DocumentPage documentPages = 1; +} + +message DocumentPage { + // The page number, starting with 1. + int32 number = 1; + + // The page height in PDF user units. + int32 height = 2; + + // The page width in PDF user units. + int32 width = 3; + + // The page rotation as specified by the PDF. + int32 rotation = 4; +} + diff --git a/config/proto/DocumentPositionData.proto b/config/proto/DocumentPositionData.proto new file mode 100644 index 0000000..8dc8493 --- /dev/null +++ b/config/proto/DocumentPositionData.proto @@ -0,0 +1,28 @@ +syntax = "proto3"; + +message AllDocumentPositionData { + + repeated DocumentPositionData documentPositionData = 1; +} + +message DocumentPositionData { + // Identifier of the text block. + int64 id = 1; + + // For each string coordinate in the search text of the text block, the array contains an entry relating the string coordinate to the position coordinate. + // This is required because the text and position coordinates are not equal. + repeated int32 stringIdxToPositionIdx = 2; + + // The bounding box for each glyph as a rectangle. This matrix is of size (n,4), where n is the number of glyphs in the text block. + // The second dimension specifies the rectangle with the values x, y, width, height, where x, y specify the lower left corner. + // In order to access this information, the stringIdxToPositionIdx array must be used to transform the coordinates. + repeated Position positions = 3; + + // Definition of a BoundingBox that contains x, y, width, and height. 
+ message Position { + float x = 1; + float y = 2; + float width = 3; + float height = 4; + } +} diff --git a/config/proto/DocumentStructure.proto b/config/proto/DocumentStructure.proto new file mode 100644 index 0000000..fc7aea6 --- /dev/null +++ b/config/proto/DocumentStructure.proto @@ -0,0 +1,8 @@ +syntax = "proto3"; + +import "EntryData.proto"; + +message DocumentStructure { + // The root EntryData represents the Document. + EntryData root = 1; +} diff --git a/config/proto/DocumentTextData.proto b/config/proto/DocumentTextData.proto new file mode 100644 index 0000000..9f187ce --- /dev/null +++ b/config/proto/DocumentTextData.proto @@ -0,0 +1,29 @@ +syntax = "proto3"; + +message AllDocumentTextData { + + repeated DocumentTextData documentTextData = 1; +} + +message DocumentTextData { + // Identifier of the text block. + int64 id = 1; + + // The page the text block occurs on. + int64 page = 2; + + // The text of the text block. + string searchText = 3; + + // Each text block is assigned a number on a page, starting from 0. + int32 numberOnPage = 4; + + // The text blocks are ordered; this number is the start of the text block as a string offset. + int32 start = 5; + + // The text blocks are ordered; this number is the end of the text block as a string offset. + int32 end = 6; + + // The line breaks in the text of this semantic node, given as string offsets. Each offset is an exclusive end. At the end of each semantic node there is an implicit line break. + repeated int32 lineBreaks = 7; +} diff --git a/config/proto/EntryData.proto b/config/proto/EntryData.proto new file mode 100644 index 0000000..09e7851 --- /dev/null +++ b/config/proto/EntryData.proto @@ -0,0 +1,27 @@ +syntax = "proto3"; + +import "LayoutEngine.proto"; +import "NodeType.proto"; + +message EntryData { + // Type of the semantic node. + NodeType type = 1; + + // Specifies the position in the parsed tree structure. 
+ repeated int32 treeId = 2; + + // Specifies the text block IDs associated with this semantic node. + repeated int64 atomicBlockIds = 3; + + // Specifies the pages this semantic node appears on. + repeated int64 pageNumbers = 4; + + // Some semantic nodes carry additional information; it is stored in this string-to-string map. + map<string, string> properties = 5; + + // All child Entries of this Entry. + repeated EntryData children = 6; + + // Describes the origin of the semantic node. + repeated LayoutEngine engines = 7; +} diff --git a/config/proto/LayoutEngine.proto b/config/proto/LayoutEngine.proto new file mode 100644 index 0000000..584da56 --- /dev/null +++ b/config/proto/LayoutEngine.proto @@ -0,0 +1,7 @@ +syntax = "proto3"; + +enum LayoutEngine { + ALGORITHM = 0; + AI = 1; + OUTLINE = 2; +} diff --git a/config/proto/NodeType.proto b/config/proto/NodeType.proto new file mode 100644 index 0000000..6cf9172 --- /dev/null +++ b/config/proto/NodeType.proto @@ -0,0 +1,14 @@ +syntax = "proto3"; + +enum NodeType { + DOCUMENT = 0; + SECTION = 1; + SUPER_SECTION = 2; + HEADLINE = 3; + PARAGRAPH = 4; + TABLE = 5; + TABLE_CELL = 6; + IMAGE = 7; + HEADER = 8; + FOOTER = 9; +} diff --git a/pyinfra/proto/DocumentPage_pb2.py b/pyinfra/proto/DocumentPage_pb2.py new file mode 100644 index 0000000..50ab439 --- /dev/null +++ b/pyinfra/proto/DocumentPage_pb2.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# NO CHECKED-IN PROTOBUF GENCODE +# source: DocumentPage.proto +# Protobuf Python Version: 5.27.2 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 5, + 27, + 2, + '', + 'DocumentPage.proto' +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x12\x44ocumentPage.proto\"8\n\x10\x41llDocumentPages\x12$\n\rdocumentPages\x18\x01 \x03(\x0b\x32\r.DocumentPage\"O\n\x0c\x44ocumentPage\x12\x0e\n\x06number\x18\x01 \x01(\x05\x12\x0e\n\x06height\x18\x02 \x01(\x05\x12\r\n\x05width\x18\x03 \x01(\x05\x12\x10\n\x08rotation\x18\x04 \x01(\x05\x62\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'DocumentPage_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + DESCRIPTOR._loaded_options = None + _globals['_ALLDOCUMENTPAGES']._serialized_start=22 + _globals['_ALLDOCUMENTPAGES']._serialized_end=78 + _globals['_DOCUMENTPAGE']._serialized_start=80 + _globals['_DOCUMENTPAGE']._serialized_end=159 +# @@protoc_insertion_point(module_scope) diff --git a/pyinfra/proto/DocumentPositionData_pb2.py b/pyinfra/proto/DocumentPositionData_pb2.py new file mode 100644 index 0000000..6d0a4a6 --- /dev/null +++ b/pyinfra/proto/DocumentPositionData_pb2.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# NO CHECKED-IN PROTOBUF GENCODE +# source: DocumentPositionData.proto +# Protobuf Python Version: 5.27.2 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 5, + 27, + 2, + '', + 'DocumentPositionData.proto' +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x1a\x44ocumentPositionData.proto\"N\n\x17\x41llDocumentPositionData\x12\x33\n\x14\x64ocumentPositionData\x18\x01 \x03(\x0b\x32\x15.DocumentPositionData\"\xb6\x01\n\x14\x44ocumentPositionData\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x1e\n\x16stringIdxToPositionIdx\x18\x02 \x03(\x05\x12\x31\n\tpositions\x18\x03 \x03(\x0b\x32\x1e.DocumentPositionData.Position\x1a?\n\x08Position\x12\t\n\x01x\x18\x01 \x01(\x02\x12\t\n\x01y\x18\x02 \x01(\x02\x12\r\n\x05width\x18\x03 \x01(\x02\x12\x0e\n\x06height\x18\x04 \x01(\x02\x62\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'DocumentPositionData_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + DESCRIPTOR._loaded_options = None + _globals['_ALLDOCUMENTPOSITIONDATA']._serialized_start=30 + _globals['_ALLDOCUMENTPOSITIONDATA']._serialized_end=108 + _globals['_DOCUMENTPOSITIONDATA']._serialized_start=111 + _globals['_DOCUMENTPOSITIONDATA']._serialized_end=293 + _globals['_DOCUMENTPOSITIONDATA_POSITION']._serialized_start=230 + _globals['_DOCUMENTPOSITIONDATA_POSITION']._serialized_end=293 +# @@protoc_insertion_point(module_scope) diff --git a/pyinfra/proto/DocumentStructure_pb2.py 
b/pyinfra/proto/DocumentStructure_pb2.py new file mode 100644 index 0000000..19d3312 --- /dev/null +++ b/pyinfra/proto/DocumentStructure_pb2.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# NO CHECKED-IN PROTOBUF GENCODE +# source: DocumentStructure.proto +# Protobuf Python Version: 5.27.2 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 5, + 27, + 2, + '', + 'DocumentStructure.proto' +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +import pyinfra.proto.EntryData_pb2 as EntryData__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x17\x44ocumentStructure.proto\x1a\x0f\x45ntryData.proto\"-\n\x11\x44ocumentStructure\x12\x18\n\x04root\x18\x01 \x01(\x0b\x32\n.EntryDatab\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'DocumentStructure_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + DESCRIPTOR._loaded_options = None + _globals['_DOCUMENTSTRUCTURE']._serialized_start=44 + _globals['_DOCUMENTSTRUCTURE']._serialized_end=89 +# @@protoc_insertion_point(module_scope) diff --git a/pyinfra/proto/DocumentTextData_pb2.py b/pyinfra/proto/DocumentTextData_pb2.py new file mode 100644 index 0000000..2945602 --- /dev/null +++ b/pyinfra/proto/DocumentTextData_pb2.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# NO CHECKED-IN PROTOBUF GENCODE +# source: DocumentTextData.proto +# Protobuf Python Version: 5.27.2 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 5, + 27, + 2, + '', + 'DocumentTextData.proto' +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x16\x44ocumentTextData.proto\"B\n\x13\x41llDocumentTextData\x12+\n\x10\x64ocumentTextData\x18\x01 \x03(\x0b\x32\x11.DocumentTextData\"\x86\x01\n\x10\x44ocumentTextData\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x0c\n\x04page\x18\x02 \x01(\x03\x12\x12\n\nsearchText\x18\x03 \x01(\t\x12\x14\n\x0cnumberOnPage\x18\x04 \x01(\x05\x12\r\n\x05start\x18\x05 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x06 \x01(\x05\x12\x12\n\nlineBreaks\x18\x07 \x03(\x05\x62\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'DocumentTextData_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + DESCRIPTOR._loaded_options = None + _globals['_ALLDOCUMENTTEXTDATA']._serialized_start=26 + _globals['_ALLDOCUMENTTEXTDATA']._serialized_end=92 + _globals['_DOCUMENTTEXTDATA']._serialized_start=95 + _globals['_DOCUMENTTEXTDATA']._serialized_end=229 +# @@protoc_insertion_point(module_scope) diff --git a/pyinfra/proto/EntryData_pb2.py b/pyinfra/proto/EntryData_pb2.py new file mode 100644 index 0000000..a5e1b9b --- /dev/null +++ b/pyinfra/proto/EntryData_pb2.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# NO CHECKED-IN PROTOBUF GENCODE +# source: EntryData.proto +# Protobuf Python Version: 5.27.2 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 5, + 27, + 2, + '', + 'EntryData.proto' +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +import pyinfra.proto.LayoutEngine_pb2 as LayoutEngine__pb2 +import pyinfra.proto.NodeType_pb2 as NodeType__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0f\x45ntryData.proto\x1a\x12LayoutEngine.proto\x1a\x0eNodeType.proto\"\x82\x02\n\tEntryData\x12\x17\n\x04type\x18\x01 \x01(\x0e\x32\t.NodeType\x12\x0e\n\x06treeId\x18\x02 \x03(\x05\x12\x16\n\x0e\x61tomicBlockIds\x18\x03 \x03(\x03\x12\x13\n\x0bpageNumbers\x18\x04 \x03(\x03\x12.\n\nproperties\x18\x05 \x03(\x0b\x32\x1a.EntryData.PropertiesEntry\x12\x1c\n\x08\x63hildren\x18\x06 \x03(\x0b\x32\n.EntryData\x12\x1e\n\x07\x65ngines\x18\x07 \x03(\x0e\x32\r.LayoutEngine\x1a\x31\n\x0fPropertiesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x62\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'EntryData_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + DESCRIPTOR._loaded_options = None + _globals['_ENTRYDATA_PROPERTIESENTRY']._loaded_options = None + _globals['_ENTRYDATA_PROPERTIESENTRY']._serialized_options = b'8\001' + _globals['_ENTRYDATA']._serialized_start=56 + _globals['_ENTRYDATA']._serialized_end=314 + _globals['_ENTRYDATA_PROPERTIESENTRY']._serialized_start=265 + 
_globals['_ENTRYDATA_PROPERTIESENTRY']._serialized_end=314 +# @@protoc_insertion_point(module_scope) diff --git a/pyinfra/proto/LayoutEngine_pb2.py b/pyinfra/proto/LayoutEngine_pb2.py new file mode 100644 index 0000000..4fa69cf --- /dev/null +++ b/pyinfra/proto/LayoutEngine_pb2.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# NO CHECKED-IN PROTOBUF GENCODE +# source: LayoutEngine.proto +# Protobuf Python Version: 5.27.2 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 5, + 27, + 2, + '', + 'LayoutEngine.proto' +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x12LayoutEngine.proto*2\n\x0cLayoutEngine\x12\r\n\tALGORITHM\x10\x00\x12\x06\n\x02\x41I\x10\x01\x12\x0b\n\x07OUTLINE\x10\x02\x62\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'LayoutEngine_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + DESCRIPTOR._loaded_options = None + _globals['_LAYOUTENGINE']._serialized_start=22 + _globals['_LAYOUTENGINE']._serialized_end=72 +# @@protoc_insertion_point(module_scope) diff --git a/pyinfra/proto/NodeType_pb2.py b/pyinfra/proto/NodeType_pb2.py new file mode 100644 index 0000000..05e6957 --- /dev/null +++ b/pyinfra/proto/NodeType_pb2.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# NO CHECKED-IN PROTOBUF GENCODE +# source: NodeType.proto +# Protobuf Python Version: 5.27.2 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import runtime_version as _runtime_version +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +_runtime_version.ValidateProtobufRuntimeVersion( + _runtime_version.Domain.PUBLIC, + 5, + 27, + 2, + '', + 'NodeType.proto' +) +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0eNodeType.proto*\x93\x01\n\x08NodeType\x12\x0c\n\x08\x44OCUMENT\x10\x00\x12\x0b\n\x07SECTION\x10\x01\x12\x11\n\rSUPER_SECTION\x10\x02\x12\x0c\n\x08HEADLINE\x10\x03\x12\r\n\tPARAGRAPH\x10\x04\x12\t\n\x05TABLE\x10\x05\x12\x0e\n\nTABLE_CELL\x10\x06\x12\t\n\x05IMAGE\x10\x07\x12\n\n\x06HEADER\x10\x08\x12\n\n\x06\x46OOTER\x10\tb\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'NodeType_pb2', _globals) +if not _descriptor._USE_C_DESCRIPTORS: + DESCRIPTOR._loaded_options = None + _globals['_NODETYPE']._serialized_start=19 + _globals['_NODETYPE']._serialized_end=166 +# @@protoc_insertion_point(module_scope) diff --git a/pyinfra/proto/__init__.py b/pyinfra/proto/__init__.py new file mode 100644 index 0000000..e69de29