77 lines
2.8 KiB
Plaintext
77 lines
2.8 KiB
Plaintext
Processing service interface
|
|
|
|
image classification now : JSON (Mdat PDF) -> (Data PDF -> JSON [Mdat ImObj])
|
|
image classification future: JSON [Mdat FunkIm] | Mdat PDF -> (Data [FunkIm] -> JSON [Mdat FunkIm])
|
|
object detection : JSON [Mdat PagIm] | Mdat PDF -> (Data [PagIm] -> JSON [[Mdat SemIm]])
|
|
NER : JSON [Mdat Dict] -> (Data [Dict] -> JSON [Mdat])
|
|
table parsing : JSON [Mdat FunkIm] | Mdat PDF -> (Data [PagIm] -> JSON [[Mdat FunkIm]])
|
|
pdf2image : Mdat (fn, [Int], PDF) -> (JSON ([Int], Data PDF) -> [(FunkIm, Mdat)])
|
|
|
|
|
|
image classification now : Mdat (fn, [Int], file) -> (Data PDF -> JSON [Mdat ImObj])
|
|
image classification future: Mdat (fn, [Int], dir) -> (Data [FunkIm] -> JSON [Mdat FunkIm])
|
|
object detection : Mdat (fn, [Int], dir) -> (Data [PagIm] -> JSON [[Mdat SemIm]])
|
|
table parsing : Mdat (fn, [Int], dir) -> (Data [PagIm] -> JSON [[Mdat FunkIm]])
|
|
NER : Mdat (fn, [Int], file) -> (Data [Dict] -> JSON [Mdat])
|
|
pdf2image : Mdat (fn, [Int], file) -> (JSON ([Int], Data PDF) -> [(FunkIm, Mdat)])
|
|
|
|
|
|
from funcy import identity
|
|
|
|
access(mdat):
|
|
if mdat.path is file:
|
|
request = {"data": load(mdat.path), "metadata": mdat}
|
|
elif mdat.path is dir:
|
|
get_indexed = identity if not mdat.idx else itemgetter(*mdat.idx)
|
|
request = {"data": get_indexed(get_files(mdat.path)), "metadata": mdat}
|
|
else:
|
|
raise BadRequest
|
|
|
|
|
|
storage:
|
|
|
|
fileId: {
|
|
pages: [PagIm]
|
|
images: [FunkIm]
|
|
sections: gz
|
|
}
|
|
|
|
|
|
---------------
|
|
|
|
|
|
|
|
assert if targetPath is file then response list must be singleton
|
|
{index: [], dir: fileID.pdf.gz, targetPath: fileID.images.json.gz} -> [{data: pdf bytes, metadata: {request: ...}}] -> [{data: null, metadata: {request: null, response: {classification infos: ...}}}]
|
|
image classification now : Mdat (fn, [Int], file) -> [JSON (Data PDF, Mdat)] -> [JSON (Data null, Mdat [ImObj])] | 1 -> 1
|
|
assert if targetPath is file then response list must be singleton
|
|
{index: [], dir: fileID/images, targetPath: fileID.images.json.gz} -> [{data: image bytes, metadata: {request: {image location...}}}] -> [{data: null, metadata: {request: null, response: {classification infos: ...}}}]
|
|
image classification future: Mdat (fn, [Int], dir) -> JSON (Data [FunkIm], Mdat) -> [JSON (Data null, Mdat [FunkIm])] |
|
|
object detection : Mdat (fn, [Int], dir) -> (Data [PagIm] -> JSON [[Mdat SemIm]])
|
|
table parsing : Mdat (fn, [Int], dir) -> (Data [PagIm] -> JSON [[Mdat FunkIm]])
|
|
NER : Mdat (fn, [Int], file) -> (Data [Dict] -> JSON [Mdat])
|
|
pdf2image : Mdat (fn, [Int], file) -> (JSON ([Int], Data PDF) -> [(FunkIm, Mdat)])
|
|
|
|
aggregate <==> targetPath is file and index is empty
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|