Merge branch 'table_lines' into 'master'

feat: table line inference (experimental for deployment)

See merge request redactmanager/cv-analysis-service!10
This commit is contained in:
Julius Unverfehrt 2024-04-26 15:14:51 +02:00
commit f213a16cd0
18 changed files with 1469 additions and 828 deletions

View File

@ -5,3 +5,9 @@
port = 22
['remote "azure_remote"']
url = azure://cv-sa-dvc/
# SECURITY: a live storage AccountKey was committed here in plaintext — rotate the key and load it from an environment variable or secret store instead.
connection_string = "DefaultEndpointsProtocol=https;AccountName=cvsacricket;AccountKey=<REDACTED>;EndpointSuffix=core.windows.net"
['remote "local"']
url = ../dvc_local_remote

View File

@ -1,8 +1,59 @@
# cv-analysis — Visual (CV-Based) Document Parsing
parse_pdf()
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
previous redactions in documents.
## API
Input message:
```json
{
"targetFilePath": {
"pdf": "absolute file path",
"vlp_output": "absolute file path"
},
"responseFilePath": "absolute file path",
"operation": "table_image_inference"
}
```
Response is uploaded to the storage as specified in the `responseFilePath` field. The structure is as follows:
```json
{
...,
"data": [
{
      "pageNum": 0,
      "bbox": {
        "x1": 55.3407,
        "y1": 247.0246,
        "x2": 558.5602,
        "y2": 598.0585
      },
      "uuid": "2b10c1a2-393c-4fca-b9e3-0ad5b774ac84",
      "label": "table",
      "tableLines": [
        {
          "x1": 0,
          "y1": 16,
          "x2": 1399,
          "y2": 16
        },
        ...
      ],
      "imageInfo": {
        "height": 693,
        "width": 1414
      }
},
...
]
}
```
## Installation
```bash
@ -31,10 +82,9 @@ The below snippet shows how to find the outlines of previous redactions.
```python
from cv_analysis.redaction_detection import find_redactions
import pdf2image
import pdf2image
import numpy as np
pdf_path = ...
page_index = ...

61
flake.lock generated Normal file
View File

@ -0,0 +1,61 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1710146030,
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1711703276,
"narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

33
flake.nix Normal file
View File

@ -0,0 +1,33 @@
{
description = "A flake to use a Python poetry project in an FHS environment when poetry2nix is uncooperative";
inputs = {
flake-utils.url = "github:numtide/flake-utils";
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
};
outputs = {
self,
nixpkgs,
flake-utils,
}:
flake-utils.lib.eachDefaultSystem (system: let
pkgs = nixpkgs.legacyPackages.${system};
fhsEnv =
(pkgs.buildFHSUserEnv rec {
name = "cv-analysis-service";
targetPkgs = pkgs: (with pkgs; [
poppler_utils
zlib
poetry
libuuid
# add the system package here that are needed for the Python package dependencies
libz # needed for 'numpy'
]);
profile = ''
export LD_LIBRARY_PATH="/lib:$LD_LIBRARY_PATH:${pkgs.lib.makeLibraryPath [pkgs.libuuid]}"
poetry install # add --no-root here if this is just a metapackage
source "$(poetry env info --path)"/bin/activate
'';
})
.env;
in {devShells.default = fhsEnv;});
}

1574
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -25,10 +25,11 @@ coverage = "^5.5"
dependency-check = "^0.6.0"
lorem-text = "^2.1"
PyMuPDF = "^1.19.6"
pyinfra = { version = "^2.1.0", source = "gitlab-research" }
pyinfra = { version = "^2.2.0", source = "gitlab-research" }
kn-utils = { version = "0.2.7", source = "gitlab-research" }
pdf2img = { version = "0.7.0", source = "gitlab-red" }
dvc-azure = "^2.21.2"
# NOTE(review): this duplicates the `PyMuPDF = "^1.19.6"` constraint above —
# Poetry normalizes package names, so keep a single pymupdf entry.
pymupdf = "^1.24.1"
[tool.poetry.group.test.dependencies]
pytest = "^7.0.1"

29
scripts/parse_pdf.py Normal file
View File

@ -0,0 +1,29 @@
"""CLI utility: run table-line inference on a PDF and write an annotated copy.

Usage: python scripts/parse_pdf.py <pdf> <vlp_output.json> [--output /tmp/output.pdf]
"""
import argparse
import json

from cv_analysis.server.pipeline import make_image_analysis_pipeline
from cv_analysis.table_inference import infer_lines
from cv_analysis.utils.annotate import annotate_pdf


def parse_args():
    """Parse the command-line arguments for the script."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("pdf", type=str, help="Path to the PDF file")
    parser.add_argument("vlp_output", type=str, help="Path to the VLP output JSON file")
    parser.add_argument("--output", type=str, help="Path to the output PDF file", default="/tmp/output.pdf")
    return parser.parse_args()


def main():
    """Run the image-analysis pipeline and annotate the PDF with its results."""
    args = parse_args()
    pipe = make_image_analysis_pipeline(infer_lines)
    # Context managers close the handles (the original open() calls leaked them).
    with open(args.pdf, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    with open(args.vlp_output, "r") as vlp_file:
        vlp_output = json.load(vlp_file)
    results = list(pipe(data={"pdf": pdf_bytes, "vlp_output": vlp_output}))
    annotate_pdf(pdf_bytes, results, output_path=args.output)


if __name__ == "__main__":
    main()

View File

@ -1,6 +1,6 @@
import sys
from dataclasses import asdict
from operator import truth
from operator import itemgetter, truth
from typing import Generator, Callable
from funcy import flatten, lmap
from pdf2img.conversion import convert_pages_to_images
@ -8,7 +8,9 @@ from pdf2img.default_objects.image import ImageInfo, ImagePlus
from pdf2img.default_objects.rectangle import RectanglePlus
from cv_analysis.figure_detection.figure_detection import detect_figures
from cv_analysis.table_inference import infer_lines
from cv_analysis.table_parsing import parse_lines, parse_tables
from cv_analysis.utils.image_extraction import extract_images_from_pdf, transform_table_lines_by_page_info
from cv_analysis.utils.structures import Rectangle
@ -28,22 +30,45 @@ def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=Tru
skip_pages_without_images=table_parsing_skip_pages_without_images,
)
if operation == "figure":
return make_analysis_pipeline(
detect_figures, figure_detection_formatter, dpi=200
return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200)
if operation == "table_image_inference": # TODO: fix pyinfra input
return make_image_analysis_pipeline(
infer_lines,
)
else:
raise
# else:
# raise
def make_analysis_pipeline(
analysis_fn, formatter, dpi, skip_pages_without_images=False
):
def make_image_analysis_pipeline(
analysis_fn,
) -> Callable[[dict], Generator[dict, bytes, None]]:
def analyse_pipeline(data: dict) -> Generator[dict, bytes, None]:
pdf_bytes = data["pdf"]
vlp_output = data["vlp_output"]
images, info, page_info = extract_images_from_pdf(pdf_bytes, vlp_output)
# rel_bboxes = map()
img_results = lmap(analysis_fn, images)
def make_offsets():
...
offsets = map(itemgetter("x1", "y2"), map(itemgetter("bbox"), info))
# print("before", img_results)
img_results = lmap(transform_table_lines_by_page_info, img_results, offsets, page_info)
# print("after", img_results)
results = map(lambda i: info[i] | img_results[i], range(len(info)))
yield from results
return analyse_pipeline
def make_analysis_pipeline(analysis_fn, formatter, dpi, skip_pages_without_images=False):
def analyse_pipeline(pdf: bytes, index=None):
def parse_page(page: ImagePlus):
image = page.asarray()
rects = analysis_fn(image)
if not rects:
return
return None
infos = formatter(rects, page, dpi)
return infos
@ -66,9 +91,7 @@ def table_parsing_formatter(lines: list[dict[str, float]], page: ImagePlus, dpi)
def table_parsing_cells_formatter(rects, page: ImagePlus, dpi):
def format_rect(rect: Rectangle):
rect_plus = RectanglePlus.from_pixels(
*rect.xyxy(), page.info, alpha=False, dpi=dpi
)
rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
return rect_plus.asdict(derotate=True)
bboxes = lmap(format_rect, rects)
@ -78,11 +101,7 @@ def table_parsing_cells_formatter(rects, page: ImagePlus, dpi):
def figure_detection_formatter(rects, page, dpi):
def format_rect(rect: Rectangle):
rect_plus = RectanglePlus.from_pixels(
*rect.xyxy(), page.info, alpha=False, dpi=dpi
)
return asdict(
ImageInfo(page.info, rect_plus.asbbox(derotate=False), rect_plus.alpha)
)
rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
return asdict(ImageInfo(page.info, rect_plus.asbbox(derotate=False), rect_plus.alpha))
return lmap(format_rect, rects)

View File

@ -0,0 +1,208 @@
from operator import itemgetter
from pathlib import Path
from typing import Callable, Optional, Tuple
import cv2
import matplotlib.pyplot as plt
import numpy as np
from kn_utils.logging import logger
from numpy import ndarray as Array
from scipy.stats import norm
def show_multiple(arrs: Tuple[Array], title: str = ""):
    """Plot several 1-D arrays in one interactive matplotlib window (debug aid).

    The clf/cla/close calls reset matplotlib's implicit global figure state so
    leftovers from a previous plot don't bleed into this one.
    """
    plt.clf()
    plt.cla()
    plt.close()
    for a in arrs:
        plt.plot(a)
    plt.title(title)
    plt.show()
def show(arr: Array, title: str = ""):
    """Plot a single 1-D array in an interactive matplotlib window (debug aid).

    Resets matplotlib's global figure state first (clf/cla/close) so the plot
    starts from a clean figure.
    """
    plt.clf()
    plt.cla()
    plt.close()
    plt.plot(arr)
    plt.title(title)
    plt.show()
def save_plot(arr: Array, name: str, title: str = "") -> None:
    """Plot a 1-D array and save the figure as '<name>.png' (debug aid)."""
    plt.clf()
    plt.cla()
    plt.close()
    plt.plot(arr)
    plt.title(title)
    plt.savefig(Path(str(name) + ".png"))
def save_lines(img: Array, lines: list[dict[str, int]], output_path: str = "/tmp/lines.png") -> None:
    """Draw line segments onto a grayscale image and write it to disk (debug aid).

    Args:
        img: 2-D grayscale image; converted to RGB so the lines can be green.
        lines: dicts with integer "x1"/"y1"/"x2"/"y2" endpoints in pixel coords.
        output_path: destination file (the hard-coded "/tmp/lines.png" is now
            the backward-compatible default instead of being baked in).
    """
    canvas = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    unpack = itemgetter("x1", "y1", "x2", "y2")
    for line in lines:
        x1, y1, x2, y2 = unpack(line)
        canvas = cv2.line(canvas, (x1, y1), (x2, y2), color=(0, 255, 0), thickness=3)
    cv2.imwrite(output_path, canvas)
def make_gaussian_kernel(kernel_size: int, sd: float) -> Array:
    """Build a normalized 1-D Gaussian smoothing kernel.

    Even sizes are bumped to the next odd number so the kernel has a single
    well-defined center; the weights sum to 1.
    """
    if kernel_size % 2 == 0:
        kernel_size += 1
    half = (kernel_size - 1) // 2
    support = np.arange(-half, half + 1)
    weights = norm.pdf(support, scale=sd)
    return weights / np.sum(weights)
def make_gaussian_nonpositive_kernel(kernel_size: int, sd: float) -> Array:
    """Build a normalized 1-D Gaussian kernel.

    NOTE(review): this body is currently identical to make_gaussian_kernel and
    never produces non-positive weights despite its name — confirm whether
    negation/offsetting was intended, or delete the duplicate.
    """
    kernel_size += int(not kernel_size % 2)
    wing_size = int((kernel_size - 1) / 2)
    xvals = np.arange(-wing_size, wing_size + 1)
    kernel = norm.pdf(xvals, scale=sd)
    kernel /= np.sum(kernel)
    return kernel
def make_quadratic_kernel(kernel_size: int, ratio: float) -> Array:
    """Build a normalized symmetric kernel from an inverted parabola.

    Even sizes are bumped to the next odd number. The parabola -x**2 is shifted
    upward by span/(1 - ratio) (span = max - min of the raw values) before
    normalization, so `ratio` controls how peaked the kernel is.
    """
    if kernel_size % 2 == 0:
        kernel_size += 1
    half = (kernel_size - 1) // 2
    base = np.array([-float(x * x) for x in range(-half, half + 1)])
    span = np.max(base) - np.min(base)
    shifted = base + span / (1 - ratio)
    return shifted / np.sum(shifted)
def min_avg_for_interval(filtered: Array, interval: int) -> Tuple[float, int]:
    """Find the darkest strided subsequence of a filtered intensity profile.

    For each phase offset 0..interval-1, averages every `interval`-th value of
    `filtered` and returns the lowest average together with its offset.

    Args:
        filtered: 1-D smoothed intensity profile.
        interval: candidate line spacing in samples.

    Returns:
        (best_average, best_offset) — the old `-> float` annotation was wrong:
        this has always returned a 2-tuple.
    """
    n = len(filtered)
    avgs = [np.mean(filtered[range(start, n, interval)]) for start in range(interval)]
    best = min(avgs)
    return best, avgs.index(best)
def search_intervals(filtered: Array, min_interval: int, max_interval: int):
    """Scan candidate line spacings and return the best (interval, offset) pair.

    Evaluates every spacing in [min_interval, max_interval] with
    min_avg_for_interval and picks the one whose strided average is lowest.
    """
    candidates = []
    for interval in range(min_interval, max_interval + 1):
        avg, offset = min_avg_for_interval(filtered, interval)
        candidates.append((interval, avg, offset))
    winner = min(candidates, key=lambda item: item[1])
    return winner[0], winner[2]
def filter_array(
    array: Array,
    sum_filter: Optional[Array],
    padding: Optional[Array] = None,
    pad_value_function: Callable[[Array], float] = lambda x: 255.0,  # np.mean,
) -> Array:
    """Convolve a 1-D profile with a smoothing kernel, padding to keep its length.

    Args:
        array: 1-D intensity profile.
        sum_filter: odd-length kernel, or None to skip filtering (annotation
            fixed — the old signature claimed None was not allowed).
        padding: explicit pad values; derived from pad_value_function when None.
        pad_value_function: computes the constant pad value (defaults to white, 255).

    Returns:
        The filtered profile; same length as `array` when padding is derived.
    """
    if sum_filter is None:
        return array
    fsize = len(sum_filter)
    assert fsize % 2
    if padding is None:  # ensures that output size matches the input size
        pad = int((fsize - 1) / 2)
        padding = np.full(pad, pad_value_function(array))
    return np.convolve(np.concatenate((padding, array, padding)), sum_filter, "valid")
# Empirically tuned smoothing-kernel parameters (widths in pixels, Gaussian SDs)
# for the row/column intensity profiles; columns use wider kernels than rows.
ROW_FILTER1_WIDTH = 30
ROW_FILTER1_SD = 6
ROW_FILTER2_WIDTH = 20
ROW_FILTER2_SD = 4
COL_FILTER1_WIDTH = 90
COL_FILTER1_SD = 15
COL_FILTER2_WIDTH = 70
COL_FILTER2_SD = 12
COL_FILTER3_WIDTH = 200
COL_FILTER3_SD = 20
# Smoothing cascade applied in get_lines_either; a None entry means "skip this
# pass" (filter_array returns its input unchanged for a None kernel).
FILTERS = {
    "row": {
        1: make_gaussian_kernel(ROW_FILTER1_WIDTH, ROW_FILTER1_SD),
        2: make_gaussian_kernel(ROW_FILTER2_WIDTH, ROW_FILTER2_SD),
        3: None,
    },
    "col": {
        1: make_gaussian_kernel(COL_FILTER1_WIDTH, COL_FILTER1_SD),
        2: make_gaussian_kernel(COL_FILTER2_WIDTH, COL_FILTER2_SD),
        3: make_gaussian_kernel(COL_FILTER3_WIDTH, COL_FILTER3_SD),
    },
}
def filter_fp_col_lines(line_list: list[int], filt_sums: Array) -> list[int]:
    """Drop column-line candidates that are not prominent enough.

    A candidate (a local maximum of the filtered column profile) is kept only
    if it rises above the neighboring local minimum by more than one standard
    deviation of the whole profile.

    Args:
        line_list: candidate column positions (local maxima indices).
        filt_sums: the filtered column intensity profile.

    Returns:
        The surviving candidate positions.
    """
    if not line_list:
        return []
    # Local minima of the profile: the "valleys" between candidate lines.
    centers = list(
        np.where(
            (filt_sums[1:-1] < filt_sums[:-2]) * (filt_sums[1:-1] < filt_sums[2:])
        )[0]
        + 1
    )
    if not centers:
        # No valleys to measure prominence against (monotonic profile); the old
        # code crashed on centers[0] here. Keep all candidates instead.
        return line_list
    if line_list[0] > centers[0]:
        centers = centers[1:] + [len(filt_sums) - 1]
    mindiff = np.std(filt_sums)
    line_list = [
        maxidx
        for maxidx, minidx in zip(line_list, centers)
        if (filt_sums[maxidx] - filt_sums[minidx]) > mindiff
    ]
    return line_list
def get_lines_either(table_array: Array, horizontal=True) -> list:
    """Detect candidate ruling lines along one axis of a binarized table image.

    Args:
        table_array: 2-D grayscale/binarized table crop (0 = ink, 255 = background).
        horizontal: True to detect row lines, False for column lines.

    Returns:
        Pixel indices of detected lines — a list of ints, not an array as the
        old `-> Array` annotation claimed. (Also removed the unused local
        `filters = FILTERS`.)
    """
    key = "row" if horizontal else "col"
    # Mean intensity profile perpendicular to the searched line direction.
    sums = np.mean(table_array, axis=int(horizontal))
    threshold = 0.3 * 255  # np.mean(sums) - (1 + 2 * horizontal) * np.std(sums)
    # Positions darker than the threshold get boosted so they dominate nearby values.
    predicate = 1000.0 * (sums < threshold)
    sums = np.maximum(
        np.maximum(sums[1:-1], predicate[1:-1]),
        np.maximum(predicate[:-2], predicate[2:]),
    )
    filtered_sums = filter_array(sums, FILTERS[key][1])
    filtered_sums = filter_array(filtered_sums, FILTERS[key][2])
    filtered_sums = filter_array(filtered_sums, FILTERS[key][3])
    # Local maxima of the smoothed profile are the line candidates.
    lines = list(
        np.where(
            (filtered_sums[1:-1] > filtered_sums[:-2])
            * (filtered_sums[1:-1] > filtered_sums[2:])
        )[0]
        + 1
    )
    if not horizontal:
        lines = filter_fp_col_lines(lines, filtered_sums)
    return lines
def img_bytes_to_array(img_bytes: bytes) -> Array:
    """Decode an encoded image byte string into a grayscale numpy array."""
    buffer = np.frombuffer(img_bytes, np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_GRAYSCALE)
def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]:
    """Infer table ruling lines from a grayscale table-crop image.

    Binarizes the crop, detects row and column line positions, and returns
    them as full-width/full-height segments in image pixel coordinates.

    Returns:
        {"tableLines": [line dicts with x1/y1/x2/y2],
         "imageInfo": {"height": h, "width": w}}
    """
    import os  # local import: only needed for the debug switch below

    # Debug artifacts are opt-in now; the previous unconditional /tmp writes
    # ran on every request in production.
    debug = bool(os.environ.get("CV_ANALYSIS_DEBUG"))
    if debug:
        cv2.imwrite("/tmp/table.png", img)
    _, img = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY)
    if debug:
        cv2.imwrite("/tmp/table_bin.png", img)
    h, w = map(int, img.shape)
    row_vals = map(int, get_lines_either(img, horizontal=True))
    col_vals = map(int, get_lines_either(img, horizontal=False))
    lines = [{"x1": 0, "y1": r, "x2": w, "y2": r} for r in row_vals] + [
        {"x1": c, "y1": 0, "x2": c, "y2": h} for c in col_vals
    ]
    if debug:
        save_lines(img, lines)
    return {"tableLines": lines, "imageInfo": {"height": h, "width": w}}

View File

@ -189,6 +189,7 @@ def detect_endpoints(
points = points if points is not None else []
lines = list(map(lambda x: tuple(x[0]), points))
if not lines:
return lines
index = int(is_horizontal)

View File

@ -0,0 +1,67 @@
from functools import singledispatch
from operator import itemgetter
from pathlib import Path
from typing import Union
import fitz
from kn_utils.logging import logger
def annotate_pdf(
    pdf: Union[str, bytes, Path], annotations, output_path: Union[str, Path] = None
):
    """Draw box and table-line annotations onto a PDF and save the result.

    Args:
        pdf: the PDF as raw bytes or a filesystem path.
        annotations: per-page result dicts, each carrying a "pageNum" index.
        output_path: destination file; defaults to /tmp/annotated.pdf.
    """
    stream = provide_byte_stream(pdf)
    with fitz.open(stream=stream) as pdf_handle:
        for entry in annotations:
            annotate_page(pdf_handle[entry["pageNum"]], entry)
        output_path = output_path or "/tmp/annotated.pdf"
        pdf_handle.save(output_path)
        logger.info(f"Annotated PDF saved to {output_path}")
def annotate_page(page: fitz.Page, prediction):
    """Draw predicted boxes (with labels) and table lines onto one PDF page.

    Box coordinates are mirrored vertically because the prediction's y-axis
    runs opposite to the page's. Returns the same page for chaining.
    """
    page_height = page.bound().height
    for box in prediction.get("boxes", []):
        coords = itemgetter("x1", "y1", "x2", "y2")(box["box"])
        label, probability, uuid = itemgetter("label", "probability", "uuid")(box)
        x0, y0, x1, y1 = mirror_on_x_axis(coords, page_height)
        page.draw_rect(fitz.Rect(x0, y0, x1, y1), color=(0, 0, 1), width=2)
        # Place the label text just above the box's top-left corner.
        page.insert_text(
            (x0, y0 - 5),
            f"{label} ({probability:.2f}), {uuid}",
            fontsize=12,
            color=(0.4, 0.4, 1),
        )
    for line in prediction.get("tableLines", []):
        page.draw_line(
            itemgetter("x1", "y1")(line),
            itemgetter("x2", "y2")(line),
            color=(1, 0, 0.5),
            width=1,
        )
    return page
def mirror_on_x_axis(bbox, page_height):
    """Flip a bbox vertically within a page of the given height.

    Returns (x0, y0', x1, y1') where the y-values are re-measured from the
    opposite horizontal edge; x-values are unchanged.
    """
    x0, y0, x1, y1 = bbox
    return x0, page_height - y1, x1, page_height - y0
@singledispatch
def provide_byte_stream(pdf: Union[bytes, Path, str]) -> bytes:
    """Return the raw bytes of *pdf* (bytes pass through; paths are read).

    Raises:
        TypeError: for unsupported input types. The previous base case was a
            bare ``pass`` that silently returned None and failed much later
            with a confusing error downstream.
    """
    raise TypeError(f"Unsupported PDF source type: {type(pdf).__name__}")


@provide_byte_stream.register(bytes)
def _(pdf):
    # Already raw bytes; nothing to do.
    return pdf


@provide_byte_stream.register(str)
@provide_byte_stream.register(Path)
def _(pdf):
    # Treat the argument as a filesystem path and read the file contents.
    with open(pdf, "rb") as pdf_file:
        return pdf_file.read()

View File

@ -1,6 +1,11 @@
import os
import cv2
from matplotlib import pyplot as plt
# Opt-in terminal plotting backend for one developer's setup; use .get() so a
# missing USER environment variable (e.g. in containers/CI) no longer raises
# KeyError at import time.
if os.environ.get("USER") == "isaac":
    import matplotlib

    matplotlib.use('module://matplotlib-backend-wezterm')
def show_image_cv2(image, maxdim=700):
h, w, c = image.shape

View File

@ -0,0 +1,147 @@
from dataclasses import dataclass
from functools import partial
from operator import itemgetter
from typing import Iterable, Tuple
import fitz
import numpy as np
from funcy import compose, lfilter
from kn_utils.logging import logger
from numpy import ndarray as Array
@dataclass
class PageInfo:
    """Geometry and transform metadata for one rasterized PDF page."""

    # Zero-based page index within the source PDF.
    page_num: int
    # PyMuPDF matrix for the page's rotation (see page.rotation_matrix).
    rotation_matrix: fitz.Matrix
    # PyMuPDF matrix mapping between PDF space and pixmap space.
    transformation_matrix: fitz.Matrix
    # Resolution the page was rasterized at.
    dpi: int
    # Page size in PDF points (taken from page.rect by the caller).
    width: int | float
    height: int | float
    # Rendered pixmap size in pixels.
    image_width: int | float
    image_height: int | float
    # Page rotation in degrees as reported by PyMuPDF.
    rotation: int
def transform_image_coordinates_to_pdf_coordinates(
    bbox: Iterable[int | float],
    rotation_matrix: fitz.Matrix,
    transformation_matrix: fitz.Matrix,
    dpi: int | None = None,
) -> Tuple:
    """Map an image-space bbox into PDF coordinate space.

    Args:
        bbox: (x1, y1, x2, y2); pixel units when ``dpi`` is given, otherwise
            assumed to already be in PDF points.
        rotation_matrix: applied after the transformation matrix.
        transformation_matrix: pixmap-to-PDF transformation matrix.
        dpi: if set, pixels are first converted to points (72 points per inch).

    Returns:
        (x0, y0, x1, y1) in PDF points.
    """
    x1, y1, x2, y2 = (
        map(lambda x: (x / dpi) * 72, bbox) if dpi else bbox
    )  # Convert to points, can be done before
    rect = fitz.Rect(x1, y1, x2, y2)
    rect = rect * rotation_matrix * transformation_matrix
    return rect.x0, rect.y0, rect.x1, rect.y1
def rescale_to_pdf(bbox: Iterable[int | float], page_info: PageInfo) -> Iterable[float]:
pdf_h, pdf_w = page_info.height, page_info.width
if page_info.rotation in {90, 270}:
pdf_h, pdf_w = pdf_w, pdf_h
pix_h, pix_w = page_info.image_height, page_info.image_width
ratio_h, ratio_w = pdf_h / pix_h, pdf_w / pix_w
round3 = lambda x: tuple(map(lambda y: round(y, 3), x))
ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h = round3(
(ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h)
)
new_bbox = round3(
(bbox[0] * ratio_w, bbox[1] * ratio_h, bbox[2] * ratio_w, bbox[3] * ratio_h)
)
return new_bbox
def transform_table_lines_by_page_info(
    bboxes: dict, offsets: tuple, page_info: PageInfo
) -> dict:
    """Rescale a result's "tableLines" from table-crop pixels to page coordinates.

    Args:
        bboxes: analysis result dict; its "tableLines" entry is rewritten in place.
        offsets: (x, y) offset of the table crop within the page. NOTE(review):
            the caller derives the y component from the crop's y2 — confirm
            that is intended.
        page_info: page geometry used for pixel-to-point scaling.

    Returns:
        The same dict with "tableLines" replaced by the transformed line dicts.

    Cleanup: removed the mid-function ``import json``, the per-line debug-log
    loop, and the commented-out lfilter call.
    """
    transform = partial(rescale_to_pdf, page_info=page_info)
    logger.debug(f"{offsets=}")

    def apply_offsets(line: tuple) -> tuple:
        x1, y1, x2, y2 = line
        offset_x, offset_y = offsets
        # Flip the y offset: the page origin sits at the opposite edge.
        offset_y = page_info.height - offset_y
        return (x1 + offset_x, y1 + offset_y, x2 + offset_x, y2 + offset_y)

    unpack = itemgetter("x1", "y1", "x2", "y2")

    def convert(line: dict) -> dict:
        # unpack -> rescale -> shift into page position -> repack as a dict.
        x1, y1, x2, y2 = apply_offsets(transform(unpack(line)))
        return {"x1": x1, "y1": y1, "x2": x2, "y2": y2}

    bboxes["tableLines"] = [convert(line) for line in bboxes.get("tableLines", [])]
    return bboxes
def extract_images_from_pdf(
    pdf_bytes: bytes, vlp_output: dict, dpi: int = 200
) -> tuple[list[Array], list[dict], list[PageInfo]]:
    """Crop every VLP-detected table out of the PDF as a grayscale image.

    Args:
        pdf_bytes: raw PDF contents.
        vlp_output: VLP detection result; either the full response dict (with
            a "data" key) or the list of per-page dicts directly.
        dpi: rasterization resolution for both the page and the table crops.

    Returns:
        (table_images, table_info, page_info) — parallel lists with one entry
        per detected table box.
    """
    with fitz.open(stream=pdf_bytes) as fh:
        table_images = []
        table_info = []
        page_info = []
        vlp_output = vlp_output["data"] if isinstance(vlp_output, dict) else vlp_output
        for page_dict in vlp_output:
            page_num = int(page_dict["page_idx"])
            boxes = page_dict["boxes"]
            # Only table detections are cropped; other labels are ignored here.
            boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes)
            page = fh[page_num]
            page.wrap_contents()
            # BUGFIX: use the requested dpi — this was hard-coded to 200 and
            # silently ignored the parameter used for the crops below.
            page_image = page.get_pixmap(dpi=dpi)
            current_page_info = PageInfo(
                page_num,
                page.rotation_matrix,
                page.transformation_matrix,
                dpi,
                *page.rect[-2:],
                page_image.w,
                page_image.h,
                page.rotation,
            )
            for box_obj in boxes:
                bbox = box_obj["box"]
                x1, y1, x2, y2 = itemgetter("x1", "y1", "x2", "y2")(bbox)
                rect = fitz.Rect((x1, y1), (x2, y2))
                # FIXME: Check if de-rotation works as intended and is necessary at all.
                # Note that there exists also a derotation_matrix. If changing this, also change the
                # current_page_info object to include the derotation_matrix.
                rect = rect * page.transformation_matrix * page.rotation_matrix
                pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY)
                shape = (
                    (pixmap.h, pixmap.w, pixmap.n)
                    if pixmap.n > 1
                    else (pixmap.h, pixmap.w)
                )
                image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(*shape)
                table_images.append(image)
                table_info.append(
                    {
                        "pageNum": page_num,
                        "bbox": bbox,
                        "uuid": box_obj["uuid"],
                        "label": box_obj["label"],
                    }
                )
                page_info.append(current_page_info)
    return table_images, table_info, page_info

View File

@ -1,12 +1,11 @@
from numpy import array, ndarray
import pdf2image
from numpy import array, ndarray
from PIL import Image
from cv_analysis.utils.preprocessing import preprocess_page_array
def open_pdf(pdf, first_page=0, last_page=None):
first_page += 1
last_page = None if last_page is None else last_page + 1
@ -14,11 +13,17 @@ def open_pdf(pdf, first_page=0, last_page=None):
if pdf.lower().endswith((".png", ".jpg", ".jpeg")):
pages = [Image.open(pdf)]
elif pdf.lower().endswith(".pdf"):
pages = pdf2image.convert_from_path(pdf, first_page=first_page, last_page=last_page)
pages = pdf2image.convert_from_path(
pdf, first_page=first_page, last_page=last_page
)
else:
raise IOError("Invalid file extension. Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf")
raise IOError(
"Invalid file extension. Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf"
)
elif type(pdf) == bytes:
pages = pdf2image.convert_from_bytes(pdf, first_page=first_page, last_page=last_page)
pages = pdf2image.convert_from_bytes(
pdf, first_page=first_page, last_page=last_page
)
elif type(pdf) in {list, ndarray}:
return pdf

View File

@ -1,4 +1,5 @@
from sys import stdout
from typing import Union
from kn_utils.logging import logger
from pyinfra.examples import start_standard_queue_consumer
@ -17,7 +18,7 @@ logger.reconfigure(sink=stdout, level=settings.logging.level)
def make_dispatched_data_analysis(config):
skip_pages_without_images = config.table_parsing.skip_pages_without_images
def inner(data: bytes, message: dict) -> list:
def inner(data: Union[dict, bytes], message: dict) -> list:
operation = message["operation"]
analyse = get_analysis_pipeline(operation, skip_pages_without_images)
return list(analyse(data))

View File

@ -1,5 +1,5 @@
outs:
- md5: f74c866991f90b519dd334980ce0d495.dir
size: 2832497
nfiles: 21
- md5: d8630d20056547025abbabc895f6f62a.dir
size: 4715796
nfiles: 22
path: test_data

View File

@ -78,7 +78,7 @@ def formatter(operation):
raise
@pytest.mark.parametrize("operation", ["table_cells", "figure"])
@pytest.mark.parametrize("operation", ["figure"])
def test_analysis_pipeline(empty_pdf, formatter, expected_formatted_analysis_result):
analysis_pipeline = make_analysis_pipeline(
analysis_fn_mock, formatter, dpi=200, skip_pages_without_images=False

View File

@ -0,0 +1,24 @@
from cv_analysis.server.pipeline import make_image_analysis_pipeline
from cv_analysis.table_inference import infer_lines


def test_table_inference_smoke():
    """End-to-end smoke test: the pipeline runs on a real PDF with a mocked
    VLP table box and emits well-formed tableLines entries.

    The original version had every assertion commented out and guarded with
    ``if output:``, so it passed vacuously no matter what the pipeline did.
    """
    pl = make_image_analysis_pipeline(infer_lines)
    with open("test/test_data/article.pdf", "rb") as f:
        pdf_bytes = f.read()
    vlp_mock = {
        "data": [
            {
                "page_idx": 1,
                "boxes": [
                    {"uuid": "marius-marius-gib-mir-meine-legionen-wieder", "label": "table", "box": {"x1": 100, "y1": 100, "x2": 200, "y2": 200}}
                ],
            }
        ]
    }
    data = {"pdf": pdf_bytes, "vlp_output": vlp_mock}
    output = list(pl(data))
    assert output, "pipeline produced no results for the mocked table box"
    lines = output[0]["tableLines"]
    assert all(sorted(item.keys()) == ["x1", "x2", "y1", "y2"] for item in lines)