Merge branch 'table_lines' into 'master'
feat: table line inference (experimental for deployment) See merge request redactmanager/cv-analysis-service!10
This commit is contained in:
commit
f213a16cd0
@ -5,3 +5,9 @@
|
||||
port = 22
|
||||
['remote "azure_remote"']
|
||||
url = azure://cv-sa-dvc/
|
||||
connection_string = "DefaultEndpointsProtocol=https;AccountName=cvsacricket;AccountKey=KOuTAQ6Mp00ePTT5ObYmgaHlxwS1qukY4QU4Kuk7gy/vldneA+ZiKjaOpEFtqKA6Mtym2gQz8THy+ASts/Y1Bw==;EndpointSuffix=core.windows.net"
|
||||
['remote "local"']
|
||||
url = ../dvc_local_remote
|
||||
|
||||
|
||||
|
||||
|
||||
56
README.md
56
README.md
@ -1,8 +1,59 @@
|
||||
# cv-analysis — Visual (CV-Based) Document Parsing
|
||||
|
||||
parse_pdf()
|
||||
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
|
||||
previous redactions in documents.
|
||||
|
||||
## API
|
||||
|
||||
Input message:
|
||||
|
||||
```json
|
||||
{
|
||||
"targetFilePath": {
|
||||
"pdf": "absolute file path",
|
||||
"vlp_output": "absolute file path"
|
||||
},
|
||||
"responseFilePath": "absolute file path",
|
||||
"operation": "table_image_inference"
|
||||
}
|
||||
```
|
||||
|
||||
Response is uploaded to the storage as specified in the `responseFilePath` field. The structure is as follows:
|
||||
|
||||
```json
|
||||
{
|
||||
...,
|
||||
"data": [
|
||||
    {
      "pageNum": 0,
      "bbox": {
        "x1": 55.3407,
        "y1": 247.0246,
        "x2": 558.5602,
        "y2": 598.0585
      },
      "uuid": "2b10c1a2-393c-4fca-b9e3-0ad5b774ac84",
      "label": "table",
      "tableLines": [
        {
          "x1": 0,
          "y1": 16,
          "x2": 1399,
          "y2": 16
        },
        ...
      ],
      "imageInfo": {
        "height": 693,
        "width": 1414
      }
    },
|
||||
...
|
||||
]
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
@ -31,10 +82,9 @@ The below snippet shows how to find the outlines of previous redactions.
|
||||
|
||||
```python
|
||||
from cv_analysis.redaction_detection import find_redactions
|
||||
import pdf2image
|
||||
import pdf2image
|
||||
import numpy as np
|
||||
|
||||
|
||||
pdf_path = ...
|
||||
page_index = ...
|
||||
|
||||
|
||||
61
flake.lock
generated
Normal file
61
flake.lock
generated
Normal file
@ -0,0 +1,61 @@
|
||||
{
|
||||
"nodes": {
|
||||
"flake-utils": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1710146030,
|
||||
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1711703276,
|
||||
"narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
33
flake.nix
Normal file
33
flake.nix
Normal file
@ -0,0 +1,33 @@
|
||||
{
|
||||
description = "A flake to use a Python poetry project in an FHS environment when poetry2nix is uncooperative";
|
||||
inputs = {
|
||||
flake-utils.url = "github:numtide/flake-utils";
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
|
||||
};
|
||||
outputs = {
|
||||
self,
|
||||
nixpkgs,
|
||||
flake-utils,
|
||||
}:
|
||||
flake-utils.lib.eachDefaultSystem (system: let
|
||||
pkgs = nixpkgs.legacyPackages.${system};
|
||||
fhsEnv =
|
||||
(pkgs.buildFHSUserEnv rec {
|
||||
name = "cv-analysis-service";
|
||||
targetPkgs = pkgs: (with pkgs; [
|
||||
poppler_utils
|
||||
zlib
|
||||
poetry
|
||||
libuuid
|
||||
# add the system packages here that are needed for the Python package dependencies
|
||||
libz # needed for 'numpy'
|
||||
]);
|
||||
profile = ''
|
||||
export LD_LIBRARY_PATH="/lib:$LD_LIBRARY_PATH:${pkgs.lib.makeLibraryPath [pkgs.libuuid]}"
|
||||
poetry install # add --no-root here if this is just a metapackage
|
||||
source "$(poetry env info --path)"/bin/activate
|
||||
'';
|
||||
})
|
||||
.env;
|
||||
in {devShells.default = fhsEnv;});
|
||||
}
|
||||
1574
poetry.lock
generated
1574
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -25,10 +25,11 @@ coverage = "^5.5"
|
||||
dependency-check = "^0.6.0"
|
||||
lorem-text = "^2.1"
|
||||
PyMuPDF = "^1.19.6"
|
||||
pyinfra = { version = "^2.1.0", source = "gitlab-research" }
|
||||
pyinfra = { version = "^2.2.0", source = "gitlab-research" }
|
||||
kn-utils = { version = "0.2.7", source = "gitlab-research" }
|
||||
pdf2img = { version = "0.7.0", source = "gitlab-red" }
|
||||
dvc-azure = "^2.21.2"
|
||||
pymupdf = "^1.24.1"
|
||||
|
||||
[tool.poetry.group.test.dependencies]
|
||||
pytest = "^7.0.1"
|
||||
|
||||
29
scripts/parse_pdf.py
Normal file
29
scripts/parse_pdf.py
Normal file
@ -0,0 +1,29 @@
|
||||
import json
|
||||
|
||||
from cv_analysis.server.pipeline import make_image_analysis_pipeline
|
||||
from cv_analysis.table_inference import infer_lines
|
||||
from cv_analysis.utils.annotate import annotate_pdf
|
||||
|
||||
pipe = make_image_analysis_pipeline(infer_lines)
|
||||
|
||||
|
||||
def parse_args():
    """Parse the command line of the script.

    Returns:
        The ``argparse.Namespace`` with the attributes ``pdf``, ``vlp_output`` and
        ``output`` (the latter defaults to ``/tmp/output.pdf``).
    """
    import argparse

    cli = argparse.ArgumentParser()
    positionals = (
        ("pdf", "Path to the PDF file"),
        ("vlp_output", "Path to the VLP output JSON file"),
    )
    for arg_name, description in positionals:
        cli.add_argument(arg_name, type=str, help=description)
    cli.add_argument("--output", default="/tmp/output.pdf", type=str, help="Path to the output PDF file")
    return cli.parse_args()
|
||||
|
||||
|
||||
args = parse_args()

# Read both inputs through context managers so the file handles are closed
# deterministically instead of being left to the garbage collector.
with open(args.pdf, "rb") as pdf_file:
    pdf_bytes = pdf_file.read()
with open(args.vlp_output, "r") as vlp_file:
    vlp_output = json.load(vlp_file)

# The pipeline is a generator; materialise it because the results are iterated
# by annotate_pdf below.
best_result = list(pipe(data={"pdf": pdf_bytes, "vlp_output": vlp_output}))

# Draw the inferred table lines into a copy of the PDF written to args.output.
annotate_pdf(pdf_bytes, best_result, output_path=args.output)
|
||||
@ -1,6 +1,6 @@
|
||||
import sys
|
||||
from dataclasses import asdict
|
||||
from operator import truth
|
||||
from operator import itemgetter, truth
|
||||
from typing import Generator, Callable
|
||||
|
||||
from funcy import flatten, lmap
|
||||
from pdf2img.conversion import convert_pages_to_images
|
||||
@ -8,7 +8,9 @@ from pdf2img.default_objects.image import ImageInfo, ImagePlus
|
||||
from pdf2img.default_objects.rectangle import RectanglePlus
|
||||
|
||||
from cv_analysis.figure_detection.figure_detection import detect_figures
|
||||
from cv_analysis.table_inference import infer_lines
|
||||
from cv_analysis.table_parsing import parse_lines, parse_tables
|
||||
from cv_analysis.utils.image_extraction import extract_images_from_pdf, transform_table_lines_by_page_info
|
||||
from cv_analysis.utils.structures import Rectangle
|
||||
|
||||
|
||||
@ -28,22 +30,45 @@ def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=Tru
|
||||
skip_pages_without_images=table_parsing_skip_pages_without_images,
|
||||
)
|
||||
if operation == "figure":
|
||||
return make_analysis_pipeline(
|
||||
detect_figures, figure_detection_formatter, dpi=200
|
||||
return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200)
|
||||
if operation == "table_image_inference": # TODO: fix pyinfra input
|
||||
return make_image_analysis_pipeline(
|
||||
infer_lines,
|
||||
)
|
||||
else:
|
||||
raise
|
||||
# else:
|
||||
# raise
|
||||
|
||||
|
||||
def make_analysis_pipeline(
|
||||
analysis_fn, formatter, dpi, skip_pages_without_images=False
|
||||
):
|
||||
def make_image_analysis_pipeline(
    analysis_fn,
) -> Callable[[dict], Generator[dict, bytes, None]]:
    """Build a pipeline that runs ``analysis_fn`` on every table image of a PDF.

    Args:
        analysis_fn: Callable applied to each extracted table image; its result is
            handed to ``transform_table_lines_by_page_info`` and must be a dict.

    Returns:
        A callable taking a dict with the keys ``"pdf"`` and ``"vlp_output"``. It
        yields one dict per extracted table image: the table's metadata merged with
        the transformed analysis result (analysis keys win on conflicts).
    """

    def analyse_pipeline(data: dict) -> Generator[dict, bytes, None]:
        pdf_bytes = data["pdf"]
        vlp_output = data["vlp_output"]
        images, info, page_info = extract_images_from_pdf(pdf_bytes, vlp_output)
        img_results = lmap(analysis_fn, images)

        # One (x1, y2) pair per table, taken from the bbox stored in its metadata.
        offsets = map(itemgetter("x1", "y2"), map(itemgetter("bbox"), info))
        img_results = lmap(transform_table_lines_by_page_info, img_results, offsets, page_info)

        # info and img_results are index-aligned (one entry per table image).
        yield from (table_info | result for table_info, result in zip(info, img_results))

    return analyse_pipeline
|
||||
|
||||
|
||||
def make_analysis_pipeline(analysis_fn, formatter, dpi, skip_pages_without_images=False):
|
||||
def analyse_pipeline(pdf: bytes, index=None):
|
||||
def parse_page(page: ImagePlus):
|
||||
image = page.asarray()
|
||||
rects = analysis_fn(image)
|
||||
if not rects:
|
||||
return
|
||||
return None
|
||||
infos = formatter(rects, page, dpi)
|
||||
return infos
|
||||
|
||||
@ -66,9 +91,7 @@ def table_parsing_formatter(lines: list[dict[str, float]], page: ImagePlus, dpi)
|
||||
|
||||
def table_parsing_cells_formatter(rects, page: ImagePlus, dpi):
|
||||
def format_rect(rect: Rectangle):
|
||||
rect_plus = RectanglePlus.from_pixels(
|
||||
*rect.xyxy(), page.info, alpha=False, dpi=dpi
|
||||
)
|
||||
rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
|
||||
return rect_plus.asdict(derotate=True)
|
||||
|
||||
bboxes = lmap(format_rect, rects)
|
||||
@ -78,11 +101,7 @@ def table_parsing_cells_formatter(rects, page: ImagePlus, dpi):
|
||||
|
||||
def figure_detection_formatter(rects, page, dpi):
    """Convert detected rectangles on ``page`` into a list of ``ImageInfo`` dicts."""

    def format_rect(rect: Rectangle):
        # Build the page-aware rectangle from the pixel coordinates of the detection.
        located = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
        image_info = ImageInfo(page.info, located.asbbox(derotate=False), located.alpha)
        return asdict(image_info)

    return [format_rect(rect) for rect in rects]
|
||||
|
||||
208
src/cv_analysis/table_inference.py
Normal file
208
src/cv_analysis/table_inference.py
Normal file
@ -0,0 +1,208 @@
|
||||
from operator import itemgetter
|
||||
from pathlib import Path
|
||||
from typing import Callable, Optional, Tuple
|
||||
|
||||
import cv2
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from kn_utils.logging import logger
|
||||
from numpy import ndarray as Array
|
||||
from scipy.stats import norm
|
||||
|
||||
|
||||
def show_multiple(arrs: Tuple[Array], title: str = ""):
    """Plot every array in ``arrs`` as a curve, set ``title`` and display the plot."""
    # Reset pyplot state left over from earlier plots before drawing.
    for reset in (plt.clf, plt.cla, plt.close):
        reset()
    for curve in arrs:
        plt.plot(curve)
    plt.title(title)
    plt.show()
|
||||
|
||||
|
||||
def show(arr: Array, title: str = ""):
    """Plot ``arr`` with the given ``title`` and display the plot."""
    # Reset pyplot state left over from earlier plots before drawing.
    for reset in (plt.clf, plt.cla, plt.close):
        reset()
    plt.plot(arr)
    plt.title(title)
    plt.show()
|
||||
|
||||
|
||||
def save_plot(arr: Array, name: str, title: str = "") -> None:
    """Plot ``arr`` with ``title`` and save the figure as ``<name>.png``."""
    # Reset pyplot state left over from earlier plots before drawing.
    for reset in (plt.clf, plt.cla, plt.close):
        reset()
    plt.plot(arr)
    plt.title(title)
    target = Path(f"{name}.png")
    plt.savefig(target)
|
||||
|
||||
|
||||
def save_lines(img: Array, lines: list[dict[str, int]]) -> None:
    """Draw ``lines`` in green onto an RGB conversion of ``img`` and write it to /tmp/lines.png.

    Each line is a dict with the keys ``x1``, ``y1``, ``x2`` and ``y2``.
    """
    canvas = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    for line in lines:
        start = (line["x1"], line["y1"])
        end = (line["x2"], line["y2"])
        canvas = cv2.line(canvas, start, end, color=(0, 255, 0), thickness=3)
    cv2.imwrite("/tmp/lines.png", canvas)
|
||||
|
||||
|
||||
def make_gaussian_kernel(kernel_size: int, sd: float) -> Array:
    """Return a 1-D Gaussian kernel whose taps sum to one.

    An even ``kernel_size`` is increased by one so the kernel is symmetric around a
    centre tap. ``sd`` is the standard deviation in taps.
    """
    if kernel_size % 2 == 0:
        kernel_size += 1
    half_width = int((kernel_size - 1) // 2)
    taps = np.arange(-half_width, half_width + 1)
    weights = norm.pdf(taps, scale=sd)
    weights /= np.sum(weights)
    return weights
|
||||
|
||||
|
||||
def make_gaussian_nonpositive_kernel(kernel_size: int, sd: float) -> Array:
    """Return a 1-D Gaussian kernel of odd length whose taps sum to one.

    NOTE(review): despite the name, the result contains only positive weights and the
    computation is the same as in ``make_gaussian_kernel`` - confirm whether a
    different kernel was intended.
    """
    odd_size = kernel_size + int(not kernel_size % 2)
    reach = int((odd_size - 1) // 2)
    positions = np.arange(-reach, reach + 1)
    density = norm.pdf(positions, scale=sd)
    density /= np.sum(density)
    return density
|
||||
|
||||
|
||||
def make_quadratic_kernel(kernel_size: int, ratio: float) -> Array:
    """Return an inverted-parabola kernel of odd length whose taps sum to one.

    The raw taps are ``-x**2`` over the symmetric tap offsets. They are lifted by
    ``(max - min) / (1 - ratio)`` and then divided by their sum.
    """
    if kernel_size % 2 == 0:
        kernel_size += 1
    half_width = int((kernel_size - 1) // 2)
    offsets = np.arange(-half_width, half_width + 1)
    kernel = (-(offsets**2)).astype(float)
    spread = np.max(kernel) - np.min(kernel)
    kernel += spread / (1 - ratio)
    kernel /= np.sum(kernel)
    return kernel
|
||||
|
||||
|
||||
def min_avg_for_interval(filtered: Array, interval: int) -> Tuple[float, int]:
    """Find the start offset whose regularly spaced samples have the lowest mean.

    For every start offset in ``range(interval)`` the mean of
    ``filtered[start::interval]`` is computed.

    Args:
        filtered: 1-D signal to sample.
        interval: Sampling step; also the number of start offsets tried.

    Returns:
        ``(best_mean, best_start)``: the lowest mean and the start offset that
        produced it (the first such offset on ties).

    Raises:
        ValueError: If ``interval`` is not positive, because there is no offset to try.
    """
    # A strided slice selects the same samples as indexing with
    # range(start, len(filtered), interval) without building an index per offset.
    avgs = [np.mean(filtered[start::interval]) for start in range(interval)]
    best = min(avgs)
    return best, avgs.index(best)
|
||||
|
||||
|
||||
def search_intervals(filtered: Array, min_interval: int, max_interval: int):
    """Pick the sampling interval with the lowest ``min_avg_for_interval`` score.

    Every interval in ``[min_interval, max_interval]`` is scored; the lowest score
    wins (the smallest interval on ties).

    Returns:
        ``(interval, start)``: the winning interval and its best start offset.
    """
    candidates = []
    for interval in range(min_interval, max_interval + 1):
        score, start = min_avg_for_interval(filtered, interval)
        candidates.append((interval, score, start))
    winner_interval, _, winner_start = min(candidates, key=itemgetter(1))
    return winner_interval, winner_start
|
||||
|
||||
|
||||
def filter_array(
    array: Array,
    sum_filter: Optional[Array],
    padding: Optional[Array] = None,
    pad_value_function: Callable[[Array], float] = lambda x: 255.0,
) -> Array:
    """Convolve a 1-D ``array`` with ``sum_filter``, keeping the input length.

    Args:
        array: 1-D signal to smooth.
        sum_filter: Kernel with an odd number of taps, or ``None`` to skip filtering.
        padding: Values put on both ends of ``array`` before convolving. When omitted,
            ``(len(sum_filter) - 1) // 2`` copies of ``pad_value_function(array)`` are
            used, which makes the output as long as the input.
        pad_value_function: Computes the pad value from ``array``; the default always
            yields ``255.0``.

    Returns:
        ``array`` itself when ``sum_filter`` is ``None``, otherwise the "valid" part of
        the convolution of the padded signal.
    """
    if sum_filter is None:
        return array
    fsize = len(sum_filter)
    assert fsize % 2, "sum_filter must have an odd number of taps"
    if padding is None:
        # Half the kernel width on each side makes the "valid" output match the input size.
        pad = (fsize - 1) // 2
        padding = np.full(pad, pad_value_function(array))

    return np.convolve(np.concatenate((padding, array, padding)), sum_filter, "valid")
|
||||
|
||||
|
||||
# Width and standard deviation of the Gaussian kernels applied to the row and column
# profiles. make_gaussian_kernel bumps even widths to the next odd number.
ROW_FILTER1_WIDTH, ROW_FILTER1_SD = 30, 6
ROW_FILTER2_WIDTH, ROW_FILTER2_SD = 20, 4
COL_FILTER1_WIDTH, COL_FILTER1_SD = 90, 15
COL_FILTER2_WIDTH, COL_FILTER2_SD = 70, 12
COL_FILTER3_WIDTH, COL_FILTER3_SD = 200, 20

# Kernels per direction, keyed by filter stage; a None kernel makes filter_array
# return its input unchanged.
FILTERS = {
    "row": {
        1: make_gaussian_kernel(ROW_FILTER1_WIDTH, ROW_FILTER1_SD),
        2: make_gaussian_kernel(ROW_FILTER2_WIDTH, ROW_FILTER2_SD),
        3: None,
    },
    "col": {
        1: make_gaussian_kernel(COL_FILTER1_WIDTH, COL_FILTER1_SD),
        2: make_gaussian_kernel(COL_FILTER2_WIDTH, COL_FILTER2_SD),
        3: make_gaussian_kernel(COL_FILTER3_WIDTH, COL_FILTER3_SD),
    },
}
|
||||
|
||||
|
||||
def filter_fp_col_lines(line_list: list[int], filt_sums: Array) -> list[int]:
    """Drop candidate column lines whose peak does not stand out enough.

    Args:
        line_list: Indices of candidate lines (local maxima) in ``filt_sums``.
        filt_sums: The filtered 1-D profile the candidates were taken from.

    Each candidate is paired, in order, with a strict local minimum of ``filt_sums``.
    If the first minimum lies before the first candidate it is skipped and the last
    index of ``filt_sums`` is appended as the final reference. A candidate is kept when
    it exceeds its reference by more than the standard deviation of ``filt_sums``.

    Returns:
        The retained candidate indices. When ``filt_sums`` has no strict local minimum
        there is nothing to compare against and the candidates are returned unchanged.
    """
    if not line_list:
        return []

    interior = filt_sums[1:-1]
    is_minimum = (interior < filt_sums[:-2]) & (interior < filt_sums[2:])
    centers = list(np.where(is_minimum)[0] + 1)

    if not centers:
        # A single peak or flat valleys yield no strict minimum; indexing centers[0]
        # below would raise IndexError.
        return list(line_list)

    if line_list[0] > centers[0]:
        centers = centers[1:] + [len(filt_sums) - 1]
    mindiff = np.std(filt_sums)
    # NOTE(review): zip stops at the shorter sequence, so a trailing candidate without
    # a matching reference is dropped - confirm this is intended.
    return [
        maxidx
        for maxidx, minidx in zip(line_list, centers)
        if (filt_sums[maxidx] - filt_sums[minidx]) > mindiff
    ]
|
||||
|
||||
|
||||
def get_lines_either(table_array: Array, horizontal=True) -> list[int]:
    """Locate candidate row or column separator positions in a table image.

    Args:
        table_array: 2-D image array of the table.
        horizontal: ``True`` to look for row positions (mean over axis 1), ``False``
            for column positions (mean over axis 0).

    Returns:
        Indices of the strict local maxima of the smoothed profile. For columns the
        candidates are additionally passed through ``filter_fp_col_lines``.
    """
    key = "row" if horizontal else "col"

    sums = np.mean(table_array, axis=int(horizontal))
    threshold = 0.3 * 255
    # Profile entries below the threshold are marked with 1000; taking the maximum
    # with both neighbours spreads each mark by one position to either side.
    predicate = 1000.0 * (sums < threshold)
    # NOTE(review): this profile is two entries shorter than the image dimension, so
    # index j here corresponds to position j + 1 in the image. The returned indices are
    # not shifted back - confirm whether the one-pixel offset matters.
    sums = np.maximum(
        np.maximum(sums[1:-1], predicate[1:-1]),
        np.maximum(predicate[:-2], predicate[2:]),
    )

    filtered_sums = sums
    for stage in (1, 2, 3):
        filtered_sums = filter_array(filtered_sums, FILTERS[key][stage])

    interior = filtered_sums[1:-1]
    is_peak = (interior > filtered_sums[:-2]) & (interior > filtered_sums[2:])
    lines = list(np.where(is_peak)[0] + 1)
    if not horizontal:
        lines = filter_fp_col_lines(lines, filtered_sums)

    return lines
|
||||
|
||||
|
||||
def img_bytes_to_array(img_bytes: bytes) -> Array:
    """Decode encoded image bytes into a grayscale array using OpenCV."""
    raw = np.frombuffer(img_bytes, np.uint8)
    return cv2.imdecode(raw, cv2.IMREAD_GRAYSCALE)
|
||||
|
||||
|
||||
def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]:
    """Infer table grid lines from a table image.

    The image is binarised (values above 220 become 255, all others 0). Row and column
    positions come from ``get_lines_either``; horizontal lines span the full image
    width and vertical lines the full image height.

    Side effects: writes ``/tmp/table.png`` and ``/tmp/table_bin.png``, and calls
    ``save_lines`` with the binarised image and the inferred lines, on every call.

    Returns:
        A dict with ``"tableLines"`` (dicts with ``x1``, ``y1``, ``x2``, ``y2`` in image
        pixels) and ``"imageInfo"`` (``height`` and ``width`` of the image).
    """
    cv2.imwrite("/tmp/table.png", img)
    _, binary = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY)
    cv2.imwrite("/tmp/table_bin.png", binary)
    height, width = (int(dim) for dim in binary.shape)

    row_positions = get_lines_either(binary, horizontal=True)
    col_positions = get_lines_either(binary, horizontal=False)

    lines = []
    for row in row_positions:
        lines.append({"x1": 0, "y1": int(row), "x2": width, "y2": int(row)})
    for col in col_positions:
        lines.append({"x1": int(col), "y1": 0, "x2": int(col), "y2": height})

    save_lines(binary, lines)

    return {"tableLines": lines, "imageInfo": {"height": height, "width": width}}
|
||||
@ -189,6 +189,7 @@ def detect_endpoints(
|
||||
points = points if points is not None else []
|
||||
|
||||
lines = list(map(lambda x: tuple(x[0]), points))
|
||||
|
||||
if not lines:
|
||||
return lines
|
||||
index = int(is_horizontal)
|
||||
|
||||
67
src/cv_analysis/utils/annotate.py
Normal file
67
src/cv_analysis/utils/annotate.py
Normal file
@ -0,0 +1,67 @@
|
||||
from functools import singledispatch
|
||||
from operator import itemgetter
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
import fitz
|
||||
from kn_utils.logging import logger
|
||||
|
||||
|
||||
def annotate_pdf(
    pdf: Union[str, bytes, Path], annotations, output_path: Union[str, Path] = None
):
    """Draw ``annotations`` into a PDF and save the result.

    Args:
        pdf: PDF content as bytes, or a path to the PDF file.
        annotations: Iterable of dicts; each needs a ``"pageNum"`` page index and is
            passed to ``annotate_page`` for that page.
        output_path: Where to save the annotated PDF; falls back to
            ``/tmp/annotated.pdf`` when empty.
    """
    with fitz.open(stream=provide_byte_stream(pdf)) as document:
        for entry in annotations:
            target_page = document[entry["pageNum"]]
            annotate_page(target_page, entry)
        if not output_path:
            output_path = "/tmp/annotated.pdf"
        document.save(output_path)
        logger.info(f"Annotated PDF saved to {output_path}")
|
||||
|
||||
|
||||
def annotate_page(page: fitz.Page, prediction):
    """Draw the boxes and table lines of ``prediction`` onto ``page``.

    Entries of ``prediction["boxes"]`` are drawn as blue rectangles, mirrored via
    ``mirror_on_x_axis``, with a "label (probability), uuid" caption 5 units above the
    top-left corner. Entries of ``prediction["tableLines"]`` are drawn as lines using
    their coordinates as given. Both keys are optional.

    Returns:
        The same ``page`` object.
    """
    for box in prediction.get("boxes", []):
        coords = box["box"]
        corners = (coords["x1"], coords["y1"], coords["x2"], coords["y2"])
        label, probability, uuid = box["label"], box["probability"], box["uuid"]

        x0, y0, x1, y1 = mirror_on_x_axis(corners, page.bound().height)
        page.draw_rect(fitz.Rect(x0, y0, x1, y1), color=(0, 0, 1), width=2)
        page.insert_text(
            (x0, y0 - 5),
            f"{label} ({probability:.2f}), {uuid}",
            fontsize=12,
            color=(0.4, 0.4, 1),
        )

    for line in prediction.get("tableLines", []):
        page.draw_line((line["x1"], line["y1"]), (line["x2"], line["y2"]), color=(1, 0, 0.5), width=1)

    return page
|
||||
|
||||
|
||||
def mirror_on_x_axis(bbox, page_height):
    """Flip a box vertically within a page of height ``page_height``.

    ``bbox`` is ``(x0, y0, x1, y1)``. The x coordinates are kept; the result is
    ``(x0, page_height - y1, x1, page_height - y0)``.
    """
    left, top, right, bottom = bbox
    return left, page_height - bottom, right, page_height - top
|
||||
|
||||
|
||||
@singledispatch
def provide_byte_stream(pdf: Union[bytes, Path, str]) -> bytes:
    """Return the raw bytes of a PDF given as bytes or as a file path.

    Dispatches on the type of ``pdf``: ``bytes`` are returned as they are, ``str`` and
    ``Path`` are treated as a file path and read in binary mode.

    Raises:
        TypeError: For any other input type. The fallback used to return ``None``
            silently, which only surfaced later as an unrelated error in the caller.
    """
    raise TypeError(f"Cannot provide a byte stream for object of type {type(pdf).__name__}")


@provide_byte_stream.register(bytes)
def _(pdf):
    # Already raw bytes: nothing to load.
    return pdf


@provide_byte_stream.register(str)
@provide_byte_stream.register(Path)
def _(pdf):
    # A path: read the whole file in binary mode.
    with open(pdf, "rb") as pdf_file:
        return pdf_file.read()
|
||||
@ -1,6 +1,11 @@
|
||||
import os
|
||||
import cv2
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
# Developer-specific plotting backend, only enabled for the user "isaac".
# os.environ.get is used because USER is not guaranteed to be set; indexing
# os.environ directly would raise KeyError while this module is imported.
if os.environ.get("USER") == "isaac":
    import matplotlib

    matplotlib.use("module://matplotlib-backend-wezterm")
|
||||
|
||||
|
||||
def show_image_cv2(image, maxdim=700):
|
||||
h, w, c = image.shape
|
||||
|
||||
147
src/cv_analysis/utils/image_extraction.py
Normal file
147
src/cv_analysis/utils/image_extraction.py
Normal file
@ -0,0 +1,147 @@
|
||||
from dataclasses import dataclass
|
||||
from functools import partial
|
||||
from operator import itemgetter
|
||||
from typing import Iterable, Tuple
|
||||
|
||||
import fitz
|
||||
import numpy as np
|
||||
from funcy import compose, lfilter
|
||||
from kn_utils.logging import logger
|
||||
from numpy import ndarray as Array
|
||||
|
||||
|
||||
@dataclass
class PageInfo:
    """Geometry of a PDF page together with the resolution it was rendered at."""

    # Index of the page in the document.
    page_num: int
    # The page's rotation and transformation matrices as reported by PyMuPDF.
    rotation_matrix: fitz.Matrix
    transformation_matrix: fitz.Matrix
    # Resolution recorded for the rendering.
    dpi: int
    # Page size, taken from the page rectangle.
    width: int | float
    height: int | float
    # Size of the rendered page image in pixels.
    image_width: int | float
    image_height: int | float
    # Page rotation; rescale_to_pdf swaps width and height for 90 and 270.
    rotation: int
|
||||
|
||||
|
||||
def transform_image_coordinates_to_pdf_coordinates(
    bbox: Iterable[int | float],
    rotation_matrix: fitz.Matrix,
    transformation_matrix: fitz.Matrix,
    dpi: int = None,
) -> Tuple:
    """Map a box through the page's rotation and transformation matrices.

    Args:
        bbox: ``(x1, y1, x2, y2)``.
        rotation_matrix: Applied to the rectangle first.
        transformation_matrix: Applied after the rotation.
        dpi: When given (and non-zero), the coordinates are first converted with
            ``value / dpi * 72``; otherwise they are used as they are.

    Returns:
        ``(x0, y0, x1, y1)`` of the transformed rectangle.
    """
    if dpi:
        bbox = [(value / dpi) * 72 for value in bbox]
    x1, y1, x2, y2 = bbox
    transformed = fitz.Rect(x1, y1, x2, y2) * rotation_matrix * transformation_matrix

    return transformed.x0, transformed.y0, transformed.x1, transformed.y1
|
||||
|
||||
|
||||
def rescale_to_pdf(bbox: Iterable[int | float], page_info: PageInfo) -> Iterable[float]:
    """Scale a box from rendered-image pixels to PDF page units.

    The per-axis scale is the page size divided by the image size, rounded to three
    decimals; page width and height are swapped for rotations of 90 and 270. The
    scaled coordinates are rounded to three decimals as well.

    Args:
        bbox: Indexable ``(x1, y1, x2, y2)`` in image pixels.
        page_info: Provides ``width``, ``height``, ``rotation``, ``image_width`` and
            ``image_height``.

    Returns:
        The scaled ``(x1, y1, x2, y2)`` as a tuple.
    """
    if page_info.rotation in {90, 270}:
        pdf_w, pdf_h = page_info.height, page_info.width
    else:
        pdf_w, pdf_h = page_info.width, page_info.height

    scale_y = round(pdf_h / page_info.image_height, 3)
    scale_x = round(pdf_w / page_info.image_width, 3)

    # x coordinates sit at even positions of the bbox, y coordinates at odd ones.
    scales = (scale_x, scale_y, scale_x, scale_y)
    return tuple(round(bbox[position] * scale, 3) for position, scale in enumerate(scales))
|
||||
|
||||
|
||||
def transform_table_lines_by_page_info(
    bboxes: dict, offsets: tuple, page_info: PageInfo
) -> dict:
    """Rescale and shift the table lines of one analysis result.

    Every entry of ``bboxes["tableLines"]`` is scaled with ``rescale_to_pdf`` and then
    shifted by ``offsets[0]`` in x and by ``page_info.height - offsets[1]`` in y.

    Args:
        bboxes: Analysis result; its ``"tableLines"`` entry is replaced in place
            (a missing entry is treated as an empty list).
        offsets: ``(x, y)`` offset pair.
        page_info: Geometry of the page the lines belong to.

    Returns:
        The same ``bboxes`` dict, now holding the transformed lines.
    """
    logger.debug(f"{offsets=}")

    def convert(line: dict) -> dict:
        corners = (line["x1"], line["y1"], line["x2"], line["y2"])
        x1, y1, x2, y2 = rescale_to_pdf(corners, page_info=page_info)
        offset_x, offset_y = offsets
        offset_y = page_info.height - offset_y
        logger.debug((f"new offsets: {offset_x}, {offset_y}"))
        return {
            "x1": x1 + offset_x,
            "y1": y1 + offset_y,
            "x2": x2 + offset_x,
            "y2": y2 + offset_y,
        }

    table_lines = bboxes.get("tableLines", [])
    transformed_lines = [convert(line) for line in table_lines]
    bboxes["tableLines"] = transformed_lines

    import json

    # Log every line before and after the transformation, separated by an empty entry.
    for before, after in zip(table_lines, transformed_lines):
        logger.debug(json.dumps(before, indent=4))
        logger.debug(json.dumps(after, indent=4))
        logger.debug("")

    return bboxes
|
||||
|
||||
|
||||
def extract_images_from_pdf(
    pdf_bytes: bytes, vlp_output: dict, dpi: int = 200
) -> tuple[list[Array], list[dict], list[PageInfo]]:
    """Crop a grayscale image for every box labelled "table" in ``vlp_output``.

    Args:
        pdf_bytes: The PDF document.
        vlp_output: Either a dict whose ``"data"`` entry is the list of page dicts, or
            that list itself. Each page dict needs ``"page_idx"`` and ``"boxes"``; each
            box needs ``"label"``, ``"uuid"`` and a ``"box"`` dict with ``x1``, ``y1``,
            ``x2`` and ``y2``.
        dpi: Resolution used for the table crops and for the page rendering whose pixel
            size is stored in ``PageInfo``.

    Returns:
        Three index-aligned lists with one entry per table box: the image arrays, the
        box metadata (``pageNum``, ``bbox``, ``uuid``, ``label``) and the ``PageInfo``
        of the page the box is on.
    """
    get_corners = itemgetter("x1", "y1", "x2", "y2")

    with fitz.open(stream=pdf_bytes) as fh:
        table_images = []
        table_info = []
        page_info = []

        vlp_output = vlp_output["data"] if isinstance(vlp_output, dict) else vlp_output

        for page_dict in vlp_output:
            page_num = int(page_dict["page_idx"])
            boxes = [box_obj for box_obj in page_dict["boxes"] if box_obj["label"] == "table"]

            page = fh[page_num]
            if not boxes:
                # Nothing to crop: skip the full-page rendering, which is only needed
                # for the PageInfo of pages that contain tables.
                continue
            page.wrap_contents()

            # Render with the requested dpi (previously hard-coded to 200) so that the
            # recorded pixel size matches the dpi stored in PageInfo.
            page_image = page.get_pixmap(dpi=dpi)
            current_page_info = PageInfo(
                page_num,
                page.rotation_matrix,
                page.transformation_matrix,
                dpi,
                *page.rect[-2:],
                page_image.w,
                page_image.h,
                page.rotation,
            )

            for box_obj in boxes:
                bbox = box_obj["box"]
                x1, y1, x2, y2 = get_corners(bbox)
                rect = fitz.Rect((x1, y1), (x2, y2))
                # FIXME: Check if de-rotation works as intended and is necessary at all.
                # Note that there exists also a derotation_matrix. If changing this, also change the
                # current_page_info object to include the derotation_matrix.
                rect = rect * page.transformation_matrix * page.rotation_matrix
                pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY)
                # Keep a channel axis only when the pixmap has more than one component.
                shape = (pixmap.h, pixmap.w, pixmap.n) if pixmap.n > 1 else (pixmap.h, pixmap.w)
                image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(*shape)

                table_images.append(image)
                table_info.append(
                    {
                        "pageNum": page_num,
                        "bbox": bbox,
                        "uuid": box_obj["uuid"],
                        "label": box_obj["label"],
                    }
                )
                # One PageInfo per table keeps the three lists index-aligned.
                page_info.append(current_page_info)

    return table_images, table_info, page_info
|
||||
@ -1,12 +1,11 @@
|
||||
from numpy import array, ndarray
|
||||
import pdf2image
|
||||
from numpy import array, ndarray
|
||||
from PIL import Image
|
||||
|
||||
from cv_analysis.utils.preprocessing import preprocess_page_array
|
||||
|
||||
|
||||
def open_pdf(pdf, first_page=0, last_page=None):
|
||||
|
||||
first_page += 1
|
||||
last_page = None if last_page is None else last_page + 1
|
||||
|
||||
@ -14,11 +13,17 @@ def open_pdf(pdf, first_page=0, last_page=None):
|
||||
if pdf.lower().endswith((".png", ".jpg", ".jpeg")):
|
||||
pages = [Image.open(pdf)]
|
||||
elif pdf.lower().endswith(".pdf"):
|
||||
pages = pdf2image.convert_from_path(pdf, first_page=first_page, last_page=last_page)
|
||||
pages = pdf2image.convert_from_path(
|
||||
pdf, first_page=first_page, last_page=last_page
|
||||
)
|
||||
else:
|
||||
raise IOError("Invalid file extension. Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf")
|
||||
raise IOError(
|
||||
"Invalid file extension. Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf"
|
||||
)
|
||||
elif type(pdf) == bytes:
|
||||
pages = pdf2image.convert_from_bytes(pdf, first_page=first_page, last_page=last_page)
|
||||
pages = pdf2image.convert_from_bytes(
|
||||
pdf, first_page=first_page, last_page=last_page
|
||||
)
|
||||
elif type(pdf) in {list, ndarray}:
|
||||
return pdf
|
||||
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
from sys import stdout
|
||||
from typing import Union
|
||||
|
||||
from kn_utils.logging import logger
|
||||
from pyinfra.examples import start_standard_queue_consumer
|
||||
@ -17,7 +18,7 @@ logger.reconfigure(sink=stdout, level=settings.logging.level)
|
||||
def make_dispatched_data_analysis(config):
|
||||
skip_pages_without_images = config.table_parsing.skip_pages_without_images
|
||||
|
||||
def inner(data: bytes, message: dict) -> list:
|
||||
def inner(data: Union[dict, bytes], message: dict) -> list:
|
||||
operation = message["operation"]
|
||||
analyse = get_analysis_pipeline(operation, skip_pages_without_images)
|
||||
return list(analyse(data))
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
outs:
|
||||
- md5: f74c866991f90b519dd334980ce0d495.dir
|
||||
size: 2832497
|
||||
nfiles: 21
|
||||
- md5: d8630d20056547025abbabc895f6f62a.dir
|
||||
size: 4715796
|
||||
nfiles: 22
|
||||
path: test_data
|
||||
|
||||
@ -78,7 +78,7 @@ def formatter(operation):
|
||||
raise
|
||||
|
||||
|
||||
@pytest.mark.parametrize("operation", ["table_cells", "figure"])
|
||||
@pytest.mark.parametrize("operation", ["figure"])
|
||||
def test_analysis_pipeline(empty_pdf, formatter, expected_formatted_analysis_result):
|
||||
analysis_pipeline = make_analysis_pipeline(
|
||||
analysis_fn_mock, formatter, dpi=200, skip_pages_without_images=False
|
||||
|
||||
24
test/unit_tests/table_inference_test.py
Normal file
24
test/unit_tests/table_inference_test.py
Normal file
@ -0,0 +1,24 @@
|
||||
from cv_analysis.server.pipeline import make_image_analysis_pipeline
|
||||
from cv_analysis.table_inference import infer_lines
|
||||
|
||||
|
||||
def test_table_inference_smoke():
    """Smoke test: the table inference pipeline runs and returns well-formed output.

    Only structural properties are asserted; the number of inferred lines depends on
    the page content and is deliberately not checked.
    """
    pl = make_image_analysis_pipeline(infer_lines)
    with open("test/test_data/article.pdf", "rb") as f:
        pdf_bytes = f.read()
    table_box = {
        "uuid": "marius-marius-gib-mir-meine-legionen-wieder",
        "label": "table",
        "box": {"x1": 100, "y1": 100, "x2": 200, "y2": 200},
    }
    vlp_mock = {"data": [{"page_idx": 1, "boxes": [table_box]}]}
    data = {"pdf": pdf_bytes, "vlp_output": vlp_mock}

    output = list(pl(data))

    # One table box goes in, so exactly one result must come out.
    assert len(output) == 1
    result = output[0]
    assert result["pageNum"] == 1
    assert result["uuid"] == table_box["uuid"]
    assert result["label"] == "table"
    assert set(result["imageInfo"]) == {"height", "width"}
    assert all(sorted(line) == ["x1", "x2", "y1", "y2"] for line in result["tableLines"])
|
||||
Loading…
x
Reference in New Issue
Block a user