[WIP] recursive random table

This commit is contained in:
Matthias Bisping 2023-01-18 11:42:50 +01:00
parent 4d11a157e5
commit 893622a73e

View File

@ -1,26 +1,27 @@
import io
import itertools
import os
import random
import string
import textwrap
from functools import lru_cache
from functools import lru_cache, partial
from pathlib import Path
from typing import Tuple, Union, Iterable, List
import albumentations as A
import cv2 as cv
import loguru
import numpy as np
import pandas as pd
import pytest
from PIL import Image, ImageOps, ImageFont, ImageDraw
from PIL.Image import Transpose
from faker import Faker
from loguru import logger
from matplotlib import pyplot as plt
from tabulate import tabulate
from cv_analysis.table_parsing import isolate_vertical_and_horizontal_components
from cv_analysis.utils import star, rconj
from cv_analysis.utils.common import normalize_to_gray_scale
from cv_analysis.utils.merging import merge_related_rectangles
from cv_analysis.utils.postprocessing import remove_overlapping, remove_included
@ -97,7 +98,8 @@ from funcy import (
lfilter,
lzip,
keep,
lkeep,
repeatedly,
mapcat,
)
from cv_analysis.locations import TEST_PAGE_TEXTURES_DIR
@ -365,6 +367,9 @@ class ContentRectangle(Rectangle):
super().__init__(x1, y1, x2, y2)
self.content = content
def __repr__(self):
    """Return a debug string with the concrete class name, the four coordinates and the content."""
    class_name = self.__class__.__name__
    return f"{class_name}({self.x1}, {self.y1}, {self.x2}, {self.y2}, content={self.content})"
class ContentGenerator:
def __init__(self):
@ -380,7 +385,7 @@ class ContentGenerator:
char_boxes = lfilter(is_square_like, char_boxes)
text_boxes = merge_related_rectangles(text_boxes)
text_boxes = lmap(generate_random_text_block, text_boxes)
text_boxes = lmap(generate_recursive_random_table, text_boxes)
plots = lmap(generate_random_table, every_nth(2, char_boxes))
tables = lmap(generate_random_table, every_nth(2, char_boxes[1:]))
@ -411,6 +416,12 @@ def generate_random_table(rectangle: Rectangle) -> ContentRectangle:
return block
def generate_recursive_random_table(rectangle: Rectangle) -> ContentRectangle:
    """Create a RecursiveRandomTable spanning ``rectangle`` and populate its content.

    The table is constructed from ``rectangle.coords`` and filled by calling its
    ``generate_random_table`` method before it is returned.
    """
    table = RecursiveRandomTable(*rectangle.coords)
    table.generate_random_table()
    return table
@lru_cache(maxsize=None)
def get_random_seed():
    """Return a random integer seed in the inclusive range [0, 2**32 - 1].

    The function takes no arguments and is cached, so the value is drawn once and
    every later call returns that same seed.
    """
    largest_seed = 2**32 - 1
    return random.randint(0, largest_seed)
@ -423,6 +434,66 @@ class RandomContentRectangle(ContentRectangle):
self.random = random.Random(self.seed)
class RecursiveRandomTable(RandomContentRectangle):
    """Rectangle whose content is a grid of cells, each filled with a short random text block.

    The grid layout is derived from the rectangle's size: one column per 100 pixels of
    width and one row per 100 pixels of height, with at least one of each.
    """

    def __init__(self, x1, y1, x2, y2, seed=None):
        """Compute the grid layout and start with a blank white canvas of the rectangle's size."""
        super().__init__(x1, y1, x2, y2, seed=seed)
        self.n_columns = max(self.width // 100, 1)
        self.n_rows = max(self.height // 100, 1)
        # Integer division: the cells may leave a few pixels uncovered at the right/bottom edge.
        self.cell_size = (self.width // self.n_columns, self.height // self.n_rows)
        logger.debug(f"RecursiveRandomTable({self.n_columns}, {self.n_rows}, {self.cell_size}) at {self}")
        self.content = Image.new("RGB", (self.width, self.height), (255, 255, 255))

    def generate_random_table(self):
        """Fill every cell with text, paste the cells onto the canvas and outline them."""
        cells = list(self.generate_cells_with_content())
        self.content = paste_contents(self.content, cells)
        # Bring the canvas to RGBA, the mode the cell contents are asserted to have.
        self.content = self.content.convert("RGBA")
        assert self.content.mode == "RGBA"
        self.draw_cell_borders(cells)

    def generate_cells_with_content(self):
        """Yield each grid cell after assigning it the content of a one-sentence text block."""
        for cell in self.generate_table():
            cell.content = generate_random_text_block(cell, n_sentences=1).content
            assert cell.content.mode == "RGBA"
            yield cell

    def draw_cell_borders(self, cells):
        """Draw a black outline around every given cell on the table's canvas."""
        # NOTE(review): cells are created with the table's own x1/y1 offset (see
        # generate_cell), while the canvas from __init__ is only width x height.
        # Confirm that this is the intended coordinate frame for drawing.
        draw = ImageDraw.Draw(self.content)
        for cell in cells:
            draw.rectangle(cell.coords, outline=(0, 0, 0))

    def draw_single_cell_borders(self, cell):
        """Outline a single cell with the colour (1, 1, 0); not called from within this class."""
        draw = ImageDraw.Draw(self.content)
        draw.rectangle(cell.coords, outline=(1, 1, 0))

    def generate_table(self) -> Iterable[ContentRectangle]:
        """Yield all cells of the grid, column by column."""
        yield from mapcat(self.generate_column, range(self.n_columns))

    def generate_column(self, column_index) -> Iterable[ContentRectangle]:
        """Yield the cells of one column, one per row index."""
        logger.debug(f"Generating column {column_index}.")
        generate_cell_content_for_row = partial(self.generate_cell, column_index=column_index)
        yield from map(generate_cell_content_for_row, range(self.n_rows))

    def generate_cell(self, row_index, column_index) -> ContentRectangle:
        """Return an empty ContentRectangle for the cell at the given row and column."""
        w, h = self.cell_size
        x1, y1 = self.x1 + column_index * w, self.y1 + row_index * h
        x2, y2 = x1 + w, y1 + h
        logger.debug(f"Generating cell ({row_index}, {column_index}) at ({x1}, {y1}, {x2}, {y2}).")
        # Internal invariants: a cell must lie within the table's own rectangle.
        assert x1 >= self.x1
        assert y1 >= self.y1
        assert x2 <= self.x2
        assert y2 <= self.y2
        return ContentRectangle(x1, y1, x2, y2)

    def generate_column_names(self):
        """Return the result of producing a column name ``n_columns`` times via ``repeatedly``."""
        column_names = repeatedly(self.generate_column_name, self.n_columns)
        return column_names

    def generate_column_name(self):
        """Return one to three random words from Faker as a column name."""
        # Use the instance's seeded generator instead of the global ``random`` module so
        # the word count is reproducible for a given seed.
        n_words = self.random.randint(1, 3)
        column_name = Faker().words(n_words)
        return column_name
class RandomTable(RandomContentRectangle):
def __init__(self, x1, y1, x2, y2, seed=None):
super().__init__(x1, y1, x2, y2, seed=seed)
@ -436,6 +507,7 @@ class RandomTable(RandomContentRectangle):
text_table = self.generate_random_ascii_table(rectangle)
table_lines = text_table.split("\n")
image = write_lines_to_image(table_lines, rectangle)
self.join_lines(image)
self.content = image
@ -459,15 +531,21 @@ class RandomTable(RandomContentRectangle):
return df
def join_lines(self, table: Image.Image):
    """Extract the vertical and horizontal line components of a rendered table image.

    The image is normalized to an array, converted to gray scale and passed to
    ``isolate_vertical_and_horizontal_components``.
    """
    gray_table = normalize_to_gray_scale(normalize_image_format_to_array(table))
    # NOTE(review): the resulting grid is neither returned nor stored, so this method
    # currently has no visible effect for the caller. Presumably work in progress.
    grid = isolate_vertical_and_horizontal_components(gray_table)
    # grid = cv2.bitwise_not(grid)
def generate_random_ascii_table(self, rectangle: Rectangle):
df = self.generate_random_dataframe(rectangle)
table_format = random.choice(
[
"simple",
# "simple",
"grid",
"presto",
"psql",
"rst",
# "presto",
# "psql",
# "rst",
]
)
text_table = tabulate(df, headers="keys", tablefmt=table_format)
@ -506,7 +584,7 @@ class RandomFontPicker:
fonts = filter(self.font_is_renderable, fonts) # FIXME: this does not work
font = first(fonts)
loguru.logger.debug(f"Using font: {font}")
logger.debug(f"Using font: {font}")
return font
def shuffle_fonts(self):
@ -519,7 +597,7 @@ class RandomFontPicker:
@lru_cache(maxsize=None)
def load_font(self, font: str):
loguru.logger.trace(f"Loading font: {font}")
logger.trace(f"Loading font: {font}")
try:
return ImageFont.truetype(font, size=11)
except OSError:
@ -666,9 +744,9 @@ def maybe():
return random.random() > 0.9
def generate_random_text_block(rectangle: Rectangle, n_sentences=3000) -> ContentRectangle:
    """Create a RandomTextBlock spanning ``rectangle`` and fill it with random text.

    ``n_sentences`` is forwarded unchanged to the block's ``generate_random_text``.
    """
    text_block = RandomTextBlock(*rectangle.coords)
    text_block.generate_random_text(rectangle, n_sentences)
    return text_block
@ -697,10 +775,12 @@ class RandomTextBlock(ContentRectangle):
def __call__(self, *args, **kwargs):
pass
def generate_random_text(self, rectangle: Rectangle, n_sentences=3000):
    """Render random text lines for ``rectangle`` into an image and store it as the content.

    The lines come from ``generate_random_text_lines`` (formatted with
    ``self.format_lines``) and are drawn with ``self.font``. Returns ``self``.
    """
    text_lines = generate_random_text_lines(rectangle, self.format_lines, n_sentences)
    self.content = write_lines_to_image(text_lines, rectangle, self.font)
    assert self.content.mode == "RGBA"
    return self
def format_lines(self, lines, last_full):
def truncate_current_line():
@ -731,8 +811,8 @@ class RandomTextBlock(ContentRectangle):
return line
def generate_random_text_lines(rectangle: Rectangle, line_formatter=identity) -> List[str]:
text = Faker().paragraph(nb_sentences=3000, variable_nb_sentences=False, ext_word_list=None)
def generate_random_text_lines(rectangle: Rectangle, line_formatter=identity, n_sentences=3000) -> List[str]:
text = Faker().paragraph(nb_sentences=n_sentences, variable_nb_sentences=False, ext_word_list=None)
unformatted_lines = textwrap.wrap(text, width=rectangle.width, break_long_words=False)
# each iteration of the line formatter function formats one more line and adds it to the back of the list
formatted_lines_generator = iterate(star(line_formatter), (unformatted_lines, True))