[WIP] recursive random table
This commit is contained in:
parent
4d11a157e5
commit
893622a73e
114
test/fixtures/page_generation/page.py
vendored
114
test/fixtures/page_generation/page.py
vendored
@ -1,26 +1,27 @@
|
|||||||
import io
|
import io
|
||||||
import itertools
|
import itertools
|
||||||
import os
|
|
||||||
import random
|
import random
|
||||||
import string
|
import string
|
||||||
import textwrap
|
import textwrap
|
||||||
from functools import lru_cache
|
from functools import lru_cache, partial
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Tuple, Union, Iterable, List
|
from typing import Tuple, Union, Iterable, List
|
||||||
|
|
||||||
import albumentations as A
|
import albumentations as A
|
||||||
import cv2 as cv
|
import cv2 as cv
|
||||||
import loguru
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pytest
|
import pytest
|
||||||
from PIL import Image, ImageOps, ImageFont, ImageDraw
|
from PIL import Image, ImageOps, ImageFont, ImageDraw
|
||||||
from PIL.Image import Transpose
|
from PIL.Image import Transpose
|
||||||
from faker import Faker
|
from faker import Faker
|
||||||
|
from loguru import logger
|
||||||
from matplotlib import pyplot as plt
|
from matplotlib import pyplot as plt
|
||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
from cv_analysis.table_parsing import isolate_vertical_and_horizontal_components
|
||||||
from cv_analysis.utils import star, rconj
|
from cv_analysis.utils import star, rconj
|
||||||
|
from cv_analysis.utils.common import normalize_to_gray_scale
|
||||||
from cv_analysis.utils.merging import merge_related_rectangles
|
from cv_analysis.utils.merging import merge_related_rectangles
|
||||||
from cv_analysis.utils.postprocessing import remove_overlapping, remove_included
|
from cv_analysis.utils.postprocessing import remove_overlapping, remove_included
|
||||||
|
|
||||||
@ -97,7 +98,8 @@ from funcy import (
|
|||||||
lfilter,
|
lfilter,
|
||||||
lzip,
|
lzip,
|
||||||
keep,
|
keep,
|
||||||
lkeep,
|
repeatedly,
|
||||||
|
mapcat,
|
||||||
)
|
)
|
||||||
|
|
||||||
from cv_analysis.locations import TEST_PAGE_TEXTURES_DIR
|
from cv_analysis.locations import TEST_PAGE_TEXTURES_DIR
|
||||||
@ -365,6 +367,9 @@ class ContentRectangle(Rectangle):
|
|||||||
super().__init__(x1, y1, x2, y2)
|
super().__init__(x1, y1, x2, y2)
|
||||||
self.content = content
|
self.content = content
|
||||||
|
|
||||||
|
def __repr__(self):
    """Return a constructor-like string: class name, the four corner coordinates and the content."""
    class_name = self.__class__.__name__
    corners = f"{self.x1}, {self.y1}, {self.x2}, {self.y2}"
    return f"{class_name}({corners}, content={self.content})"
|
||||||
|
|
||||||
|
|
||||||
class ContentGenerator:
|
class ContentGenerator:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -380,7 +385,7 @@ class ContentGenerator:
|
|||||||
char_boxes = lfilter(is_square_like, char_boxes)
|
char_boxes = lfilter(is_square_like, char_boxes)
|
||||||
text_boxes = merge_related_rectangles(text_boxes)
|
text_boxes = merge_related_rectangles(text_boxes)
|
||||||
|
|
||||||
text_boxes = lmap(generate_random_text_block, text_boxes)
|
text_boxes = lmap(generate_recursive_random_table, text_boxes)
|
||||||
plots = lmap(generate_random_table, every_nth(2, char_boxes))
|
plots = lmap(generate_random_table, every_nth(2, char_boxes))
|
||||||
tables = lmap(generate_random_table, every_nth(2, char_boxes[1:]))
|
tables = lmap(generate_random_table, every_nth(2, char_boxes[1:]))
|
||||||
|
|
||||||
@ -411,6 +416,12 @@ def generate_random_table(rectangle: Rectangle) -> ContentRectangle:
|
|||||||
return block
|
return block
|
||||||
|
|
||||||
|
|
||||||
|
def generate_recursive_random_table(rectangle: Rectangle) -> ContentRectangle:
    """Build a RecursiveRandomTable covering `rectangle` and populate its content.

    The table is constructed from the rectangle's coordinates, its
    `generate_random_table()` method is run once, and the table is returned.
    """
    table = RecursiveRandomTable(*rectangle.coords)
    table.generate_random_table()
    return table
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=None)
def get_random_seed():
    """Return a random seed in [0, 2**32 - 1].

    The function takes no arguments and is cached, so the value is drawn once
    and every later call returns that same seed.
    """
    largest_seed = 2**32 - 1
    return random.randint(0, largest_seed)
|
||||||
@ -423,6 +434,66 @@ class RandomContentRectangle(ContentRectangle):
|
|||||||
self.random = random.Random(self.seed)
|
self.random = random.Random(self.seed)
|
||||||
|
|
||||||
|
|
||||||
|
class RecursiveRandomTable(RandomContentRectangle):
    """A rectangular grid of cells, each filled with a short random text block.

    The number of columns and rows is derived from the rectangle's size
    (one per full 100 units of width/height, at least one each). The rendered
    table is stored as a PIL image in `self.content`.
    """

    def __init__(self, x1, y1, x2, y2, seed=None):
        """Set up the grid dimensions and a blank white content image."""
        super().__init__(x1, y1, x2, y2, seed=seed)
        # One column/row per full 100 units, but never fewer than one.
        self.n_columns = max(self.width // 100, 1)
        self.n_rows = max(self.height // 100, 1)
        # Integer division: any remainder stays uncovered at the right/bottom edge.
        self.cell_size = (self.width // self.n_columns, self.height // self.n_rows)
        logger.debug(f"RecursiveRandomTable({self.n_columns}, {self.n_rows}, {self.cell_size}) at {self}")
        self.content = Image.new("RGB", (self.width, self.height), (255, 255, 255))

    def generate_random_table(self):
        """Fill every cell with text, paste the cells into the content image and draw borders."""
        # Materialized because the cells are consumed twice (pasting and border drawing).
        cells = list(self.generate_cells_with_content())
        self.content = paste_contents(self.content, cells)
        # convert() returns an image in the requested mode, so no extra mode check is needed.
        self.content = self.content.convert("RGBA")
        self.draw_cell_borders(cells)

    def generate_cells_with_content(self):
        """Yield each grid cell after attaching a one-sentence random text image as its content."""
        for cell in self.generate_table():
            cell.content = generate_random_text_block(cell, n_sentences=1).content
            # Sanity check on the helper's output format.
            assert cell.content.mode == "RGBA"
            yield cell

    def draw_cell_borders(self, cells):
        """Outline every given cell in black on the content image.

        NOTE(review): cell coordinates are offset by (self.x1, self.y1) (see
        `generate_cell`), while the content image is only `width` x `height`
        in size. Confirm which coordinate space `paste_contents` and this
        method are meant to share; for tables not located at the origin the
        outlines may fall partly or fully outside the image.
        """
        draw = ImageDraw.Draw(self.content)
        for cell in cells:
            draw.rectangle(cell.coords, outline=(0, 0, 0))

    def draw_single_cell_borders(self, cell):
        """Outline a single cell on the content image using the colour (1, 1, 0)."""
        draw = ImageDraw.Draw(self.content)
        draw.rectangle(cell.coords, outline=(1, 1, 0))

    def generate_table(self) -> Iterable[ContentRectangle]:
        """Yield all cells of the grid, column by column."""
        yield from mapcat(self.generate_column, range(self.n_columns))

    def generate_column(self, column_index) -> Iterable[ContentRectangle]:
        """Yield the cells of one column, from the first row to the last."""
        logger.debug(f"Generating column {column_index}.")
        generate_cell_content_for_row = partial(self.generate_cell, column_index=column_index)
        yield from map(generate_cell_content_for_row, range(self.n_rows))

    def generate_cell(self, row_index, column_index) -> ContentRectangle:
        """Return the (content-less) rectangle for the cell at the given row and column.

        Coordinates are offset by this table's own (x1, y1); the assertions
        check that the cell stays within the table's bounds.
        """
        w, h = self.cell_size
        x1, y1 = self.x1 + column_index * w, self.y1 + row_index * h
        x2, y2 = x1 + w, y1 + h
        logger.debug(f"Generating cell ({row_index}, {column_index}) at ({x1}, {y1}, {x2}, {y2}).")
        assert x1 >= self.x1
        assert y1 >= self.y1
        assert x2 <= self.x2
        assert y2 <= self.y2
        return ContentRectangle(x1, y1, x2, y2)

    def generate_column_names(self):
        """Return one generated column name per column, as produced by `repeatedly`."""
        column_names = repeatedly(self.generate_column_name, self.n_columns)
        return column_names

    def generate_column_name(self):
        """Return a column name made of one to three Faker words.

        The word count is drawn from this instance's own seeded generator
        (`self.random`) rather than the global `random` module, so it follows
        the table's seed.
        NOTE(review): the words themselves still come from an unseeded Faker().
        """
        column_name = Faker().words(self.random.randint(1, 3))
        return column_name
|
||||||
|
|
||||||
|
|
||||||
class RandomTable(RandomContentRectangle):
|
class RandomTable(RandomContentRectangle):
|
||||||
def __init__(self, x1, y1, x2, y2, seed=None):
|
def __init__(self, x1, y1, x2, y2, seed=None):
|
||||||
super().__init__(x1, y1, x2, y2, seed=seed)
|
super().__init__(x1, y1, x2, y2, seed=seed)
|
||||||
@ -436,6 +507,7 @@ class RandomTable(RandomContentRectangle):
|
|||||||
text_table = self.generate_random_ascii_table(rectangle)
|
text_table = self.generate_random_ascii_table(rectangle)
|
||||||
table_lines = text_table.split("\n")
|
table_lines = text_table.split("\n")
|
||||||
image = write_lines_to_image(table_lines, rectangle)
|
image = write_lines_to_image(table_lines, rectangle)
|
||||||
|
self.join_lines(image)
|
||||||
|
|
||||||
self.content = image
|
self.content = image
|
||||||
|
|
||||||
@ -459,15 +531,21 @@ class RandomTable(RandomContentRectangle):
|
|||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
def join_lines(self, table: Image.Image):
    """Extract the horizontal and vertical line components of a rendered table image.

    The image is converted to an array, reduced to gray scale and passed to
    `isolate_vertical_and_horizontal_components`.

    NOTE(review): the resulting grid is not used or returned, so this method
    currently has no visible effect on `table`; it looks unfinished.
    """
    as_array = normalize_image_format_to_array(table)
    gray = normalize_to_gray_scale(as_array)
    grid = isolate_vertical_and_horizontal_components(gray)
    # TODO: an inversion of `grid` (bitwise not) was sketched here but is disabled.
|
||||||
|
|
||||||
def generate_random_ascii_table(self, rectangle: Rectangle):
|
def generate_random_ascii_table(self, rectangle: Rectangle):
|
||||||
df = self.generate_random_dataframe(rectangle)
|
df = self.generate_random_dataframe(rectangle)
|
||||||
table_format = random.choice(
|
table_format = random.choice(
|
||||||
[
|
[
|
||||||
"simple",
|
# "simple",
|
||||||
"grid",
|
"grid",
|
||||||
"presto",
|
# "presto",
|
||||||
"psql",
|
# "psql",
|
||||||
"rst",
|
# "rst",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
text_table = tabulate(df, headers="keys", tablefmt=table_format)
|
text_table = tabulate(df, headers="keys", tablefmt=table_format)
|
||||||
@ -506,7 +584,7 @@ class RandomFontPicker:
|
|||||||
fonts = filter(self.font_is_renderable, fonts) # FIXME: this does not work
|
fonts = filter(self.font_is_renderable, fonts) # FIXME: this does not work
|
||||||
|
|
||||||
font = first(fonts)
|
font = first(fonts)
|
||||||
loguru.logger.debug(f"Using font: {font}")
|
logger.debug(f"Using font: {font}")
|
||||||
return font
|
return font
|
||||||
|
|
||||||
def shuffle_fonts(self):
|
def shuffle_fonts(self):
|
||||||
@ -519,7 +597,7 @@ class RandomFontPicker:
|
|||||||
|
|
||||||
@lru_cache(maxsize=None)
|
@lru_cache(maxsize=None)
|
||||||
def load_font(self, font: str):
|
def load_font(self, font: str):
|
||||||
loguru.logger.trace(f"Loading font: {font}")
|
logger.trace(f"Loading font: {font}")
|
||||||
try:
|
try:
|
||||||
return ImageFont.truetype(font, size=11)
|
return ImageFont.truetype(font, size=11)
|
||||||
except OSError:
|
except OSError:
|
||||||
@ -666,9 +744,9 @@ def maybe():
|
|||||||
return random.random() > 0.9
|
return random.random() > 0.9
|
||||||
|
|
||||||
|
|
||||||
def generate_random_text_block(rectangle: Rectangle, n_sentences=3000) -> ContentRectangle:
    """Create a RandomTextBlock over `rectangle` and fill it with random text.

    `n_sentences` is forwarded to the block's `generate_random_text`; the
    default of 3000 matches the previously hard-coded amount of text.
    """
    text_block = RandomTextBlock(*rectangle.coords)
    text_block.generate_random_text(rectangle, n_sentences)
    return text_block
|
||||||
|
|
||||||
|
|
||||||
@ -697,10 +775,12 @@ class RandomTextBlock(ContentRectangle):
|
|||||||
def __call__(self, *args, **kwargs):
    """Accept any arguments and do nothing; the result is always None."""
    return None
|
||||||
|
|
||||||
def generate_random_text(self, rectangle: Rectangle, n_sentences=3000):
    """Render random text for `rectangle` into this block's content and return the block.

    Lines are produced by `generate_random_text_lines` (using this block's
    `format_lines` as the formatter) and drawn with this block's font.
    """
    text_lines = generate_random_text_lines(rectangle, self.format_lines, n_sentences)
    self.content = write_lines_to_image(text_lines, rectangle, self.font)
    # Sanity check on the format of the rendered image.
    assert self.content.mode == "RGBA"
    return self
|
||||||
|
|
||||||
def format_lines(self, lines, last_full):
|
def format_lines(self, lines, last_full):
|
||||||
def truncate_current_line():
|
def truncate_current_line():
|
||||||
@ -731,8 +811,8 @@ class RandomTextBlock(ContentRectangle):
|
|||||||
return line
|
return line
|
||||||
|
|
||||||
|
|
||||||
def generate_random_text_lines(rectangle: Rectangle, line_formatter=identity) -> List[str]:
|
def generate_random_text_lines(rectangle: Rectangle, line_formatter=identity, n_sentences=3000) -> List[str]:
|
||||||
text = Faker().paragraph(nb_sentences=3000, variable_nb_sentences=False, ext_word_list=None)
|
text = Faker().paragraph(nb_sentences=n_sentences, variable_nb_sentences=False, ext_word_list=None)
|
||||||
unformatted_lines = textwrap.wrap(text, width=rectangle.width, break_long_words=False)
|
unformatted_lines = textwrap.wrap(text, width=rectangle.width, break_long_words=False)
|
||||||
# each iteration of the line formatter function formats one more line and adds it to the back of the list
|
# each iteration of the line formatter function formats one more line and adds it to the back of the list
|
||||||
formatted_lines_generator = iterate(star(line_formatter), (unformatted_lines, True))
|
formatted_lines_generator = iterate(star(line_formatter), (unformatted_lines, True))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user