diff --git a/test/fixtures/page_generation/page.py b/test/fixtures/page_generation/page.py
index b2f2850..45d9107 100644
--- a/test/fixtures/page_generation/page.py
+++ b/test/fixtures/page_generation/page.py
@@ -1,26 +1,27 @@
 import io
 import itertools
-import os
 import random
 import string
 import textwrap
-from functools import lru_cache
+from functools import lru_cache, partial
 from pathlib import Path
 from typing import Tuple, Union, Iterable, List
 
 import albumentations as A
 import cv2 as cv
-import loguru
 import numpy as np
 import pandas as pd
 import pytest
 from PIL import Image, ImageOps, ImageFont, ImageDraw
 from PIL.Image import Transpose
 from faker import Faker
+from loguru import logger
 from matplotlib import pyplot as plt
 from tabulate import tabulate
 
+from cv_analysis.table_parsing import isolate_vertical_and_horizontal_components
 from cv_analysis.utils import star, rconj
+from cv_analysis.utils.common import normalize_to_gray_scale
 from cv_analysis.utils.merging import merge_related_rectangles
 from cv_analysis.utils.postprocessing import remove_overlapping, remove_included
 
@@ -97,7 +98,8 @@ from funcy import (
     lfilter,
     lzip,
     keep,
-    lkeep,
+    repeatedly,
+    mapcat,
 )
 
 from cv_analysis.locations import TEST_PAGE_TEXTURES_DIR
@@ -365,6 +367,9 @@ class ContentRectangle(Rectangle):
         super().__init__(x1, y1, x2, y2)
         self.content = content
 
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self.x1}, {self.y1}, {self.x2}, {self.y2}, content={self.content})"
+
 
 class ContentGenerator:
     def __init__(self):
@@ -380,7 +385,7 @@ class ContentGenerator:
         char_boxes = lfilter(is_square_like, char_boxes)
 
         text_boxes = merge_related_rectangles(text_boxes)
-        text_boxes = lmap(generate_random_text_block, text_boxes)
+        text_boxes = lmap(generate_recursive_random_table, text_boxes)
         plots = lmap(generate_random_table, every_nth(2, char_boxes))
         tables = lmap(generate_random_table, every_nth(2, char_boxes[1:]))
 
@@ -411,6 +416,13 @@ def generate_random_table(rectangle: Rectangle) -> ContentRectangle:
     return block
 
 
+def generate_recursive_random_table(rectangle: Rectangle) -> ContentRectangle:
+    """Build a RecursiveRandomTable over the rectangle's coordinates and render its content."""
+    block = RecursiveRandomTable(*rectangle.coords)
+    block.generate_random_table()
+    return block
+
+
 @lru_cache(maxsize=None)
 def get_random_seed():
     return random.randint(0, 2**32 - 1)
@@ -423,6 +435,81 @@ class RandomContentRectangle(ContentRectangle):
         self.random = random.Random(self.seed)
 
 
+class RecursiveRandomTable(RandomContentRectangle):
+    """Table-like content: a grid of cells, each filled with a short random text block."""
+
+    def __init__(self, x1, y1, x2, y2, seed=None):
+        super().__init__(x1, y1, x2, y2, seed=seed)
+        # One column/row per full 100 units of width/height, but always at least one of each.
+        self.n_columns = max(self.width // 100, 1)
+        self.n_rows = max(self.height // 100, 1)
+        self.cell_size = (self.width // self.n_columns, self.height // self.n_rows)
+        logger.debug(f"RecursiveRandomTable({self.n_columns}, {self.n_rows}, {self.cell_size}) at {self}")
+        self.content = Image.new("RGB", (self.width, self.height), (255, 255, 255))
+
+    def generate_random_table(self):
+        """Fill every cell with text, paste the cells onto the table image and draw their borders."""
+        cells = list(self.generate_cells_with_content())
+        self.content = paste_contents(self.content, cells)
+        # convert to RGBA
+        self.content = self.content.convert("RGBA")
+        assert self.content.mode == "RGBA"
+        self.draw_cell_borders(cells)
+
+    def generate_cells_with_content(self):
+        """Yield the cells of the grid, each carrying the image of a one-sentence random text block."""
+        for cell in self.generate_table():
+            cell.content = generate_random_text_block(cell, n_sentences=1).content
+            assert cell.content.mode == "RGBA"
+            yield cell
+
+    def draw_cell_borders(self, cells):
+        """Draw a rectangle outline for every given cell onto the table image."""
+        draw = ImageDraw.Draw(self.content)
+        for cell in cells:
+            draw.rectangle(cell.coords, outline=(0, 0, 0))
+
+    def draw_single_cell_borders(self, cell):
+        """Draw the rectangle outline of a single cell onto the table image."""
+        draw = ImageDraw.Draw(self.content)
+        draw.rectangle(cell.coords, outline=(1, 1, 0))
+
+    def generate_table(self) -> Iterable[ContentRectangle]:
+        """Yield all cells of the grid, column by column."""
+        yield from mapcat(self.generate_column, range(self.n_columns))
+
+    def generate_column(self, column_index) -> Iterable[ContentRectangle]:
+        """Yield the cells of one column, from the first row to the last."""
+        logger.debug(f"Generating column {column_index}.")
+        generate_cell_content_for_row = partial(self.generate_cell, column_index=column_index)
+        yield from map(generate_cell_content_for_row, range(self.n_rows))
+
+    def generate_cell(self, row_index, column_index) -> ContentRectangle:
+        """Return the (still empty) rectangle of the cell at the given grid position."""
+        w, h = self.cell_size
+        # NOTE(review): cell coordinates are offset by (self.x1, self.y1), while self.content is a
+        #  (width, height) canvas; confirm paste_contents and draw_cell_borders expect that offset.
+        x1, y1 = self.x1 + column_index * w, self.y1 + row_index * h
+        x2, y2 = x1 + w, y1 + h
+        logger.debug(f"Generating cell ({row_index}, {column_index}) at ({x1}, {y1}, {x2}, {y2}).")
+        assert x1 >= self.x1
+        assert y1 >= self.y1
+        assert x2 <= self.x2
+        assert y2 <= self.y2
+        return ContentRectangle(x1, y1, x2, y2)
+
+    def generate_column_names(self):
+        """Produce one random column name per column."""
+        column_names = repeatedly(self.generate_column_name, self.n_columns)
+        return column_names
+
+    def generate_column_name(self):
+        """Return one to three random words to be used as a column name."""
+        # Draw the word count from the instance's seeded RNG, not the global random module.
+        column_name = Faker().words(self.random.randint(1, 3))
+        return column_name
+
+
 class RandomTable(RandomContentRectangle):
     def __init__(self, x1, y1, x2, y2, seed=None):
         super().__init__(x1, y1, x2, y2, seed=seed)
@@ -436,6 +523,7 @@ class RandomTable(RandomContentRectangle):
         text_table = self.generate_random_ascii_table(rectangle)
         table_lines = text_table.split("\n")
         image = write_lines_to_image(table_lines, rectangle)
+        self.join_lines(image)
 
         self.content = image
 
@@ -459,15 +547,24 @@ class RandomTable(RandomContentRectangle):
 
         return df
 
+    def join_lines(self, table: Image.Image):
+        """Return the result of isolate_vertical_and_horizontal_components for the gray-scale table image."""
+        table = normalize_image_format_to_array(table)
+        table = normalize_to_gray_scale(table)
+        grid = isolate_vertical_and_horizontal_components(table)
+        # TODO: the grid is not merged back into the table image yet (an inversion step was planned);
+        #  it is returned so the computed result is available to callers instead of being dropped.
+        return grid
+
     def generate_random_ascii_table(self, rectangle: Rectangle):
         df = self.generate_random_dataframe(rectangle)
         table_format = random.choice(
             [
-                "simple",
+                # "simple",
                 "grid",
-                "presto",
-                "psql",
-                "rst",
+                # "presto",
+                # "psql",
+                # "rst",
             ]
         )
         text_table = tabulate(df, headers="keys", tablefmt=table_format)
@@ -506,7 +603,7 @@ class RandomFontPicker:
 
         fonts = filter(self.font_is_renderable, fonts)  # FIXME: this does not work
         font = first(fonts)
-        loguru.logger.debug(f"Using font: {font}")
+        logger.debug(f"Using font: {font}")
         return font
 
     def shuffle_fonts(self):
@@ -519,7 +616,7 @@ class RandomFontPicker:
 
     @lru_cache(maxsize=None)
     def load_font(self, font: str):
-        loguru.logger.trace(f"Loading font: {font}")
+        logger.trace(f"Loading font: {font}")
         try:
             return ImageFont.truetype(font, size=11)
         except OSError:
@@ -666,9 +763,10 @@ def maybe():
     return random.random() > 0.9
 
 
-def generate_random_text_block(rectangle: Rectangle) -> ContentRectangle:
+def generate_random_text_block(rectangle: Rectangle, n_sentences=3000) -> ContentRectangle:
+    """Build a RandomTextBlock over the rectangle and render text generated from n_sentences sentences."""
     block = RandomTextBlock(*rectangle.coords)
-    block.generate_random_text(rectangle)
+    block.generate_random_text(rectangle, n_sentences)
     return block
 
 
@@ -697,10 +795,13 @@ class RandomTextBlock(ContentRectangle):
     def __call__(self, *args, **kwargs):
         pass
 
-    def generate_random_text(self, rectangle: Rectangle):
-        lines = generate_random_text_lines(rectangle, self.format_lines)
+    def generate_random_text(self, rectangle: Rectangle, n_sentences=3000):
+        """Render text generated from n_sentences sentences into self.content and return self."""
+        lines = generate_random_text_lines(rectangle, self.format_lines, n_sentences)
         image = write_lines_to_image(lines, rectangle, self.font)
         self.content = image
+        assert self.content.mode == "RGBA"
+        return self
 
     def format_lines(self, lines, last_full):
         def truncate_current_line():
@@ -731,8 +832,8 @@ class RandomTextBlock(ContentRectangle):
         return line
 
 
-def generate_random_text_lines(rectangle: Rectangle, line_formatter=identity) -> List[str]:
-    text = Faker().paragraph(nb_sentences=3000, variable_nb_sentences=False, ext_word_list=None)
+def generate_random_text_lines(rectangle: Rectangle, line_formatter=identity, n_sentences=3000) -> List[str]:
+    text = Faker().paragraph(nb_sentences=n_sentences, variable_nb_sentences=False, ext_word_list=None)
     unformatted_lines = textwrap.wrap(text, width=rectangle.width, break_long_words=False)
     # each iteration of the line formatter function formats one more line and adds it to the back of the list
     formatted_lines_generator = iterate(star(line_formatter), (unformatted_lines, True))