Module fontai.config.preprocessing
Expand source code
from pathlib import Path
import logging
import typing as t
import inspect
import string
from argparse import Namespace
from pydantic import BaseModel, PositiveInt, PositiveFloat, validator
import strictyaml as yml
from fontai.config.core import BaseConfigHandler, BasePipelineTransformConfig
import fontai.io.records as records
logger = logging.getLogger(__name__)
class FontExtractionConfig(BaseModel):
    """
    Runtime parameters used when rendering font files into image arrays.

    Every numeric field is declared as `PositiveInt`, so pydantic rejects
    zero and negative values when the model is instantiated.

    Args:
        charset (str): characters that are extracted from each font file
        font_extraction_size (int): font size used when converting fonts to images
        canvas_size (int): height and width of the buffer array in which glyphs are drawn before further processing
        canvas_padding (int): padding applied inside the canvas array when extracting the fonts
    """

    # Characters rendered for every font.
    charset: str
    # Font size used when drawing glyphs.
    font_extraction_size: PositiveInt
    # Side length of the square drawing buffer.
    canvas_size: PositiveInt
    # Margin kept inside the drawing buffer.
    canvas_padding: PositiveInt
class Config(BasePipelineTransformConfig):
    """
    Configuration object for the image extraction pipeline stage.

    Args:
        output_record_class (records.TfrWritable): tfr-compatible output class from the module `fontai.io.records`; currently supported are `LabeledChar` and `LabeledFont`
        output_array_size (int): size of the final grayscale image corresponding to each font's characters
        max_output_file_size (float): maximum single-file size for output files in MB
        font_to_array_config (FontExtractionConfig): data object with runtime parameters for extracting image arrays from files
        beam_cmd_line_args (t.List[str]): list of command line arguments passed to the Beam pipeline
    """

    output_record_class: type
    output_array_size: PositiveInt
    max_output_file_size: PositiveFloat
    font_to_array_config: FontExtractionConfig
    beam_cmd_line_args: t.List[str]

    @validator("output_record_class")
    def validate_output_schema(schema_class):
        """Check that the output record class is one of the supported ones.

        Args:
            schema_class (type): candidate record class

        Returns:
            type: the same class, unchanged, when it is supported

        Raises:
            TypeError: If the record class is not in the supported set
        """
        supported = [records.LabeledChar, records.LabeledFont]
        # Guard clause: reject anything outside the supported set first.
        if schema_class not in supported:
            raise TypeError(f"supported output_record_classes are {[x.__name__ for x in supported]}")
        return schema_class
class ConfigHandler(BaseConfigHandler):
    """
    Wrapper for image processing stage's configuration handling logic
    """

    @classmethod
    def get_config_schema(cls):
        """
        YAML configuration schema:

        output_record_class: name of record class that will populate output files; it has to inherit from `fontai.io.records.TfrWritable` and at the moment only `LabeledChar` and `LabeledFont` are supported. If `LabeledChar`, individual character images are preprocessed and stored in no particular order and saved as PNGs. If `LabeledFont`, characters from a single font are stored together and saved as tensors.
        input_path: Input folder with zipped font files (optional)
        output_path: output folder where output Tensorflow records files are stored (optional)
        output_array_size: Size of square output images
        max_output_file_size: maximum single-file size for output files in MB (optional, defaults to 64.0)
        font_extraction_size: Font size to use when extracting characters to images
        canvas_size: initial image canvas size in which font chars are extracted
        canvas_padding: padding for the initial image canvas to allow for unusually large or convoluted fonts
        beam_cmd_line_args: list of Apache Beam's command line arguments (optional, defaults to `--runner DirectRunner`)

        Returns:
            strictyaml `Map` validator describing the keys listed above
        """
        # IO_CONFIG_SCHEMA is read from the class; presumably it is provided
        # by BaseConfigHandler -- confirm against fontai.config.core.
        return yml.Map({
            "output_record_class": yml.Str(),
            "output_array_size": yml.Int(),
            "font_extraction_size": yml.Int(),
            "canvas_size": yml.Int(),
            "canvas_padding": yml.Int(),
            yml.Optional("input_path", default = None): cls.IO_CONFIG_SCHEMA,
            yml.Optional("output_path", default = None): cls.IO_CONFIG_SCHEMA,
            yml.Optional("max_output_file_size", default = 64.0): yml.Float(),
            yml.Optional("beam_cmd_line_args", default = ["--runner", "DirectRunner"]): yml.Seq(yml.Str())
        })

    def instantiate_config(self, config: yml.YAML) -> Config:
        """
        Processes a YAML instance to produce a Config instance.

        Args:
            config: YAML object from the strictyaml library

        Returns:
            Config: configuration object built from the YAML values

        Raises:
            ValueError: If the canvas padding is at least half the canvas size
        """
        input_path = config.get("input_path").text
        output_path = config.get("output_path").text

        # The record class is referenced by name in the YAML file and looked
        # up as an attribute of the `fontai.io.records` module.
        output_record_class = getattr(records, config.get("output_record_class").text)

        beam_cmd_line_args = config.data["beam_cmd_line_args"]
        output_array_size = config.get("output_array_size").data
        max_output_file_size = config.get("max_output_file_size").data

        # The character set is fixed to ASCII letters and digits; it is not
        # read from the YAML file.
        extraction = FontExtractionConfig(
            charset = string.ascii_letters + string.digits,
            font_extraction_size = config.get("font_extraction_size").data,
            canvas_size = config.get("canvas_size").data,
            canvas_padding = config.get("canvas_padding").data)

        padding = extraction.canvas_padding
        canvas = extraction.canvas_size
        # Padding of half the canvas or more is rejected.
        if padding >= canvas/2:
            raise ValueError(f"canvas padding value ({padding}) is too large for canvas size ({canvas})")

        logger.info(f"Setting output schema as {output_record_class.__name__}")

        # input_path, output_path and yaml are not declared on Config itself;
        # presumably they are fields of BasePipelineTransformConfig -- confirm.
        return Config(
            output_record_class = output_record_class,
            input_path = input_path,
            output_path = output_path,
            output_array_size = output_array_size,
            max_output_file_size = max_output_file_size,
            font_to_array_config = extraction,
            beam_cmd_line_args = beam_cmd_line_args,
            yaml = config)
Classes
class Config (**data: Any)
-
Configuration class for the image extraction pipeline stage
Args
output_record_class
:records.TfrWritable
- tfr-compatible output classes from the module `fontai.io.records`; currently supported are `LabeledChar` and `LabeledFont`
output_array_size
:int
- size of the final grayscale image corresponding to each font's characters
font_to_array_config
:FontExtractionConfig
- Data object with runtime parameters for extracting image arrays from files
beam_cmd_line_args
:t.List[str]
- List of command line arguments passed to the Beam pipeline
Create a new model by parsing and validating input data from keyword arguments.
Raises ValidationError if the input data cannot be parsed to form a valid model.
Expand source code
class Config(BasePipelineTransformConfig):
    """
    Configuration object for the image extraction pipeline stage.

    Args:
        output_record_class (records.TfrWritable): tfr-compatible output class from the module `fontai.io.records`; currently supported are `LabeledChar` and `LabeledFont`
        output_array_size (int): size of the final grayscale image corresponding to each font's characters
        max_output_file_size (float): maximum single-file size for output files in MB
        font_to_array_config (FontExtractionConfig): data object with runtime parameters for extracting image arrays from files
        beam_cmd_line_args (t.List[str]): list of command line arguments passed to the Beam pipeline
    """

    output_record_class: type
    output_array_size: PositiveInt
    max_output_file_size: PositiveFloat
    font_to_array_config: FontExtractionConfig
    beam_cmd_line_args: t.List[str]

    @validator("output_record_class")
    def validate_output_schema(schema_class):
        """Check that the output record class is one of the supported ones.

        Args:
            schema_class (type): candidate record class

        Returns:
            type: the same class, unchanged, when it is supported

        Raises:
            TypeError: If the record class is not in the supported set
        """
        supported = [records.LabeledChar, records.LabeledFont]
        # Guard clause: reject anything outside the supported set first.
        if schema_class not in supported:
            raise TypeError(f"supported output_record_classes are {[x.__name__ for x in supported]}")
        return schema_class
Ancestors
- BasePipelineTransformConfig
- pydantic.main.BaseModel
- pydantic.utils.Representation
Class variables
var beam_cmd_line_args : List[str]
var font_to_array_config : FontExtractionConfig
var max_output_file_size : pydantic.types.PositiveFloat
var output_array_size : pydantic.types.PositiveInt
var output_record_class : type
Static methods
def validate_output_schema()
-
Validate input record class
Args
schema_class (type)
Returns
type
- Validated record class
Raises
TypeError
- If record class not in allowed set
Expand source code
@validator("output_record_class")
def validate_output_schema(schema_class):
    """Check that the output record class is one of the supported ones.

    Args:
        schema_class (type): candidate record class

    Returns:
        type: the same class, unchanged, when it is supported

    Raises:
        TypeError: If the record class is not in the supported set
    """
    supported = [records.LabeledChar, records.LabeledFont]
    # Guard clause: reject anything outside the supported set first.
    if schema_class not in supported:
        raise TypeError(f"supported output_record_classes are {[x.__name__ for x in supported]}")
    return schema_class
class ConfigHandler
-
Wrapper for image processing stage's configuration handling logic
Expand source code
class ConfigHandler(BaseConfigHandler):
    """
    Wrapper for image processing stage's configuration handling logic
    """

    @classmethod
    def get_config_schema(cls):
        """
        YAML configuration schema:

        output_record_class: name of record class that will populate output files; it has to inherit from `fontai.io.records.TfrWritable` and at the moment only `LabeledChar` and `LabeledFont` are supported. If `LabeledChar`, individual character images are preprocessed and stored in no particular order and saved as PNGs. If `LabeledFont`, characters from a single font are stored together and saved as tensors.
        input_path: Input folder with zipped font files (optional)
        output_path: output folder where output Tensorflow records files are stored (optional)
        output_array_size: Size of square output images
        max_output_file_size: maximum single-file size for output files in MB (optional, defaults to 64.0)
        font_extraction_size: Font size to use when extracting characters to images
        canvas_size: initial image canvas size in which font chars are extracted
        canvas_padding: padding for the initial image canvas to allow for unusually large or convoluted fonts
        beam_cmd_line_args: list of Apache Beam's command line arguments (optional, defaults to `--runner DirectRunner`)

        Returns:
            strictyaml `Map` validator describing the keys listed above
        """
        # IO_CONFIG_SCHEMA is read from the class; presumably it is provided
        # by BaseConfigHandler -- confirm against fontai.config.core.
        return yml.Map({
            "output_record_class": yml.Str(),
            "output_array_size": yml.Int(),
            "font_extraction_size": yml.Int(),
            "canvas_size": yml.Int(),
            "canvas_padding": yml.Int(),
            yml.Optional("input_path", default = None): cls.IO_CONFIG_SCHEMA,
            yml.Optional("output_path", default = None): cls.IO_CONFIG_SCHEMA,
            yml.Optional("max_output_file_size", default = 64.0): yml.Float(),
            yml.Optional("beam_cmd_line_args", default = ["--runner", "DirectRunner"]): yml.Seq(yml.Str())
        })

    def instantiate_config(self, config: yml.YAML) -> Config:
        """
        Processes a YAML instance to produce a Config instance.

        Args:
            config: YAML object from the strictyaml library

        Returns:
            Config: configuration object built from the YAML values

        Raises:
            ValueError: If the canvas padding is at least half the canvas size
        """
        input_path = config.get("input_path").text
        output_path = config.get("output_path").text

        # The record class is referenced by name in the YAML file and looked
        # up as an attribute of the `fontai.io.records` module.
        output_record_class = getattr(records, config.get("output_record_class").text)

        beam_cmd_line_args = config.data["beam_cmd_line_args"]
        output_array_size = config.get("output_array_size").data
        max_output_file_size = config.get("max_output_file_size").data

        # The character set is fixed to ASCII letters and digits; it is not
        # read from the YAML file.
        extraction = FontExtractionConfig(
            charset = string.ascii_letters + string.digits,
            font_extraction_size = config.get("font_extraction_size").data,
            canvas_size = config.get("canvas_size").data,
            canvas_padding = config.get("canvas_padding").data)

        padding = extraction.canvas_padding
        canvas = extraction.canvas_size
        # Padding of half the canvas or more is rejected.
        if padding >= canvas/2:
            raise ValueError(f"canvas padding value ({padding}) is too large for canvas size ({canvas})")

        logger.info(f"Setting output schema as {output_record_class.__name__}")

        # input_path, output_path and yaml are not declared on Config itself;
        # presumably they are fields of BasePipelineTransformConfig -- confirm.
        return Config(
            output_record_class = output_record_class,
            input_path = input_path,
            output_path = output_path,
            output_array_size = output_array_size,
            max_output_file_size = max_output_file_size,
            font_to_array_config = extraction,
            beam_cmd_line_args = beam_cmd_line_args,
            yaml = config)
Ancestors
- BaseConfigHandler
- abc.ABC
Static methods
def get_config_schema()
-
YAML configuration schema:
output_record_class: name of record class that will populate output files; it has to inherit from `fontai.io.records.TfrWritable` and at the moment only `LabeledChar` and `LabeledFont` are supported. If `LabeledChar`, individual character images are preprocessed and stored in no particular order and saved as PNGs. If `LabeledFont`, characters from a single font are stored together and saved as tensors.
input_path: Input folder with zipped font files
output_path: output folder where output Tensorflow records files are stored
output_array_size: Size of square output images
max_output_file_size: maximum single-file size for output files in MB
font_extraction_size: Font size to use when extracting characters to images
canvas_size: initial image canvas size in which font chars are extracted
canvas_padding: padding for the initial image canvas to allow for unusually large or convoluted fonts
beam_cmd_line_args: list of Apache Beam's command line arguments
Expand source code
@classmethod
def get_config_schema(cls):
    """
    YAML configuration schema:

    output_record_class: name of record class that will populate output files; it has to inherit from `fontai.io.records.TfrWritable` and at the moment only `LabeledChar` and `LabeledFont` are supported. If `LabeledChar`, individual character images are preprocessed and stored in no particular order and saved as PNGs. If `LabeledFont`, characters from a single font are stored together and saved as tensors.
    input_path: Input folder with zipped font files (optional)
    output_path: output folder where output Tensorflow records files are stored (optional)
    output_array_size: Size of square output images
    max_output_file_size: maximum single-file size for output files in MB (optional, defaults to 64.0)
    font_extraction_size: Font size to use when extracting characters to images
    canvas_size: initial image canvas size in which font chars are extracted
    canvas_padding: padding for the initial image canvas to allow for unusually large or convoluted fonts
    beam_cmd_line_args: list of Apache Beam's command line arguments (optional, defaults to `--runner DirectRunner`)

    Returns:
        strictyaml `Map` validator describing the keys listed above
    """
    # IO_CONFIG_SCHEMA is read from the class; presumably it is provided
    # by the base handler class -- confirm against fontai.config.core.
    return yml.Map({
        "output_record_class": yml.Str(),
        "output_array_size": yml.Int(),
        "font_extraction_size": yml.Int(),
        "canvas_size": yml.Int(),
        "canvas_padding": yml.Int(),
        yml.Optional("input_path", default = None): cls.IO_CONFIG_SCHEMA,
        yml.Optional("output_path", default = None): cls.IO_CONFIG_SCHEMA,
        yml.Optional("max_output_file_size", default = 64.0): yml.Float(),
        yml.Optional("beam_cmd_line_args", default = ["--runner", "DirectRunner"]): yml.Seq(yml.Str())
    })
Methods
def instantiate_config(self, config: strictyaml.representation.YAML) ‑> Config
-
Processes a YAML instance to produce a Config instance.
Args
config
- YAML object from the strictyaml library
Expand source code
def instantiate_config(self, config: yml.YAML) -> Config:
    """
    Processes a YAML instance to produce a Config instance.

    Args:
        config: YAML object from the strictyaml library

    Returns:
        Config: configuration object built from the YAML values

    Raises:
        ValueError: If the canvas padding is at least half the canvas size
    """
    input_path = config.get("input_path").text
    output_path = config.get("output_path").text

    # The record class is referenced by name in the YAML file and looked
    # up as an attribute of the `fontai.io.records` module.
    output_record_class = getattr(records, config.get("output_record_class").text)

    beam_cmd_line_args = config.data["beam_cmd_line_args"]
    output_array_size = config.get("output_array_size").data
    max_output_file_size = config.get("max_output_file_size").data

    # The character set is fixed to ASCII letters and digits; it is not
    # read from the YAML file.
    extraction = FontExtractionConfig(
        charset = string.ascii_letters + string.digits,
        font_extraction_size = config.get("font_extraction_size").data,
        canvas_size = config.get("canvas_size").data,
        canvas_padding = config.get("canvas_padding").data)

    padding = extraction.canvas_padding
    canvas = extraction.canvas_size
    # Padding of half the canvas or more is rejected.
    if padding >= canvas/2:
        raise ValueError(f"canvas padding value ({padding}) is too large for canvas size ({canvas})")

    logger.info(f"Setting output schema as {output_record_class.__name__}")

    # input_path, output_path and yaml are not declared on Config itself;
    # presumably they are fields of its base class -- confirm.
    return Config(
        output_record_class = output_record_class,
        input_path = input_path,
        output_path = output_path,
        output_array_size = output_array_size,
        max_output_file_size = max_output_file_size,
        font_to_array_config = extraction,
        beam_cmd_line_args = beam_cmd_line_args,
        yaml = config)
Inherited members
class FontExtractionConfig (**data: Any)
-
Data class that holds the runtime parameters to extract image arrays from files
Args
charset
:str
- string containing characters to be extracted from font files
font_extraction_size
:int
- Font size to use when converting fonts to images
canvas_size
:int
- Height and width of buffer array in which fonts will be extracted before being processed further
canvas_padding
:int
- Padding used in the canvas array when extracting the fonts
Create a new model by parsing and validating input data from keyword arguments.
Raises ValidationError if the input data cannot be parsed to form a valid model.
Expand source code
class FontExtractionConfig(BaseModel):
    """
    Runtime parameters used when rendering font files into image arrays.

    Every numeric field is declared as `PositiveInt`, so pydantic rejects
    zero and negative values when the model is instantiated.

    Args:
        charset (str): characters that are extracted from each font file
        font_extraction_size (int): font size used when converting fonts to images
        canvas_size (int): height and width of the buffer array in which glyphs are drawn before further processing
        canvas_padding (int): padding applied inside the canvas array when extracting the fonts
    """

    # Characters rendered for every font.
    charset: str
    # Font size used when drawing glyphs.
    font_extraction_size: PositiveInt
    # Side length of the square drawing buffer.
    canvas_size: PositiveInt
    # Margin kept inside the drawing buffer.
    canvas_padding: PositiveInt
Ancestors
- pydantic.main.BaseModel
- pydantic.utils.Representation
Class variables
var canvas_padding : pydantic.types.PositiveInt
var canvas_size : pydantic.types.PositiveInt
var charset : str
var font_extraction_size : pydantic.types.PositiveInt