Module fontai.io.records

This module contains classes that can be serialised/deserialised to/from Tensorflow record files; they are used by the prediction stage for both training and scoring

Expand source code
"""This module contains classes that can be serialised/deserialised to/from Tensorflow record files; they are used by the prediction stage for both training and scoring

"""
from __future__ import annotations
import zipfile
import typing as t
from typing import TypeVar, SupportsAbs, Generic
import io
import logging
from abc import ABC, abstractmethod

from collections import OrderedDict

from pydantic import BaseModel

from numpy import ndarray, uint8, all as np_all
import imageio

from tensorflow import string as tf_str, Tensor, executing_eagerly, convert_to_tensor
from tensorflow.train import (Example as TFExample, Feature as TFFeature, Features as TFFeatures, BytesList as TFBytesList, FloatList as TFFloatList)
from tensorflow.io import FixedLenFeature, parse_single_example, serialize_tensor

import tensorflow as tf
from tensorflow.data import TFRecordDataset



logger = logging.getLogger(__name__)

class TfrWritable(ABC):

  """Abstract base class providing Tensorflow record encoding and decoding logic for
  downstream data formats used by the package. Subclasses define a feature schema
  (`_tfr_schema`) plus the parsing logic used for training and scoring.
  """

  # mapping from feature name to tf.io.FixedLenFeature; defines the TF record schema
  _tfr_schema: t.Dict

  _nonbatched_scoring: bool #if True, batch size is ignored at scoring time for this record type.

  @classmethod
  def tensor_to_numpy(cls, x: Tensor) -> ndarray:
    """Converts Tensor to numpy array
    
    Args:
        x (Tensor): Input tensor
    
    Returns:
        ndarray: numpy array
    """
    # eager tensors expose .numpy(); graph-mode tensors must be evaluated instead
    if executing_eagerly():
      return x.numpy()
    else:
      return x.eval()

  @classmethod
  def array_to_bytes(cls, x: t.Union[Tensor, ndarray], dtype: type) -> bytes:
    """Converts an array, either from numpy or Tensorflow, to a stream of bytes to be serialized
    
    Args:
        x (t.Union[Tensor, ndarray]): Input array
        dtype: type of returned tensor
    
    Returns:
        bytes: serialized array
    """
    serialised_tensor = serialize_tensor(convert_to_tensor(x, dtype=dtype))
    byte_content = cls.tensor_to_numpy(serialised_tensor)
    return byte_content

  @classmethod
  def bytes_feature(cls, value: bytes) -> TFFeature:
    """Maps a bytestream to a TF Feature instance
    
    Args:
        value (bytes): bytes to encode
    
    Returns:
        TFFeature: encoded value
    """
    return TFFeature(bytes_list=TFBytesList(value=[value]))


  @abstractmethod
  def to_bytes_dict(self) -> t.Dict:
    """Maps an object inheriting from this class to a TF record compatible format
    
    Returns:
        t.Dict: dictionary with encoded features that will be stored into a TF record.
    """
    # NOTE: instance method by design — it serialises instance contents and every
    # subclass overrides it as such; the previous @classmethod decorator contradicted
    # the `self` signature, and the TFFeature return annotation contradicted the
    # docstring and all overrides (both fixed).
    pass

  def to_tf_example(self):
    """Returns a Tensorflow example instance encoding the instance's contents
    
    """

    return TFExample(
      features = TFFeatures(feature = self.to_bytes_dict()))

  @classmethod
  def from_tf_example(cls, example: Tensor) -> t.Dict:
    """Deserialises a single serialised TF example using the class schema
    
    Args:
        example (Tensor): serialised TF example
    
    Returns:
        t.Dict: dictionary of raw parsed features keyed as in `_tfr_schema`
    """
    return parse_single_example(example,cls._tfr_schema)

  @classmethod
  def img_to_png_bytes(cls, img):
    """Encodes an image array as PNG bytes
    
    Args:
        img (ndarray): image array; cast to uint8 before encoding
    
    Returns:
        bytes: PNG-encoded image
    """
    bf = io.BytesIO()
    imageio.imwrite(bf,img.astype(uint8),"png")
    val = bf.getvalue()
    bf.close()
    return val

  def add_score(self, score: Tensor, charset_tensor: Tensor) -> TfrWritable:
    """Adds a model's score and return the appropriate record instance
    
    Args:
        score (Tensor): Model score

        charset_tensor (Tensor): charset used by the scoring model
    
    Returns:

        TfrWritable: scored record instance
    
    Raises:
        NotImplementedError: if the subclass does not support scoring
    """
    # was `return NotImplementError(...)`: a typo (NameError at call time) that also
    # returned instead of raising — raise the proper built-in exception instead
    raise NotImplementedError("Adding a score is not implemented for this schema.")

  @classmethod
  @abstractmethod
  def parse_bytes_dict(cls, record):
    """Performs basic parsing of deserialised features and returns dict with the same keys as the tfr schema's ordered dict
    
    Args:
        record (tf.train.TFExample): Input record
    
    Returns:
        t.Dict: Output dictionary
    """
    pass


  @classmethod
  @abstractmethod
  def get_training_parser(
    cls, 
    charset_tensor: Tensor) -> t.Callable:
    """Returns a function that maps partially parsed objects as outputted by parse_bytes_dict to a (features, label) tuple for training consumption
    
    Args:
        charset_tensor (Tensor): tensor of valid characters
    
    Returns:
        t.Callable: Parser function
    """
    pass

  @classmethod
  def from_parsed_bytes_dict(cls, kwargs: t.Dict):
    """Instantiate from a parsed bytes dict extracted from a Tensorflow record file
    
    Args:
        kwargs (t.Dict): Parsed dictionary
    
    Returns:
        TfrWritable
    """
    return cls(**{key: kwargs[key].numpy() for key in kwargs})

  @classmethod
  def from_scored_batch(
    cls,
    features: ndarray,
    label: ndarray,
    fontname: t.Union[str, ndarray],
    scores: ndarray,
    charset_tensor: ndarray) -> t.Generator[TfrWritable, None, None]:
    """Maps a batch of scored features and associated objects to a generator of TfrWritable instances. This method is necessary because labeled chars and labeled fonts differ in shape, and logic for mapping scored batches to records is different for each of them.
    
    Args:
        features (ndarray): batch features; they must be preprocessed for scoring, which usually means they are in unit scale and are of type float32.
        label (ndarray): batch labels
        fontname (t.Union[str, ndarray]): batch fontnames
        scores (ndarray): batch scores
        charset_tensor (ndarray): tensor with a single char element per charset element
    
    Raises:
        NotImplementedError: always, on the base class
    """
    # was `return NotImplementError(...)` — same typo/return-vs-raise defect as add_score
    raise NotImplementedError("This method is only implemented for subclasses")

  @classmethod
  def filter_charset_for_scoring(cls, dataset: TFRecordDataset, charset_tensor: ndarray):
    """This function is needed because filtering by character requires different logic for individual char images and for entire fonts.
    
    Args:
        dataset (TFRecordDataset): input dataset
        charset_tensor (ndarray): tensor with a single char element per charset element
    
    Raises:
        NotImplementedError: always, on the base class
    """
    # was `return NotImplementError(...)` — same typo/return-vs-raise defect as add_score;
    # first parameter renamed self -> cls to match the @classmethod decorator
    raise NotImplementedError("This method is only implemented for subclasses")






class ModelWithAnyType(BaseModel):
  """Pydantic base model whose fields may use arbitrary (non-pydantic) types, e.g. numpy arrays."""

  class Config:
    # pydantic configuration: accept field types pydantic cannot validate natively
    arbitrary_types_allowed = True


class LabeledChar(TfrWritable, ModelWithAnyType):
  """Wrapper that holds a single labeled ML example (one character image) with associated metadata."""

  features: ndarray  # character image array
  label: str  # character the image represents
  fontname: str  # name of the source font

  # all features are stored as serialised byte strings in the TF record
  _tfr_schema = OrderedDict([
    ('features', FixedLenFeature([], tf_str)),
    ('label', FixedLenFeature([], tf_str)),
    ('fontname', FixedLenFeature([], tf_str))])

  _nonbatched_scoring = False


  def __iter__(self):
    # iterate the instance as a (features, label, fontname) triple
    return iter((self.features,self.label,self.fontname))

  def __eq__(self,other):
    return isinstance(other, LabeledChar) and np_all(self.features == other.features) and self.label == other.label and self.fontname == other.fontname

  def to_bytes_dict(self) -> t.Dict:
    """Encodes the instance's fields as TF Features; the image is stored PNG-compressed."""
    return {
    "label": self.bytes_feature(bytes(str.encode(self.label))),
    "fontname": self.bytes_feature(bytes(str.encode(self.fontname))),
    "features": self.bytes_feature(self.img_to_png_bytes(self.features))
    }

  def add_score(self, score: ndarray, charset_tensor: ndarray) -> TfrWritable:
    """Wraps the instance together with a model score into a ScoredLabeledChar."""
    return ScoredLabeledChar(example = self, score = score, charset_tensor = charset_tensor)


  @classmethod
  def parse_bytes_dict(cls, record):
    """Decodes the PNG feature bytes and rescales pixel values to [0, 1]."""
    img = tf.image.decode_png(record["features"])
    img = tf.cast(img,dtype=tf.float32)/255.0 #rescaled image data

    record["features"] = img
    return record


  @classmethod
  def get_training_parser(
    cls, 
    charset_tensor: Tensor) -> t.Callable:
    """Returns a parser mapping partially parsed records to (features, one-hot label) tuples.

    Labels absent from `charset_tensor` produce an empty label tensor for downstream deletion.
    """
    def parser(kwargs):

      num_classes = len(charset_tensor)

      one_hot_label = tf.cast(tf.where(charset_tensor == kwargs["label"]),dtype=tf.int32)
      if tf.equal(tf.size(one_hot_label),0):
        label = tf.cast(one_hot_label, dtype=tf.float32) #if label not in current charset, pass empty label for downstream deletion
      else:
        label = tf.reshape(tf.one_hot(indices=one_hot_label,depth=num_classes),(num_classes,))

      return kwargs["features"], label

    return parser

  @classmethod
  def from_scored_batch(
    cls,
    features: ndarray,
    labels: ndarray,
    fontnames: ndarray,
    scores: ndarray,
    charset_tensor: ndarray) -> t.Generator[LabeledChar, None, None]:
    """Yields one scored record per batch element; features are rescaled back to uint8."""
    try:
      batch_size, height, width, channels = features.shape
    except ValueError as e:
      # chain the original unpacking error so the true cause stays visible
      raise ValueError("Features should have 4 dimensions, including batch and channels") from e

    for k in range(batch_size):
      yield cls(
        features = (255 * features[k].reshape((height, width))).astype(uint8),
        label = labels[k],
        fontname = fontnames[k]
        ).add_score(
        score = scores[k],
        charset_tensor = charset_tensor)

  @classmethod
  def filter_charset_for_scoring(cls, dataset: TFRecordDataset, charset_tensor: ndarray):
    """Drops records whose label is not present in `charset_tensor`."""
    def filter_func(kwargs):
      idx = tf.where(charset_tensor == kwargs["label"])
      return tf.math.logical_not(tf.equal(tf.size(idx), 0))

    return dataset.filter(filter_func)



class LabeledFont(TfrWritable, ModelWithAnyType):
  """Wrapper that holds an entire font's character set as a single record."""

  features: ndarray  # stacked character images for the whole font
  label: ndarray  # one label character per image
  fontname: str  # name of the source font

  # all features are stored as serialised byte strings in the TF record
  _tfr_schema = OrderedDict([
    ('features', FixedLenFeature([], tf_str)),
    ('label', FixedLenFeature([], tf_str)),
    ('fontname', FixedLenFeature([], tf_str))])

  _nonbatched_scoring = True


  def __iter__(self):
    # iterate as one LabeledChar per character in the font
    n = len(self.label)
    return (LabeledChar(
      features = self.features[k], 
      label = self.label[k], 
      fontname=self.fontname) for k in range(n))

  def __eq__(self,other):
    return isinstance(other, LabeledFont) and np_all(self.features == other.features) and np_all(self.label == other.label) and self.fontname == other.fontname


  def to_bytes_dict(self) -> t.Dict:
    """Encodes the instance's fields as TF Features; images are serialised as a uint8 tensor."""
    feature_shape = self.features.shape

    # add channel dimension to feature
    return {
    "features": self.bytes_feature(self.array_to_bytes(self.features.reshape(feature_shape + (1,)), dtype=tf.uint8)),
    "label": self.bytes_feature(self.array_to_bytes(self.label, dtype=tf.string)),
    "fontname": self.bytes_feature(bytes(str.encode(self.fontname))),
    }

  def add_score(self, score: ndarray, charset_tensor: ndarray) -> TfrWritable:
    """Wraps the instance together with a model score into a ScoredLabeledFont."""
    return ScoredLabeledFont(example = self, score = score, charset_tensor = charset_tensor)

  @classmethod
  def parse_bytes_dict(cls, record):
    """Deserialises the image tensor and label tensor; rescales pixel values to [0, 1]."""
    imgs = tf.io.parse_tensor(record["features"], out_type=tf.uint8)
    imgs = tf.cast(imgs,dtype=tf.float32)/255.0 #rescaled image data
    label = tf.io.parse_tensor(record["label"], out_type=tf.string)

    record["features"] = imgs
    record["label"] = label
    return record

  @classmethod
  def get_training_parser(
    cls, 
    charset_tensor: Tensor) -> t.Callable:
    """Returns a parser mapping partially parsed records to (features, one-hot labels) tuples.

    Characters absent from `charset_tensor` are filtered out; if none remain, an empty
    label tensor is returned for downstream deletion.
    """
    def parser(kwargs: t.Dict):

      #if label is empty, pass empty for downstream deletion
      if tf.equal(tf.size(kwargs["label"]), 0):
        return kwargs["features"], tf.zeros((0,),dtype=tf.float32)

      num_classes = len(charset_tensor)

      raw_one_hot = tf.cast(
        tf.reshape(kwargs["label"], (-1,1)) == charset_tensor,
        dtype=tf.int32
      ) #one hot encoding with up to 62 columns

      index = tf.reduce_sum(raw_one_hot, axis=-1) > 0 # detect rows where all columns are zero (labels not in current charset)

      if tf.equal(tf.reduce_sum(tf.cast(index, dtype=tf.int32)), 0):
        features = kwargs["features"]
        label = tf.zeros((0,),dtype=tf.float32) #if no labels are in current charset, pass empty label for downstream deletion
      else:
        one_hot_label = tf.argmax(raw_one_hot[index]) # filter chars not in charset
        label = tf.reshape(tf.one_hot(indices=one_hot_label,depth=num_classes),(num_classes,-1)) #create restricted one hot encoding
        features = kwargs["features"][index]

      return features, label

    return parser

  @classmethod
  def from_scored_batch(
    cls,
    features: ndarray,
    labels: ndarray,
    fontnames: ndarray,
    scores: ndarray,
    charset_tensor: ndarray) -> t.Generator[LabeledChar, None, None]:
    """Yields a single scored record for the whole font; features are rescaled back to uint8."""
    try:
      font_size, height, width, channels = features.shape
    except ValueError as e:
      # chain the original unpacking error so the true cause stays visible
      raise ValueError("Features should have 4 dimensions, including batch and channels; make sure that batch size parameter in RecordProcessor.fetch is null for font records") from e

    # NOTE(review): `fontname` is annotated str but receives the batch `fontnames`
    # object here — presumably a scalar/0-d value for font records; confirm with callers
    yield cls(
      features = (255 * features.reshape((font_size, height, width))).astype(uint8),
      label = labels,
      fontname = fontnames
      ).add_score(
      score = scores,
      charset_tensor = charset_tensor)


  @classmethod
  def filter_charset_for_scoring(cls, dataset: TFRecordDataset, charset_tensor: ndarray):
    """Restricts each font record's images and labels to characters present in `charset_tensor`."""
    def filter_func(kwargs):
      reshaped_labels = tf.reshape(kwargs["label"], (-1,1))
      in_charset = tf.reduce_sum(tf.cast(reshaped_labels == charset_tensor, tf.int32), axis=-1)
      index = in_charset > 0
      kwargs["features"] = kwargs["features"][index]
      kwargs["label"] = kwargs["label"][index]

      return kwargs

    return dataset.map(filter_func)

class ScoredRecordFactory(object):

  """Creates classes for scored TfrWritable records
  """
  
  @classmethod
  def create(cls, T: type):
    """Create a scored record's class
    
    Args:
        T (type): Subclass of TfrWritable
    
    Returns:
        TfrWritable: scored record class
    
    Raises:
        TypeError
    """
    if not issubclass(T, TfrWritable):
      raise TypeError("T must be a subclass of TfrWritable")
    else:
      class ScoredRecord(TfrWritable):
        """Record pairing a wrapped `T` instance with a model score and the scoring charset."""

        record_type = T

        # extend the wrapped record's schema with score and charset features
        _tfr_schema = {
          **record_type._tfr_schema, 
          **{'charset_tensor': FixedLenFeature([], tf_str),'score': FixedLenFeature([], tf_str)}
        }

        _nonbatched_scoring = T._nonbatched_scoring

        def __init__(self, example: TfrWritable, score: ndarray, charset_tensor: ndarray):
          if not isinstance(example, T):
            raise TypeError(f"example must be an instance of {T}")
          elif not isinstance(score, ndarray):
            raise TypeError("score must be an instance of ndarray")
          elif not isinstance(charset_tensor, ndarray):
            raise TypeError(f"charset_tensor must be an instance of {ndarray}; found {charset_tensor.__class__}")

          self.example = example
          self.score = score
          self.charset_tensor = charset_tensor


        def __eq__(self,other):
          # isinstance guard added for consistency with LabeledChar/LabeledFont __eq__,
          # which return False for foreign types instead of raising AttributeError
          return isinstance(other, ScoredRecord) and self.example == other.example and np_all(self.score == other.score) and np_all(self.charset_tensor == other.charset_tensor)

        def to_bytes_dict(self) -> t.Dict:
          """Encodes score and charset alongside the wrapped example's features."""
          return {
          "charset_tensor": self.record_type.bytes_feature(self.record_type.array_to_bytes(self.charset_tensor, dtype=tf.string)), 
          "score": self.record_type.bytes_feature(self.record_type.array_to_bytes(self.score, dtype=tf.float32)),
          **self.example.to_bytes_dict()
          }

        @classmethod
        def parse_bytes_dict(cls, record):
          """Parses the wrapped record's features, then deserialises score and charset tensors."""
          parsed_record_bytes_dict = cls.record_type.parse_bytes_dict(record)

          score = tf.io.parse_tensor(record["score"], out_type=tf.float32)
          parsed_record_bytes_dict["score"] = score

          charset_tensor = tf.io.parse_tensor(record["charset_tensor"], out_type=tf.string)
          parsed_record_bytes_dict["charset_tensor"] = charset_tensor

          return parsed_record_bytes_dict 

        @classmethod
        def from_parsed_bytes_dict(cls, kwargs: t.Dict):
          """Rebuilds a scored record: score/charset go to the wrapper, the rest to `record_type`."""
          kwargs = {key: kwargs[key].numpy() for key in kwargs}
          score = kwargs.pop("score")
          charset_tensor = kwargs.pop("charset_tensor")

          return cls(example = cls.record_type(**kwargs), score = score, charset_tensor = charset_tensor)

        @classmethod
        def get_training_parser(
          cls, 
          charset_tensor: Tensor) -> t.Callable:
          """Delegates to the wrapped record type's training parser."""
          return cls.record_type.get_training_parser(charset_tensor=charset_tensor)

        @classmethod
        def from_scored_batch(
          cls,
          features: ndarray,
          labels: ndarray,
          fontnames: ndarray,
          scores: ndarray,
          charset_tensor: ndarray) -> t.Generator[LabeledChar, None, None]:
          """Delegates to the wrapped record type's batch-to-records generator."""
          return cls.record_type.from_scored_batch(
            features,
            labels,
            fontnames,
            scores,
            charset_tensor)

        @classmethod
        def filter_charset_for_scoring(cls, dataset: TFRecordDataset, charset_tensor: ndarray):
          """Delegates to the wrapped record type's charset filter."""
          return cls.record_type.filter_charset_for_scoring(dataset, charset_tensor)
        
      return ScoredRecord


# concrete scored record types, created once at import time
ScoredLabeledChar = ScoredRecordFactory.create(LabeledChar)  # scored counterpart of LabeledChar

ScoredLabeledFont = ScoredRecordFactory.create(LabeledFont)  # scored counterpart of LabeledFont

Classes

class LabeledChar (**data: Any)

Class that provides Tensorflow record's encoding and decoding logic for downstream data formats used by the package

Create a new model by parsing and validating input data from keyword arguments.

Raises ValidationError if the input data cannot be parsed to form a valid model.

Expand source code
class LabeledChar(TfrWritable, ModelWithAnyType):
  # wrapper that holds a labeled ML example, with associated metadata
  features: ndarray
  label: str
  fontname: str


  _tfr_schema = OrderedDict([
    ('features', FixedLenFeature([], tf_str)),
    ('label', FixedLenFeature([], tf_str)),
    ('fontname', FixedLenFeature([], tf_str))])

  _nonbatched_scoring = False

  # def __init__(self, **data):

  #   filtered_data = data.pop("_tfr_schema")
  #   super().__init__(**filtered_data)


  def __iter__(self):
    return iter((self.features,self.label,self.fontname))

  def __eq__(self,other):
    return isinstance(other, LabeledChar) and np_all(self.features == other.features) and self.label == other.label and self.fontname == other.fontname

  def to_bytes_dict(self) -> t.Dict:
    return {
    "label": self.bytes_feature(bytes(str.encode(self.label))),
    "fontname": self.bytes_feature(bytes(str.encode(self.fontname))),
    "features": self.bytes_feature(self.img_to_png_bytes(self.features))
    }

  def add_score(self, score: ndarray, charset_tensor: ndarray) -> TfrWritable:

    return ScoredLabeledChar(example = self, score = score, charset_tensor = charset_tensor)


  @classmethod
  def parse_bytes_dict(cls, record):

    img = tf.image.decode_png(record["features"])
    img = tf.cast(img,dtype=tf.float32)/255.0 #rescaled image data

    record["features"] = img
    return record


  @classmethod
  def get_training_parser(
    cls, 
    charset_tensor: Tensor) -> t.Callable:

    def parser(kwargs):

      num_classes = len(charset_tensor)

      one_hot_label = tf.cast(tf.where(charset_tensor == kwargs["label"]),dtype=tf.int32)
      if tf.equal(tf.size(one_hot_label),0):
        label = tf.cast(one_hot_label, dtype=tf.float32) #if label not in current charset, pass empty label for downstream deletion
      else:
        label = tf.reshape(tf.one_hot(indices=one_hot_label,depth=num_classes),(num_classes,))
      
      #return kwargs["features"], label
      #kwargs["label"] = label
      #return kwargs
      return kwargs["features"], label

    return parser

  @classmethod
  def from_scored_batch(
    cls,
    features: ndarray,
    labels: ndarray,
    fontnames: ndarray,
    scores: ndarray,
    charset_tensor: ndarray) -> t.Generator[LabeledChar, None, None]:

    try:
      batch_size, height, width, channels = features.shape
    except ValueError as e:
      raise ValueError("Features should have 4 dimensions, including batch and channels")

    for k in range(batch_size):
      yield cls(
        features = (255 * features[k].reshape((height, width))).astype(uint8),
        label = labels[k],
        fontname = fontnames[k]
        ).add_score(
        score = scores[k],
        charset_tensor = charset_tensor)

  @classmethod
  def filter_charset_for_scoring(self, dataset: TFRecordDataset, charset_tensor: ndarray):

    def filter_func(kwargs):
      idx = tf.where(charset_tensor == kwargs["label"])
      return tf.math.logical_not(tf.equal(tf.size(idx), 0))

    return dataset.filter(filter_func)

Ancestors

Class variables

var features : numpy.ndarray
var fontname : str
var label : str

Inherited members

class LabeledFont (**data: Any)

Class that provides Tensorflow record's encoding and decoding logic for downstream data formats used by the package

Create a new model by parsing and validating input data from keyword arguments.

Raises ValidationError if the input data cannot be parsed to form a valid model.

Expand source code
class LabeledFont(TfrWritable, ModelWithAnyType):
  # wrapper that holds an entire font's character set
  features: ndarray
  label: ndarray
  fontname: str

  _tfr_schema = OrderedDict([
    ('features', FixedLenFeature([], tf_str)),
    ('label', FixedLenFeature([], tf_str)),
    ('fontname', FixedLenFeature([], tf_str))])

  _nonbatched_scoring = True
  # def __init__(self, **data):

  #   filtered_data = data.pop("_tfr_schema")
  #   super().__init__(**filtered_data)


  def __iter__(self):
    n = len(self.label)
    return (LabeledChar(
      features = self.features[k], 
      label = self.label[k], 
      fontname=self.fontname) for k in range(n))

  def __eq__(self,other):
    return isinstance(other, LabeledFont) and np_all(self.features == other.features) and np_all(self.label == other.label) and self.fontname == other.fontname


  def to_bytes_dict(self) -> t.Dict:

    feature_shape = self.features.shape

    # add channel dimension to feature
    return {
    "features": self.bytes_feature(self.array_to_bytes(self.features.reshape(feature_shape + (1,)), dtype=tf.uint8)),
    "label": self.bytes_feature(self.array_to_bytes(self.label, dtype=tf.string)),
    "fontname": self.bytes_feature(bytes(str.encode(self.fontname))),
    }

  def add_score(self, score: ndarray, charset_tensor: ndarray) -> TfrWritable:

    return ScoredLabeledFont(example = self, score = score, charset_tensor = charset_tensor)

  @classmethod
  def parse_bytes_dict(cls, record):
    imgs = tf.io.parse_tensor(record["features"], out_type=tf.uint8)
    imgs = tf.cast(imgs,dtype=tf.float32)/255.0 #rescaled image data
    label = tf.io.parse_tensor(record["label"], out_type=tf.string)

    record["features"] = imgs
    record["label"] = label
    return record

  @classmethod
  def get_training_parser(
    cls, 
    charset_tensor: Tensor) -> t.Callable:

    def parser(kwargs: t.Dict):

      #if label is empty, pass empty for downstream deletion
      if tf.equal(tf.size(kwargs["label"]), 0):
        return kwargs["features"], tf.zeros((0,),dtype=tf.float32)

      num_classes = len(charset_tensor)

      raw_one_hot = tf.cast(
        tf.reshape(kwargs["label"], (-1,1)) == charset_tensor,
        dtype=tf.int32
      ) #one hot encoding with up to 62 columns

      index = tf.reduce_sum(raw_one_hot, axis=-1) > 0 # detect rows where all columns are zero (labels not in current charset)

      if tf.equal(tf.reduce_sum(tf.cast(index, dtype=tf.int32)), 0):
        features = kwargs["features"]
        label = tf.zeros((0,),dtype=tf.float32) #if no labels are in current charset, pass empty label for downstream deletion
      else:
        one_hot_label = tf.argmax(raw_one_hot[index]) # filter chars not in charset
        label = tf.reshape(tf.one_hot(indices=one_hot_label,depth=num_classes),(num_classes,-1)) #create restricted one hot encoding
        features = kwargs["features"][index]


      return features, label
      # kwargs["label"] = label
      # kwargs["features"] = features
      # return kwargs


    return parser

  @classmethod
  def from_scored_batch(
    cls,
    features: ndarray,
    labels: ndarray,
    fontnames: ndarray,
    scores: ndarray,
    charset_tensor: ndarray) -> t.Generator[LabeledChar, None, None]:

    try:
      font_size, height, width, channels = features.shape
    except ValueError as e:
      raise ValueError("Features should have 4 dimensions, including batch and channels; make sure that batch size parameter in RecordProcessor.fetch is null for font records)")

    yield cls(
      features = (255 * features.reshape((font_size, height, width))).astype(uint8),
      label = labels,
      fontname = fontnames
      ).add_score(
      score = scores,
      charset_tensor = charset_tensor)


  @classmethod
  def filter_charset_for_scoring(self, dataset: TFRecordDataset, charset_tensor: ndarray):

    def filter_func(kwargs):
      reshaped_labels = tf.reshape(kwargs["label"], (-1,1))
      in_charset = tf.reduce_sum(tf.cast(reshaped_labels == charset_tensor, tf.int32), axis=-1)
      index = in_charset > 0
      kwargs["features"] = kwargs["features"][index]
      kwargs["label"] = kwargs["label"][index]

      return kwargs

    return dataset.map(filter_func)

Ancestors

Class variables

var features : numpy.ndarray
var fontname : str
var label : numpy.ndarray

Inherited members

class ModelWithAnyType (**data: Any)

Create a new model by parsing and validating input data from keyword arguments.

Raises ValidationError if the input data cannot be parsed to form a valid model.

Expand source code
class ModelWithAnyType(BaseModel):

    # internal BaseModel configuration class
  class Config:
    arbitrary_types_allowed = True

Ancestors

  • pydantic.main.BaseModel
  • pydantic.utils.Representation

Subclasses

Class variables

var Config
class ScoredRecordFactory

Creates classes for scored TfrWritable records

Expand source code
class ScoredRecordFactory(object):

  """Creates classes for scored TfrWritable records
  """
  
  @classmethod
  def create(cls, T: type):
    """Create a scored record's class
    
    Args:
        T (type): Subclass of TfrWritable
    
    Returns:
        TfrWritable: scored record class
    
    Raises:
        TypeError
    """
    if not issubclass(T, TfrWritable):
      raise TypeError("T must be a subclass of TfrWritable")
    else:
      class ScoredRecord(TfrWritable):
        #

        record_type = T

        _tfr_schema = {
          **record_type._tfr_schema, 
          **{'charset_tensor': FixedLenFeature([], tf_str),'score': FixedLenFeature([], tf_str)}
        }

        _nonbatched_scoring = T._nonbatched_scoring

        def __init__(self, example: TfrWritable, score: ndarray, charset_tensor: ndarray):
          if not isinstance(example, T):
            raise TypeError(f"example must be an instance of {T}")
          elif not isinstance(score, ndarray):
            raise TypeError(f"score must be an instance of ndarray")
          elif not isinstance(charset_tensor, ndarray):
            raise TypeError(f"charset_tensor must be an instance of {ndarray}; found {charset_tensor.__class__}")
          # elif len(charset_tensor) != len(score):
          #   raise ValueError("charset_tensor must be the same length as score")

          self.example = example
          self.score = score
          self.charset_tensor = charset_tensor


        def __eq__(self,other):
          return self.example == other.example and np_all(self.score == other.score) and np_all(self.charset_tensor == other.charset_tensor)
        #
        def to_bytes_dict(self) -> t.Dict:
          #
          return {
          "charset_tensor": self.record_type.bytes_feature(self.record_type.array_to_bytes(self.charset_tensor, dtype=tf.string)), 
          "score": self.record_type.bytes_feature(self.record_type.array_to_bytes(self.score, dtype=tf.float32)),
          **self.example.to_bytes_dict()
          }
        #
        @classmethod
        def parse_bytes_dict(cls, record):
          parsed_record_bytes_dict = cls.record_type.parse_bytes_dict(record)

          score = tf.io.parse_tensor(record["score"], out_type=tf.float32)
          parsed_record_bytes_dict["score"] = score

          charset_tensor = tf.io.parse_tensor(record["charset_tensor"], out_type=tf.string)
          parsed_record_bytes_dict["charset_tensor"] = charset_tensor

          return parsed_record_bytes_dict 

        @classmethod
        def from_parsed_bytes_dict(cls, kwargs: t.Dict):

          kwargs = {key: kwargs[key].numpy() for key in kwargs}
          score = kwargs.pop("score")
          charset_tensor = kwargs.pop("charset_tensor")

          return cls(example = cls.record_type(**kwargs), score = score, charset_tensor = charset_tensor)

        @classmethod
        def get_training_parser(
          cls, 
          charset_tensor: Tensor) -> t.Callable:

          return cls.record_type.get_training_parser(charset_tensor=charset_tensor)

        @classmethod
        def from_scored_batch(
          cls,
          features: ndarray,
          labels: ndarray,
          fontnames: ndarray,
          scores: ndarray,
          charset_tensor: ndarray) -> t.Generator[LabeledChar, None, None]:

          return cls.record_type.from_scored_batch(
            features,
            labels,
            fontnames,
            scores,
            charset_tensor)

        @classmethod
        def filter_charset_for_scoring(cls, dataset: TFRecordDataset, charset_tensor: ndarray):

          return cls.record_type.filter_charset_for_scoring(dataset, charset_tensor)
        
      return ScoredRecord

Static methods

def create(T: type)

Create a scored record's class

Args

T : type
Subclass of TfrWritable

Returns

TfrWritable
scored record class

Raises

TypeError

Expand source code
@classmethod
def create(cls, T: type):
  """Create a scored record's class
  
  Args:
      T (type): Subclass of TfrWritable
  
  Returns:
      TfrWritable: scored record class
  
  Raises:
      TypeError: if T is not a subclass of TfrWritable
  """
  if not issubclass(T, TfrWritable):
    raise TypeError("T must be a subclass of TfrWritable")
  else:
    class ScoredRecord(TfrWritable):
      """Record pairing an instance of T with a model score and the scoring charset."""

      record_type = T

      # wrapped type's schema, extended with serialized charset and score tensors
      _tfr_schema = {
        **record_type._tfr_schema, 
        **{'charset_tensor': FixedLenFeature([], tf_str),'score': FixedLenFeature([], tf_str)}
      }

      _nonbatched_scoring = T._nonbatched_scoring

      def __init__(self, example: TfrWritable, score: ndarray, charset_tensor: ndarray):
        """Validate argument types and store the wrapped example, its score and the charset."""
        if not isinstance(example, T):
          raise TypeError(f"example must be an instance of {T}")
        elif not isinstance(score, ndarray):
          raise TypeError(f"score must be an instance of ndarray")
        elif not isinstance(charset_tensor, ndarray):
          raise TypeError(f"charset_tensor must be an instance of {ndarray}; found {charset_tensor.__class__}")
        # elif len(charset_tensor) != len(score):
        #   raise ValueError("charset_tensor must be the same length as score")

        self.example = example
        self.score = score
        self.charset_tensor = charset_tensor


      def __eq__(self,other):
        """Equal when wrapped examples match and score/charset arrays are elementwise equal."""
        return self.example == other.example and np_all(self.score == other.score) and np_all(self.charset_tensor == other.charset_tensor)
      #
      def to_bytes_dict(self) -> t.Dict:
        """Encode charset, score and the wrapped example's features for TF record storage."""
        return {
        "charset_tensor": self.record_type.bytes_feature(self.record_type.array_to_bytes(self.charset_tensor, dtype=tf.string)), 
        "score": self.record_type.bytes_feature(self.record_type.array_to_bytes(self.score, dtype=tf.float32)),
        **self.example.to_bytes_dict()
        }
      #
      @classmethod
      def parse_bytes_dict(cls, record):
        """Parse the wrapped type's fields, then deserialise the score and charset tensors."""
        parsed_record_bytes_dict = cls.record_type.parse_bytes_dict(record)

        score = tf.io.parse_tensor(record["score"], out_type=tf.float32)
        parsed_record_bytes_dict["score"] = score

        charset_tensor = tf.io.parse_tensor(record["charset_tensor"], out_type=tf.string)
        parsed_record_bytes_dict["charset_tensor"] = charset_tensor

        return parsed_record_bytes_dict 

      @classmethod
      def from_parsed_bytes_dict(cls, kwargs: t.Dict):
        """Instantiate from a parsed bytes dict, splitting score and charset from the wrapped type's fields."""
        kwargs = {key: kwargs[key].numpy() for key in kwargs}
        score = kwargs.pop("score")
        charset_tensor = kwargs.pop("charset_tensor")

        return cls(example = cls.record_type(**kwargs), score = score, charset_tensor = charset_tensor)

      @classmethod
      def get_training_parser(
        cls, 
        charset_tensor: Tensor) -> t.Callable:
        """Delegate to the wrapped record type's training parser."""
        return cls.record_type.get_training_parser(charset_tensor=charset_tensor)

      @classmethod
      def from_scored_batch(
        cls,
        features: ndarray,
        labels: ndarray,
        fontnames: ndarray,
        scores: ndarray,
        charset_tensor: ndarray) -> t.Generator[LabeledChar, None, None]:
        """Delegate scored-batch conversion to the wrapped record type."""
        return cls.record_type.from_scored_batch(
          features,
          labels,
          fontnames,
          scores,
          charset_tensor)

      @classmethod
      def filter_charset_for_scoring(cls, dataset: TFRecordDataset, charset_tensor: ndarray):
        """Delegate charset filtering to the wrapped record type."""
        return cls.record_type.filter_charset_for_scoring(dataset, charset_tensor)
      
    return ScoredRecord
class ScoredLabeledChar (example: TfrWritable, score: ndarray, charset_tensor: ndarray)

Class that provides Tensorflow record's encoding and decoding logic for downstream data formats used by the package

Expand source code
class ScoredRecord(TfrWritable):
  """Record pairing a wrapped record_type example with a model score and the scoring charset.

  NOTE(review): ``T`` is a free variable here — this class is defined inside a
  factory that closes over the wrapped record type.
  """

  record_type = T

  # wrapped type's schema, extended with serialized charset and score tensors
  _tfr_schema = {
    **record_type._tfr_schema, 
    **{'charset_tensor': FixedLenFeature([], tf_str),'score': FixedLenFeature([], tf_str)}
  }

  _nonbatched_scoring = T._nonbatched_scoring

  def __init__(self, example: TfrWritable, score: ndarray, charset_tensor: ndarray):
    """Validate argument types and store the wrapped example, its score and the charset."""
    if not isinstance(example, T):
      raise TypeError(f"example must be an instance of {T}")
    elif not isinstance(score, ndarray):
      raise TypeError(f"score must be an instance of ndarray")
    elif not isinstance(charset_tensor, ndarray):
      raise TypeError(f"charset_tensor must be an instance of {ndarray}; found {charset_tensor.__class__}")
    # elif len(charset_tensor) != len(score):
    #   raise ValueError("charset_tensor must be the same length as score")

    self.example = example
    self.score = score
    self.charset_tensor = charset_tensor


  def __eq__(self,other):
    """Equal when wrapped examples match and score/charset arrays are elementwise equal."""
    return self.example == other.example and np_all(self.score == other.score) and np_all(self.charset_tensor == other.charset_tensor)
  #
  def to_bytes_dict(self) -> t.Dict:
    """Encode charset, score and the wrapped example's features for TF record storage."""
    return {
    "charset_tensor": self.record_type.bytes_feature(self.record_type.array_to_bytes(self.charset_tensor, dtype=tf.string)), 
    "score": self.record_type.bytes_feature(self.record_type.array_to_bytes(self.score, dtype=tf.float32)),
    **self.example.to_bytes_dict()
    }
  #
  @classmethod
  def parse_bytes_dict(cls, record):
    """Parse the wrapped type's fields, then deserialise the score and charset tensors."""
    parsed_record_bytes_dict = cls.record_type.parse_bytes_dict(record)

    score = tf.io.parse_tensor(record["score"], out_type=tf.float32)
    parsed_record_bytes_dict["score"] = score

    charset_tensor = tf.io.parse_tensor(record["charset_tensor"], out_type=tf.string)
    parsed_record_bytes_dict["charset_tensor"] = charset_tensor

    return parsed_record_bytes_dict 

  @classmethod
  def from_parsed_bytes_dict(cls, kwargs: t.Dict):
    """Instantiate from a parsed bytes dict, splitting score and charset from the wrapped type's fields."""
    kwargs = {key: kwargs[key].numpy() for key in kwargs}
    score = kwargs.pop("score")
    charset_tensor = kwargs.pop("charset_tensor")

    return cls(example = cls.record_type(**kwargs), score = score, charset_tensor = charset_tensor)

  @classmethod
  def get_training_parser(
    cls, 
    charset_tensor: Tensor) -> t.Callable:
    """Delegate to the wrapped record type's training parser."""
    return cls.record_type.get_training_parser(charset_tensor=charset_tensor)

  @classmethod
  def from_scored_batch(
    cls,
    features: ndarray,
    labels: ndarray,
    fontnames: ndarray,
    scores: ndarray,
    charset_tensor: ndarray) -> t.Generator[LabeledChar, None, None]:
    """Delegate scored-batch conversion to the wrapped record type."""
    return cls.record_type.from_scored_batch(
      features,
      labels,
      fontnames,
      scores,
      charset_tensor)

  @classmethod
  def filter_charset_for_scoring(cls, dataset: TFRecordDataset, charset_tensor: ndarray):
    """Delegate charset filtering to the wrapped record type."""
    return cls.record_type.filter_charset_for_scoring(dataset, charset_tensor)

Ancestors

Class variables

var record_type

Class that provides Tensorflow record's encoding and decoding logic for downstream data formats used by the package

Static methods

def filter_charset_for_scoring(dataset: TFRecordDataset, charset_tensor: ndarray)

This function is needed because filtering by character requires different logic for individual char images and for entire fonts.

Args

dataset : TFRecordDataset
input dataset
charset_tensor : ndarray
tensor with a single char element per charset element
Expand source code
@classmethod
def filter_charset_for_scoring(cls, dataset: TFRecordDataset, charset_tensor: ndarray):
  """Delegate charset filtering to the wrapped record type, whose logic differs between single-char and whole-font records."""
  wrapped = cls.record_type
  return wrapped.filter_charset_for_scoring(dataset, charset_tensor)
def from_parsed_bytes_dict(kwargs: t.Dict)

Instantiate from a parsed bytes dict extracted from a Tensorflow record file

Args

kwargs : t.Dict
Parsed dictionary

Returns

TfrWritable

Expand source code
@classmethod
def from_parsed_bytes_dict(cls, kwargs: t.Dict):
  """Build a scored record from a parsed bytes dict: pop score and charset, hand the rest to the wrapped record type."""
  materialised = {name: tensor.numpy() for name, tensor in kwargs.items()}
  score = materialised.pop("score")
  charset_tensor = materialised.pop("charset_tensor")
  wrapped_example = cls.record_type(**materialised)
  return cls(example = wrapped_example, score = score, charset_tensor = charset_tensor)
def from_scored_batch(features: ndarray, labels: ndarray, fontnames: ndarray, scores: ndarray, charset_tensor: ndarray) ‑> Generator[LabeledChar, None, None]

Maps a batch of scored features and associated objects to a generator of TfrWritable instances. This method is necessary because labeled chars and labeled fonts differ in shape, and logic for mapping scored batches to records is different for each of them.

Args

features : ndarray
batch features; they must be preprocessed for scoring, which usually means they are in unit scale and are of type float32.
labels : ndarray
batch labels
fontnames : ndarray
batch fontnames
scores : ndarray
batch scores
charset_tensor : ndarray
tensor with a single char element per charset element

Returns

t.Generator[TfrWritable, None, None]
Generator of formatted records
Expand source code
@classmethod
def from_scored_batch(
  cls,
  features: ndarray,
  labels: ndarray,
  fontnames: ndarray,
  scores: ndarray,
  charset_tensor: ndarray) -> t.Generator[LabeledChar, None, None]:
  """Delegate scored-batch-to-records conversion to the wrapped record type."""
  batch = (features, labels, fontnames, scores, charset_tensor)
  return cls.record_type.from_scored_batch(*batch)
def get_training_parser(charset_tensor: Tensor) ‑> Callable

Returns a function that maps partially parsed objects as outputted by parse_bytes_dict to a (features, label) tuple for training consumption

Args

charset_tensor : Tensor
tensor of valid characters

Returns

t.Callable
Parser function
Expand source code
@classmethod
def get_training_parser(
  cls, 
  charset_tensor: Tensor) -> t.Callable:
  """Return the wrapped record type's training parser for the given charset."""
  parser_factory = cls.record_type.get_training_parser
  return parser_factory(charset_tensor=charset_tensor)
def parse_bytes_dict(record)

Performs basic parsing of deserialised features and returns dict with the same keys as the tfr schema's ordered dict

Args

record : tf.train.TFExample
Input record

Returns

t.Dict
Output dictionary
Expand source code
@classmethod
def parse_bytes_dict(cls, record):
  """Parse the wrapped record type's fields, then deserialise the score and charset tensors from their byte strings."""
  parsed = cls.record_type.parse_bytes_dict(record)
  # score first, then charset_tensor, preserving the original insertion order
  for name, out_dtype in (("score", tf.float32), ("charset_tensor", tf.string)):
    parsed[name] = tf.io.parse_tensor(record[name], out_type=out_dtype)
  return parsed

Methods

def to_bytes_dict(self) ‑> Dict[~KT, ~VT]
Expand source code
def to_bytes_dict(self) -> t.Dict:
  """Encode charset and score on top of the wrapped example's serialized features."""
  encode = self.record_type.bytes_feature
  serialise = self.record_type.array_to_bytes
  encoded = {
    "charset_tensor": encode(serialise(self.charset_tensor, dtype=tf.string)),
    "score": encode(serialise(self.score, dtype=tf.float32)),
  }
  encoded.update(self.example.to_bytes_dict())
  return encoded
class ScoredLabeledFont (example: TfrWritable, score: ndarray, charset_tensor: ndarray)

Class that provides Tensorflow record's encoding and decoding logic for downstream data formats used by the package

Expand source code
class ScoredRecord(TfrWritable):
  """Record pairing a wrapped record_type example with a model score and the scoring charset.

  NOTE(review): ``T`` is a free variable here — this class is defined inside a
  factory that closes over the wrapped record type.
  """

  record_type = T

  # wrapped type's schema, extended with serialized charset and score tensors
  _tfr_schema = {
    **record_type._tfr_schema, 
    **{'charset_tensor': FixedLenFeature([], tf_str),'score': FixedLenFeature([], tf_str)}
  }

  _nonbatched_scoring = T._nonbatched_scoring

  def __init__(self, example: TfrWritable, score: ndarray, charset_tensor: ndarray):
    """Validate argument types and store the wrapped example, its score and the charset."""
    if not isinstance(example, T):
      raise TypeError(f"example must be an instance of {T}")
    elif not isinstance(score, ndarray):
      raise TypeError(f"score must be an instance of ndarray")
    elif not isinstance(charset_tensor, ndarray):
      raise TypeError(f"charset_tensor must be an instance of {ndarray}; found {charset_tensor.__class__}")
    # elif len(charset_tensor) != len(score):
    #   raise ValueError("charset_tensor must be the same length as score")

    self.example = example
    self.score = score
    self.charset_tensor = charset_tensor


  def __eq__(self,other):
    """Equal when wrapped examples match and score/charset arrays are elementwise equal."""
    return self.example == other.example and np_all(self.score == other.score) and np_all(self.charset_tensor == other.charset_tensor)
  #
  def to_bytes_dict(self) -> t.Dict:
    """Encode charset, score and the wrapped example's features for TF record storage."""
    return {
    "charset_tensor": self.record_type.bytes_feature(self.record_type.array_to_bytes(self.charset_tensor, dtype=tf.string)), 
    "score": self.record_type.bytes_feature(self.record_type.array_to_bytes(self.score, dtype=tf.float32)),
    **self.example.to_bytes_dict()
    }
  #
  @classmethod
  def parse_bytes_dict(cls, record):
    """Parse the wrapped type's fields, then deserialise the score and charset tensors."""
    parsed_record_bytes_dict = cls.record_type.parse_bytes_dict(record)

    score = tf.io.parse_tensor(record["score"], out_type=tf.float32)
    parsed_record_bytes_dict["score"] = score

    charset_tensor = tf.io.parse_tensor(record["charset_tensor"], out_type=tf.string)
    parsed_record_bytes_dict["charset_tensor"] = charset_tensor

    return parsed_record_bytes_dict 

  @classmethod
  def from_parsed_bytes_dict(cls, kwargs: t.Dict):
    """Instantiate from a parsed bytes dict, splitting score and charset from the wrapped type's fields."""
    kwargs = {key: kwargs[key].numpy() for key in kwargs}
    score = kwargs.pop("score")
    charset_tensor = kwargs.pop("charset_tensor")

    return cls(example = cls.record_type(**kwargs), score = score, charset_tensor = charset_tensor)

  @classmethod
  def get_training_parser(
    cls, 
    charset_tensor: Tensor) -> t.Callable:
    """Delegate to the wrapped record type's training parser."""
    return cls.record_type.get_training_parser(charset_tensor=charset_tensor)

  @classmethod
  def from_scored_batch(
    cls,
    features: ndarray,
    labels: ndarray,
    fontnames: ndarray,
    scores: ndarray,
    charset_tensor: ndarray) -> t.Generator[LabeledChar, None, None]:
    """Delegate scored-batch conversion to the wrapped record type."""
    return cls.record_type.from_scored_batch(
      features,
      labels,
      fontnames,
      scores,
      charset_tensor)

  @classmethod
  def filter_charset_for_scoring(cls, dataset: TFRecordDataset, charset_tensor: ndarray):
    """Delegate charset filtering to the wrapped record type."""
    return cls.record_type.filter_charset_for_scoring(dataset, charset_tensor)

Ancestors

Class variables

var record_type

Class that provides Tensorflow record's encoding and decoding logic for downstream data formats used by the package

Inherited members

class TfrWritable

Class that provides Tensorflow record's encoding and decoding logic for downstream data formats used by the package

Expand source code
class TfrWritable(ABC):

  """Class that provides Tensorflow record's encoding and decoding logic for downstream data formats used by the package
  """
  
  # serialization schema mapping feature names to TF feature specs; set by subclasses
  _tfr_schema: t.Dict

  _nonbatched_scoring: bool  # if True, batch size is ignored at scoring time for this record type.

  @classmethod
  def tensor_to_numpy(cls, x: Tensor) -> ndarray:
    """Converts Tensor to numpy array
    
    Args:
        x (Tensor): Input tensor
    
    Returns:
        ndarray: numpy array
    """
    # .numpy() is only available in eager mode; use .eval() under graph mode
    if executing_eagerly():
      return x.numpy()
    else:
      return x.eval()

  @classmethod
  def array_to_bytes(cls, x: t.Union[Tensor, ndarray], dtype: type) -> bytes:
    """Converts an array, either from numpy or Tensorflow, to a stream of bytes to be serialized
    
    Args:
        x (t.Union[Tensor, ndarray]): Input array
        dtype: type of returned tensor
    
    Returns:
        bytes: serialized array
    """

    serialised_tensor = serialize_tensor(convert_to_tensor(x, dtype=dtype))

    byte_content = cls.tensor_to_numpy(serialised_tensor)

    return byte_content

  @classmethod
  def bytes_feature(cls, value: bytes) -> TFFeature:
    """Maps a bytestream to a TF Feature instance
    
    Args:
        value (bytes): bytes to encode
    
    Returns:
        TFFeature: encoded value
    """
    return TFFeature(bytes_list=TFBytesList(value=[value]))


  # NOTE(review): declared as a classmethod with a `self` first parameter, but
  # concrete subclasses override it as an instance method and to_tf_example
  # calls it through `self`; decorators kept as-is for compatibility.
  @classmethod
  @abstractmethod
  def to_bytes_dict(self) -> TFFeature:
    """Maps an object inheriting from this class to a TF record compatible format
    
    Returns:
        t.Dict: dictionary with encoded features that will be stored into a TF record.
    """
    pass

  def to_tf_example(self):
    """Returns a Tensorflow example instance encoding the instance's contents
    
    """

    return TFExample(
      features = TFFeatures(feature = self.to_bytes_dict()))

  @classmethod
  def from_tf_example(cls, example: Tensor) -> t.Dict:
    """Creates an instance by deserialising a TF record using the class schema
    
    Args:
        example (TFExample): example TF example
    
    Returns:
        TfrWritable: deserialised TfrWritable instance
    """
    return parse_single_example(example,cls._tfr_schema)

  @classmethod
  def img_to_png_bytes(cls, img):
    """Encodes an image array as PNG and returns the raw bytes
    
    Args:
        img: image array; cast to uint8 before encoding
    
    Returns:
        bytes: PNG-encoded image
    """
    bf = io.BytesIO()
    imageio.imwrite(bf,img.astype(uint8),"png")
    val = bf.getvalue()
    bf.close()
    return val

  def add_score(self, score: Tensor, charset_tensor: Tensor) -> TfrWritable:
    """Adds a model's score and return the appropriate record instance
    
    Args:
        score (Tensor): Model score

        charset_tensor (Tensor): charset used by the scoring model
    
    Returns:

        TfrWritable: scored record instance
    
    Raises:
        NotImplementedError: unless overridden by a subclass.
    """
    # bug fix: original did `return NotImplementError(...)` which raised a
    # NameError (misspelled class) instead of signalling "not implemented"
    raise NotImplementedError("Adding a score is not implemented for this schema.")

  @classmethod
  @abstractmethod
  def parse_bytes_dict(cls, record):
    """Performs basic parsing of deserialised features and returns dict with the same keys as the tfr schema's ordered dict
    
    Args:
        record (tf.train.TFExample): Input record
    
    Returns:
        t.Dict: Output dictionary
    """
    pass


  @classmethod
  @abstractmethod
  def get_training_parser(
    cls, 
    charset_tensor: Tensor) -> t.Callable:
    """Returns a function that maps partially parsed objects as outputted by parse_bytes_dict to a (features, label) tuple for training consumption
    
    Args:
        charset_tensor (Tensor): tensor of valid characters
    
    Returns:
        t.Callable: Parser function
    """
    pass

  @classmethod
  def from_parsed_bytes_dict(cls, kwargs: t.Dict):
    """Instantiate from a parsed bytes dict extracted from a Tensorflow record file
    
    Args:
        kwargs (t.Dict): Parsed dictionary
    
    Returns:
        TfrWritable
    """
    return cls(**{key: kwargs[key].numpy() for key in kwargs})

  @classmethod
  def from_scored_batch(
    cls,
    features: ndarray,
    label: ndarray,
    fontname: t.Union[str, ndarray],
    scores: ndarray,
    charset_tensor: ndarray) -> t.Generator[TfrWritable, None, None]:
    """Maps a batch of scored features and associated objects to a generator of TfrWritable instances. This method is necessary because labeled chars and labeled fonts differ in shape, and logic for mapping scored batches to records is different for each of them.
    
    Args:
        features (ndarray): batch features; they must be preprocessed for scoring, which usually means they are in unit scale and are of type float32.
        label (ndarray): batch labels
        fontname (t.Union[str, ndarray]): batch fontnames
        scores (ndarray): batch scores
        charset_tensor (ndarray): tensor with a single char element per charset element
    
    Returns:
        t.Generator[TfrWritable, None, None]: Generator of formatted records
    
    Raises:
        NotImplementedError: unless overridden by a subclass.
    """
    # bug fix: originally returned a misspelled `NotImplementError(...)` call
    raise NotImplementedError("This method is only implemented for subclasses")

  @classmethod
  def filter_charset_for_scoring(cls, dataset: TFRecordDataset, charset_tensor: ndarray):
    """This function is needed because filtering by character requires different logic for individual char images and for entire fonts.
    
    Args:
        dataset (TFRecordDataset): input dataset
        charset_tensor (ndarray): tensor with a single char element per charset element
    
    Raises:
        NotImplementedError: unless overridden by a subclass.
    """
    # bug fix: originally returned a misspelled `NotImplementError(...)` call;
    # first parameter renamed `self` -> `cls` to match the classmethod decorator
    raise NotImplementedError("This method is only implemented for subclasses")

Ancestors

  • abc.ABC

Subclasses

Static methods

def array_to_bytes(x: t.Union[Tensor, ndarray], dtype: type) ‑> bytes

Converts an array, either from numpy or Tensorflow, to a stream of bytes to be serialized

Args

x : t.Union[Tensor, ndarray]
Input array
dtype
type of returned tensor

Returns

bytes
serialized array
Expand source code
@classmethod
def array_to_bytes(cls, x: t.Union[Tensor, ndarray], dtype: type) -> bytes:
  """Serialise a numpy or Tensorflow array into a byte string.
  
  Args:
      x (t.Union[Tensor, ndarray]): Input array
      dtype: type of returned tensor
  
  Returns:
      bytes: serialized array
  """
  as_tensor = convert_to_tensor(x, dtype=dtype)
  return cls.tensor_to_numpy(serialize_tensor(as_tensor))
def bytes_feature(value: bytes) ‑> tensorflow.core.example.feature_pb2.Feature

Maps a bytestream to a TF Feature instance

Args

value : bytes
bytes to encode

Returns

TFFeature
encoded value
Expand source code
@classmethod
def bytes_feature(cls, value: bytes) -> TFFeature:
  """Wrap a byte string in a TF Feature instance.
  
  Args:
      value (bytes): bytes to encode
  
  Returns:
      TFFeature: encoded value
  """
  wrapped = TFBytesList(value=[value])
  return TFFeature(bytes_list=wrapped)
def filter_charset_for_scoring(dataset: TFRecordDataset, charset_tensor: ndarray)

This function is needed because filtering by character requires different logic for individual char images and for entire fonts.

Args

dataset : TFRecordDataset
input dataset
charset_tensor : ndarray
tensor with a single char element per charset element
Expand source code
@classmethod
def filter_charset_for_scoring(cls, dataset: TFRecordDataset, charset_tensor: ndarray):
  """This function is needed because filtering by character requires different logic for individual char images and for entire fonts.
  
  Args:
      dataset (TFRecordDataset): input dataset
      charset_tensor (ndarray): tensor with a single char element per charset element
  
  Raises:
      NotImplementedError: unless overridden by a subclass.
  """
  # bug fix: original did `return NotImplementError(...)` which raised NameError
  # (misspelled class, and the exception was returned rather than raised);
  # also renamed `self` -> `cls` to match the classmethod decorator
  raise NotImplementedError("This method is only implemented for subclasses")
def from_parsed_bytes_dict(kwargs: t.Dict)

Instantiate from a parsed bytes dict extracted from a Tensorflow record file

Args

kwargs : t.Dict
Parsed dictionary

Returns

TfrWritable

Expand source code
@classmethod
def from_parsed_bytes_dict(cls, kwargs: t.Dict):
  """Instantiate from a parsed bytes dict extracted from a Tensorflow record file
  
  Args:
      kwargs (t.Dict): Parsed dictionary
  
  Returns:
      TfrWritable
  """
  materialised = {name: tensor.numpy() for name, tensor in kwargs.items()}
  return cls(**materialised)
def from_scored_batch(features: ndarray, label: ndarray, fontname: t.Union[str, ndarray], scores: ndarray, charset_tensor: ndarray) ‑> Generator[TfrWritable, None, None]

Maps a batch of scored features and associated objects to a generator of TfrWritable instances. This method is necessary because labeled chars and labeled fonts differ in shape, and logic for mapping scored batches to records is different for each of them.

Args

features : ndarray
batch features; they must be preprocessed for scoring, which usually means they are in unit scale and are of type float32.
label : ndarray
batch labels
fontname : t.Union[str, ndarray]
batch fontnames
scores : ndarray
batch scores
charset_tensor : ndarray
tensor with a single char element per charset element

Returns

t.Generator[TfrWritable, None, None]
Generator of formatted records
Expand source code
@classmethod
def from_scored_batch(
  cls,
  features: ndarray,
  label: ndarray,
  fontname: t.Union[str, ndarray],
  scores: ndarray,
  charset_tensor: ndarray) -> t.Generator[TfrWritable, None, None]:
  """Maps a batch of scored features and associated objects to a generator of TfrWritable instances. This method is necessary because labeled chars and labeled fonts differ in shape, and logic for mapping scored batches to records is different for each of them.
  
  Args:
      features (ndarray): batch features; they must be preprocessed for scoring, which usually means they are in unit scale and are of type float32.
      label (ndarray): batch labels
      fontname (t.Union[str, ndarray]): batch fontnames
      scores (ndarray): batch scores
      charset_tensor (ndarray): tensor with a single char element per charset element
  
  Returns:
      t.Generator[TfrWritable, None, None]: Generator of formatted records
  
  Raises:
      NotImplementedError: unless overridden by a subclass.
  """
  # bug fix: original did `return NotImplementError(...)` which raised NameError
  # (misspelled class, and the exception was returned rather than raised)
  raise NotImplementedError("This method is only implemented for subclasses")
def from_tf_example(example: Tensor) ‑> Dict[~KT, ~VT]

Creates an instance by deserialising a TF record using the class schema

Args

example : TFExample
example TF example

Returns

TfrWritable
deserialised TfrWritable instance
Expand source code
@classmethod
def from_tf_example(cls, example: Tensor) -> t.Dict:
  """Deserialise a single TF example using this class's record schema.
  
  Args:
      example (TFExample): example TF example
  
  Returns:
      TfrWritable: deserialised TfrWritable instance
  """
  schema = cls._tfr_schema
  return parse_single_example(example, schema)
def get_training_parser(charset_tensor: Tensor) ‑> Callable

Returns a function that maps partially parsed objects as outputted by parse_bytes_dict to a (features, label) tuple for training consumption

Args

charset_tensor : Tensor
tensor of valid characters

Returns

t.Callable
Parser function
Expand source code
@classmethod
@abstractmethod
def get_training_parser(
  cls, 
  charset_tensor: Tensor) -> t.Callable:
  """Returns a function that maps partially parsed objects as outputted by parse_bytes_dict to a (features, label) tuple for training consumption
  
  Args:
      charset_tensor (Tensor): tensor of valid characters
  
  Returns:
      t.Callable: Parser function
  """
  pass
def img_to_png_bytes(img)
Expand source code
@classmethod
def img_to_png_bytes(cls, img):
  """Encode an image array as PNG (cast to uint8) and return the raw bytes."""
  with io.BytesIO() as buffer:
    imageio.imwrite(buffer, img.astype(uint8), "png")
    return buffer.getvalue()
def parse_bytes_dict(record)

Performs basic parsing of deserialised features and returns dict with the same keys as the tfr schema's ordered dict

Args

record : tf.train.TFExample
Input record

Returns

t.Dict
Output dictionary
Expand source code
# NOTE(review): classmethod whose first parameter is named `self`; it is bound
# to the class, not an instance.
@classmethod
@abstractmethod
def parse_bytes_dict(self, record):
  """Performs basic parsing of deserialised features and returns dict with the same keys as the tfr schema's ordered dict
  
  Args:
      record (tf.train.TFExample): Input record
  
  Returns:
      t.Dict: Output dictionary
  """
  pass
def tensor_to_numpy(x: Tensor) ‑> numpy.ndarray

Converts Tensor to numpy array

Args

x : Tensor
Input tensor

Returns

ndarray
numpy array
Expand source code
@classmethod
def tensor_to_numpy(cls, x: Tensor) -> ndarray:
  """Convert a Tensor to a numpy array.
  
  Args:
      x (Tensor): Input tensor
  
  Returns:
      ndarray: numpy array
  """
  # .numpy() only exists in eager mode; graph mode requires .eval()
  return x.numpy() if executing_eagerly() else x.eval()
def to_bytes_dict() ‑> tensorflow.core.example.feature_pb2.Feature

Maps an object inheriting from this class to a TF record compatible format

Returns

t.Dict
dictionary with encoded features that will be stored into a TF record.
Expand source code
# NOTE(review): decorated as a classmethod but takes `self`; subclasses in this
# module override it as an instance method (to_tf_example calls self.to_bytes_dict()).
@classmethod
@abstractmethod
def to_bytes_dict(self) -> TFFeature:
  """Maps an object inheriting from this class to a TF record compatible format
  
  Returns:
      t.Dict: dictionary with encoded features that will be stored into a TF record.
  """
  pass

Methods

def add_score(self, score: Tensor, charset_tensor: Tensor) ‑> TfrWritable

Adds a model's score and return the appropriate record instance

Args

score : Tensor
Model score
charset : Tensor
charset used by the scoring model

Returns

TfrWritable
scored record instance
Expand source code
def add_score(self, score: Tensor, charset_tensor: Tensor) -> TfrWritable:
  """Adds a model's score and return the appropriate record instance
  
  Args:
      score (Tensor): Model score

      charset_tensor (Tensor): charset used by the scoring model
  
  Returns:

      TfrWritable: scored record instance
  
  Raises:
      NotImplementedError: unless overridden by a subclass.
  """
  # bug fix: original did `return NotImplementError(...)` which raised NameError
  # (misspelled class, and the exception was returned rather than raised)
  raise NotImplementedError("Adding a score is not implemented for this schema.")
def to_tf_example(self)

Returns a Tensorflow example instance encoding the instance's contents

Expand source code
def to_tf_example(self):
  """Wrap this record's encoded feature dict in a Tensorflow Example instance."""
  feature_dict = self.to_bytes_dict()
  return TFExample(features=TFFeatures(feature=feature_dict))