Tensorflow dataset pipeline

Useful resources for learning about and creating a TensorFlow Dataset (tf.data.Dataset). See also the code snippets below.

Building a data pipeline: https://cs230.stanford.edu/blog/datapipeline/
tf.data API, Build TensorFlow input pipelines: https://www.tensorflow.org/guide/data
tf.data API, Consuming sets of files: https://www.tensorflow.org/guide/data#consuming_sets_of_files
Better performance with the tf.data API: https://www.tensorflow.org/guide/data_performance
Keras Sequence Generator: https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
Loading and Preprocessing Data with TensorFlow: https://canvas.education.lu.se/courses/3766/pages/chapter-13-loading-and-preprocessing-data-with-tensorflow?module_item_id=109789
tf.data.Dataset generators with parallelization: the easy way: https://medium.com/@acordier/tf-data-dataset-generators-with-parallelization-the-easy-way-b5c5f7d2a18

Feed numpy files (.npz) which contain features X and a label y:

import tensorflow as tf
import numpy as np

def read_npy_file(item):
    """Load the feature array from a single ``.npz`` archive.

    Parameters:
        item: path of the archive, either as ``bytes`` (the form in which a
            TensorFlow py-function wrapper hands over a string tensor) or as
            ``str``.

    Returns:
        A one-element tuple holding the feature array cast to ``float32``.

    Note:
        Assumes the features were saved under the key ``"x"``, as in the
        other snippets of this file -- adjust the key if your archives
        use a different name.
    """
    path = item.decode() if isinstance(item, bytes) else item
    # np.load on an .npz returns a dict-like NpzFile. Unpacking it directly
    # (``x, y = np.load(...)``) yields the key *names*, not the arrays, so
    # the array has to be looked up by key. The context manager closes the
    # underlying zip file once the array has been read.
    with np.load(path) as archive:
        features = archive["x"]
    return (features.astype(np.float32),)

file_list = ['/foo/bar.npz', '/foo/baz.npz']

# One dataset element per file path (each element is a scalar string tensor).
dataset = tf.data.Dataset.from_tensor_slices(file_list)

# Load every file through the Python function. tf.py_func is not available
# in the TF 2.x top-level API; tf.numpy_function is its replacement and
# passes the path to read_npy_file as bytes. Tout lists one dtype per value
# returned by read_npy_file.
dataset = dataset.map(
    lambda item: tuple(
        tf.numpy_function(func=read_npy_file, inp=[item], Tout=[tf.float32])
    )
)



# Read numpy files (.npz), extract labels and return a new tf.data.Dataset
def get_dataset(file_names_list, num_classes=2):
    """Create a TensorFlow Dataset that loads ``.npz`` files on demand.

    Each archive is expected to hold the features under the key ``"x"`` and
    the class index under the key ``"y"``.

    Parameters:
        file_names_list: list of file paths
        num_classes: int, depth of the one-hot encoded label

    Returns:
        tf.data.Dataset whose elements are produced by loading one file
        each: (features as float32, one-hot encoded label as float32)
    """
    # Load one numpy archive and build the one-hot label
    def map_func(file_path):
        # tf.numpy_function hands string tensors over as bytes
        if isinstance(file_path, bytes):
            file_path = file_path.decode()
        # Close the archive as soon as both arrays have been read
        with np.load(file_path) as np_data:
            x_data = np_data["x"]
            y_label = np_data["y"]
        return x_data.astype(np.float32), tf.one_hot(indices=y_label, depth=num_classes)

    # Wrap the Python loader so it can be used inside Dataset.map
    def numpy_func(item):
        return tf.numpy_function(map_func, [item], [tf.float32, tf.float32])

    # Create a new tensorflow dataset from the paths passed in by the caller
    # (not from a module-level list)
    dataset = tf.data.Dataset.from_tensor_slices(file_names_list)

    # Use map to load the numpy files in parallel
    dataset = dataset.map(numpy_func, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset

The following code snippet has been taken from https://medium.com/@acordier/tf-data-dataset-generators-with-parallelization-the-easy-way-b5c5f7d2a18

import numpy as np
import tensorflow as tf
from custom_utils import create_keras_model


def create_tf_dataset(file_name_list, x_shape, y_shape, batch_size=8, shuffle_dataset=True, seed=None):
    """Creates a new TensorFlow Dataset for the given list of file names

    Every file is loaded lazily with ``np.load`` and must provide the
    features under the key ``"x"`` and an integer class index under ``"y"``.

    Parameter
    --------------
    file_name_list : list, np.array
        A list containing full file paths
    x_shape : sequence
        Shape of a batched feature tensor; entries 1 and 2 are used as the
        static (non-batch) dimensions
    y_shape : sequence
        Shape of a batched label tensor; entry 1 is the number of classes
    batch_size : int
        Number of files combined into one batch
    shuffle_dataset : bool
        Whether the file order is reshuffled on every iteration
    seed : int, optional
        Random seed for shuffling; ``None`` leaves the choice to TensorFlow

    Returns
    --------------
    dataset: tf.data.Dataset
        Batches of (features as float32, one-hot labels as uint8)
    """

    def processing_func(file_path):
        """Loads the saved numpy array. Returns features and a label"""
        # Close the archive once both entries have been read
        with np.load(file_path) as np_data:
            x_data = np_data["x"].astype(np.float32)
            y_label = int(np_data["y"])
        y_label = tf.one_hot(indices=y_label, depth=y_shape[1], dtype=tf.uint8)
        return x_data, y_label

    def func(i):
        i = i.numpy()  # Decoding from the EagerTensor object
        return processing_func(file_name_list[i])

    def _fixup_shape(x_data, y_label):
        # tf.py_function outputs carry no static shape information, so it is
        # restored here after batching.
        x_data.set_shape([None, x_shape[1], x_shape[2]])  # n, x_shape[1], x_shape[2]
        y_label.set_shape([None, y_shape[1]])  # n, nb_classes
        return x_data, y_label

    # Data preparation: the dataset iterates over file indices, the files
    # themselves are only read inside the map step.
    num_files = len(file_name_list)

    dataset = tf.data.Dataset.from_generator(lambda: range(num_files), tf.uint32)

    if shuffle_dataset:
        print("shuffling")
        # A buffer as large as the dataset gives a full shuffle of the indices
        dataset = dataset.shuffle(buffer_size=num_files,
                                  seed=seed,
                                  reshuffle_each_iteration=True)

    dataset = dataset.map(lambda i: tf.py_function(func=func,
                                                   inp=[i],
                                                   Tout=[tf.float32, tf.uint8]),
                          num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(_fixup_shape)
    dataset = dataset.prefetch(1)
    return dataset

# Create a list of file paths (the trailing `...` stands for further paths
# and has to be replaced by real entries before running)
training_files = ["full/path/to/file_001.npy", "full/path/to/file_012.npy", ...]
val_files = ["full/path/to/file_502.npy", "full/path/to/file_150.npy", ...]
test_files = ["full/path/to/file_411.npy", "full/path/to/file_590.npy", ...]

# create_tf_dataset requires the batched feature and label shapes: it reads
# x_shape[1], x_shape[2] and y_shape[1].
# NOTE: the values below are placeholders -- replace them with the real
# dimensions of your data.
x_shape = (None, 64, 64)
y_shape = (None, 2)

train_dataset = create_tf_dataset(training_files, x_shape, y_shape)
val_dataset = create_tf_dataset(val_files, x_shape, y_shape)
test_dataset = create_tf_dataset(test_files, x_shape, y_shape)

model = create_keras_model()

# The datasets already yield (features, labels) batches, so no separate
# y or batch_size argument is passed.
model.fit(x=train_dataset,
          validation_data=val_dataset,
          epochs=100)

model.evaluate(test_dataset, return_dict=True)

Source: How to extract classes from prefetched dataset in Tensorflow for confusion matrix

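Disclaimer: the following solution won’t work for shuffled TensorFlow datasets (tf.data.Dataset.shuffle). The predictions and the true labels are collected in two separate passes over the dataset; a shuffled dataset is reshuffled on every pass by default, so the two would no longer be in the same order.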

import tensorflow_datasets as tfds
import tensorflow as tf
from sklearn.metrics import confusion_matrix

# Load the iris dataset as (features, label) pairs together with its metadata
data, info = tfds.load(
    'iris',
    split='train',
    as_supervised=True,
    shuffle_files=True,
    with_info=True
)

AUTOTUNE = tf.data.AUTOTUNE

# First 120 examples for training, the following 30 for testing
train_dataset = data.take(120).batch(4).prefetch(buffer_size=AUTOTUNE)
test_dataset = data.skip(120).take(30).batch(4).prefetch(buffer_size=AUTOTUNE)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(info.features['label'].num_classes, activation='softmax')
])

# Labels are integer class indices, hence the sparse loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(train_dataset, validation_data=test_dataset, epochs=50, verbose=0)

# Class probabilities for every test example, in dataset order
y_pred = model.predict(test_dataset)

predicted_categories = tf.argmax(y_pred, axis=1)

# Second pass over the same dataset to collect the labels batch by batch;
# this only lines up with y_pred because the dataset order is stable.
true_categories = tf.concat([y for _, y in test_dataset], axis=0)

# scikit-learn expects (y_true, y_pred): rows are true classes, columns are
# predicted classes.
confusion_matrix(true_categories, predicted_categories)