Machine Learning: Data Preprocessing

I have already started getting into deep learning and have been allotted some great tasks for the upcoming projects at AGV. Now, when it comes to deep learning, it is all about collecting a dataset, which is, first of all, error-prone, and on top of that different machine learning libraries demand the dataset in different formats. Some libraries, e.g. Caffe, allow importing images straight from a directory. On the other hand, libraries like TensorFlow or Torch generally expect a Tensor or a NumPy n-dimensional array, which makes data loading and embedding faster.

Currently I am working with two libraries, Caffe and TensorFlow, in two different projects, and in both I faced issues with the dataset preprocessing requirements described above. So I wrote a Python script which, when applied to a class-wise organized dataset, returns a pickle file for each class and also provides options such as Tensor/ndarray reformatting (to match a ConvNet's input), merged pickle file generation, etc. The only explicit requirement for running this code is that the dataset be segregated class-wise into separate directories, which most datasets found online already follow. Take MNIST, then notMNIST; another famous pair is CIFAR-10 and CIFAR-100; and for driverless cars we have GTSRB (traffic sign recognition) and GTSDB (traffic sign detection) from the Institut für Neuroinformatik. ImageNet provides more than 1000 classes organized in a similar way.
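For reference, here is a sketch of the directory layout the script below expects; the dataset name and the train/test subfolder names are passed on the command line (the script file name in the example invocation is just a placeholder):

/home/deeplearning/work/tensorflow/datasets/<dataset_name>/
    <train_subfolder>/
        class_A/   img_0001.png   img_0002.png   ...
        class_B/   ...
    <test_subfolder>/
        class_A/   ...
        class_B/   ...

# Example invocation:
# python dataset_to_pickle.py <dataset_name> <train_subfolder> <test_subfolder>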

The code is self-explanatory; still, here is a brief overview of the pipeline.

# Collect the images class-wise and normalize them. By normalization, I mean bringing the mean close to zero and the standard deviation close to 0.5. Normalization is very useful for helping optimizers converge quickly (see the short example after this list).

# Create an ndarray of each set of images using NumPy. I have taken each class as a set and loaded it into an individual pickle file.

# Merge the pickle files according to the requirement. Most of the time only a single chunk of data is available and it has to be split into a training set, a validation set, and a test set. This has been taken care of in the code; just set the proportion of data required in each set before running it.

# Give parameters for reformatting, in case you need it.
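As a quick sanity check of the normalization step, here is a minimal sketch (assuming 8-bit pixel values, as in the script below):

import numpy as np

pixel_depth = 255.0
raw = np.array([0, 64, 128, 192, 255], dtype=float)
normalized = (raw - pixel_depth / 2) / pixel_depth   # maps 0 -> -0.5 and 255 -> +0.5
print(normalized)                            # approximately [-0.5  -0.249  0.002  0.253  0.5]
print(normalized.mean(), normalized.std())   # mean near zero, std no larger than 0.5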

Here is the link to the github repository.

# The script outputs ".pickle" files composed of 3D tensors of the collected images. The input has to be given in separate classes,
# one folder per class, inside the dataset directory. The output is well shuffled.
# Output shape of the images set: (num_images, image_size, image_size)
# Output shape of the labels set: (num_images, num_classes)

#################################
# OPTIONAL REFORMATTING
#################################
# For images: (num_images, image_size*image_size)
# and labels: (num_images, num_classes)
# the num_classes vector will be one-hot encoded
# If you reformat, remember to place the "single pickle merger" block of code after the reformatting step, so the saved pickle contains the reformatted arrays.

# Further, read "pickle_loader.py" to use the resulting pickle directly in a network
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
from scipy import ndimage
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle


num_labels = 10  # Number of classes.
image_size = 28  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.

# Define the path to the dataset directory. argv[1] is the name of the dataset folder.
DATA_DIR = "/home/deeplearning/work/tensorflow/datasets/" + sys.argv[1]
trainset_name = sys.argv[2]             # argv[2] is the subfolder name of the training set
testset_name = sys.argv[3]              # argv[3] is the subfolder name of the test set

def load_letter(folder, data_path, min_num_images):
  """Load the data for a single letter label."""
  folder = os.path.join(data_path, folder)
  image_files = os.listdir(folder)
  dataset = np.ndarray(shape=(len(image_files), image_size, image_size), dtype=np.float32)
  print(folder)
  num_images = 0
  for image in image_files:
    image_file = os.path.join(folder, image)
    try:
      image_data = (ndimage.imread(image_file).astype(float) - 
                    pixel_depth / 2) / pixel_depth
      if image_data.shape != (image_size, image_size):
        raise Exception('Unexpected image shape: %s' % str(image_data.shape))
      dataset[num_images, :, :] = image_data
      num_images = num_images + 1
    except IOError as e:
      print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')
    
  dataset = dataset[0:num_images, :, :]
  if num_images < min_num_images:
    raise Exception('Many fewer images than expected: %d < %d' %
                    (num_images, min_num_images))
    
  print('Full dataset tensor:', dataset.shape)
  print('Mean:', np.mean(dataset))
  print('Standard deviation:', np.std(dataset))
  return dataset
        
def maybe_pickle(data_path, min_num_images_per_class, force=False):
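  # Create one '<class>.pickle' file per class folder found under data_path; existing pickles are skipped unless force=True.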
  dataset_names = []
  data_folders = [d for d in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, d))]
  for folder in data_folders:
    print(folder)
    set_filename = folder + '.pickle'
    dataset_names.append(os.path.join(data_path, set_filename))
    if os.path.exists(os.path.join(data_path, set_filename)) and not force:
      # You may override by setting force=True.
      print('%s already present - Skipping pickling.' % set_filename)
    else:
      print('Pickling %s.' % set_filename)
      dataset = load_letter(folder, data_path, min_num_images_per_class)
      try:
        with open(data_path + '/' + set_filename, 'wb') as f:
          pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
      except Exception as e:
        print('Unable to save data to', set_filename, ':', e)
  
  return dataset_names



def make_arrays(nb_rows, img_size):
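  # Pre-allocate empty image and label arrays; returns (None, None) when nb_rows is zero.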
  if nb_rows:
    dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
    labels = np.ndarray(nb_rows, dtype=np.int32)
  else:
    dataset, labels = None, None
  return dataset, labels

def merge_datasets(pickle_files, train_size, valid_size=0):
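  # Read each per-class pickle, shuffle it, and take a fixed-size validation slice and training slice from every class.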
  num_classes = len(pickle_files)
  valid_dataset, valid_labels = make_arrays(valid_size, image_size)
  train_dataset, train_labels = make_arrays(train_size, image_size)
  vsize_per_class = valid_size // num_classes
  tsize_per_class = train_size // num_classes
    
  start_v, start_t = 0, 0
  end_v, end_t = vsize_per_class, tsize_per_class
  end_l = vsize_per_class+tsize_per_class
  for label, pickle_file in enumerate(pickle_files):       
    try:
      with open(pickle_file, 'rb') as f:
        letter_set = pickle.load(f)
        # let's shuffle the letters to have random validation and training set
        np.random.shuffle(letter_set)
        if valid_dataset is not None:
          valid_letter = letter_set[:vsize_per_class, :, :]
          valid_dataset[start_v:end_v, :, :] = valid_letter
          valid_labels[start_v:end_v] = label
          start_v += vsize_per_class
          end_v += vsize_per_class
                    
        train_letter = letter_set[vsize_per_class:end_l, :, :]
        train_dataset[start_t:end_t, :, :] = train_letter
        train_labels[start_t:end_t] = label
        start_t += tsize_per_class
        end_t += tsize_per_class
    except Exception as e:
      print('Unable to process data from', pickle_file, ':', e)
      raise
    
  return valid_dataset, valid_labels, train_dataset, train_labels



def randomize(dataset, labels):
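  # Shuffle images and labels with the same permutation so each image keeps its label.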
  permutation = np.random.permutation(labels.shape[0])
  shuffled_dataset = dataset[permutation,:,:]
  shuffled_labels = labels[permutation]
  return shuffled_dataset, shuffled_labels


### REFORMATTER
def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...] one hot encoder
  #labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels

########################## Individual class pickle creator ##################

train_datasets = maybe_pickle(os.path.join(DATA_DIR, trainset_name), 45000)
test_datasets = maybe_pickle(os.path.join(DATA_DIR, testset_name), 1800)

########################## pickle merger to train, test, valid pickles ###########

train_size = 200000
valid_size = 10000
test_size = 10000

valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_datasets, train_size, valid_size)
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)

# Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...] one hot encoder
train_labels = (np.arange(num_labels) == train_labels[:,None]).astype(np.float32)
test_labels = (np.arange(num_labels) == test_labels[:,None]).astype(np.float32)
valid_labels = (np.arange(num_labels) == valid_labels[:,None]).astype(np.float32)

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)

########################### Single pickle merger ###########################

pickle_file = os.path.join(DATA_DIR, 'notMNIST.pickle')

try:
  f = open(pickle_file, 'wb')

  save = {
    'train_dataset': train_dataset,
    'train_labels': train_labels,
    'valid_dataset': valid_dataset,
    'valid_labels': valid_labels,
    'test_dataset': test_dataset,
    'test_labels': test_labels,
    }
  pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
  f.close()
except Exception as e:
  print('Unable to save data to', pickle_file, ':', e)
  raise

statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)

###########################
# OPTIONAL REFORMATTING
###########################
'''
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
'''
###########################

 

After successfully running the code, a pickle file will appear in the dataset directory. For loading that pickle back into code, I modified a publicly available snippet to make it more versatile; it simply loads the pickle file into ndarrays.

# The code block below loads the single pickle dataset into train, test, and validation sets.
# An option for reformatting is available.

#################################
# OPTIONAL REFORMATTING
#################################
# Use the reformatting function only when the output is needed as a flat matrix. For images: (num_images, image_size*image_size)
# and labels: (num_images, num_classes)
# the num_classes vector will be one-hot encoded

from __future__ import print_function
import os
import sys
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

image_size = 28
num_labels = 10

DATA_DIR = "/home/deeplearning/work/tensorflow/datasets/" + sys.argv[1]

pickle_file = os.path.join(DATA_DIR, 'notMNIST.pickle')

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)



######################## REFORMATTER #######################
'''
def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...]
  #labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
'''
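To actually feed these arrays into a network, a minimal mini-batch helper could look like the sketch below. It is only an illustration built on the arrays loaded above; the function name and batch size are my own choices, not part of the repository:

# Hypothetical helper: yield shuffled mini-batches from the loaded arrays.
def minibatches(dataset, labels, batch_size=128):
  perm = np.random.permutation(labels.shape[0])
  for start in range(0, labels.shape[0], batch_size):
    idx = perm[start:start + batch_size]
    yield dataset[idx], labels[idx]

# Example usage:
# for batch_data, batch_labels in minibatches(train_dataset, train_labels):
#   ...feed batch_data and batch_labels to the network...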

Please comment in case you feel something is wrong with this code. Suggestions are also most welcome.
