# Import libraries

In [0]:
import os, sys
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Convolution2D, MaxPooling2D
from tensorflow.keras.layers import Dropout, Flatten, Dense, Activation
from tensorflow.keras import optimizers
from tensorflow.keras import backend as K
import sklearn
# Metrics for RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
# To export Randomized Search results to CSV
import csv

# Scikit-lean version

We need scikit-learn not to be 0.22.2. If that's the one we have, downgrade to 0.21.2 is required.

In [2]:
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.22.2.post1.


In [3]:
!pip install scikit-learn==0.21.2

Collecting scikit-learn==0.21.2
[?25l  Downloading https://files.pythonhosted.org/packages/85/04/49633f490f726da6e454fddc8e938bbb5bfed2001681118d3814c219b723/scikit_learn-0.21.2-cp36-cp36m-manylinux1_x86_64.whl (6.7MB)
[K     |████████████████████████████████| 6.7MB 3.3MB/s 
Installing collected packages: scikit-learn
  Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.21.2


# Mount Google Drive

In [2]:
# Access Google Drive
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


# Function to load training data

In [0]:
def load_dataset(train_folder, image_height, image_width, channels):

  # gets list of .jpg in folder
  jpg_files = [f for f in os.listdir(train_folder) if os.path.isfile(os.path.join(train_folder, f)) and f.endswith('.jpg')]

  image_files = []
  labels = []

  for file in jpg_files:
    # Files starting with H or N are non-PM: class 0
    if file.startswith('H') or file.startswith('N'):
      image_files.append(file)
      labels.append(0)
    # Files starting with P are PM: class 1
    elif file.startswith('P'):
      image_files.append(file)
      labels.append(1)
    else:
      print("Wrong label, imagen will be discarded.")
    
  images = np.ndarray(shape=(len(image_files), image_height, image_width, channels),
                      dtype=np.float32)
  idx = 0

  for file in image_files:
    image = load_img(train_folder + "/" + file, target_size=(image_height, image_width))
    image_array = img_to_array(image)
    images[idx] = image_array
    idx += 1

  # Convert label list to numpy array
  labels = np.array(labels)

  return images, labels

# Function to create and compile model

In [0]:
def get_model(input_shape, loss, optimizer, metrics, learning_rate, momentum, dropout, n_conv_layers, n_filters_layer_1, size_filters_layer_1, maxpool_size_1, n_filters_layer_n, size_filters_layer_n, maxpool_size_n):

  if optimizer == tf.keras.optimizers.SGD:
    optimizer = optimizer(learning_rate=learning_rate, momentum=momentum)
  elif optimizer == tf.keras.optimizers.Adam:
    optimizer = optimizer(learning_rate=learning_rate)
  
  K.clear_session()
  
  model = Sequential()

  model.add(Convolution2D(n_filters_layer_1, 
                        size_filters_layer_1, 
                        padding ="same", 
                        input_shape=input_shape, 
                        activation='relu'))
  model.add(MaxPooling2D(pool_size=maxpool_size_1))

  # Add more conv+MaxPool layers if n_layers > 1
  for i in range(1, n_conv_layers):
    model.add(Convolution2D(n_filters_layer_n, 
                          size_filters_layer_n, 
                          padding ="same"))
    model.add(MaxPooling2D(pool_size=maxpool_size_n))

  model.add(Flatten())
  model.add(Dense(256, 
                activation='relu'))
  model.add(Dropout(dropout))
  model.add(Dense(1, 
                activation='sigmoid'))
  
  # Compile
  model.compile(loss=loss,
              optimizer=optimizer,
              metrics=metrics)

  model.summary()

  return model

# Variables

In [0]:
# Size of images
image_height, image_width = (256, 256)
# Channels of images: 3 for RGB
channels = 3
# Path to images for training
train_folder = '/gdrive/My Drive/TFG Daniel López Robles/train/'
# Extension of the images of our dataset
file_extesion = '.jpg'
# Number of images for training
n_observations = len([f for f in os.listdir(train_folder) if os.path.isfile(os.path.join(train_folder, f)) and f.endswith('.jpg')])
# CSV to export Randomized Search results
csv_results = '/gdrive/My Drive/TFG Daniel López Robles/csv_randomized_search/256x256.csv'

# Number of folds for stratified k-fold cross validation
CVfolds = 5

# Path to save models
models_path = '/gdrive/My Drive/TFG Daniel López Robles/modelos/'

# callbacks
#checkpoint_cb = keras.callbacks.ModelCheckpoint(models_path+'cnn_cross-validation_checkpoint_20200514_1739.h5',
#                                                save_best_only=True)
early_stopping_cb = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=7,
                                                  restore_best_weights=True)  
callbacks_list = [early_stopping_cb]

# Varaibles for Randomized Search
# Metrics for RandomizedSearchCV scores
#scoring = ['accuracy', 'precision', 'recall', 'roc_auc']
scoring = 'accuracy'
# Number of parameter settings that are sampled
n_iterations = 10
param_distribs = {'input_shape': [(image_height, image_width, channels)],
               'loss': ['binary_crossentropy'],
               'optimizer': [tf.keras.optimizers.Adam, tf.keras.optimizers.SGD],
               'metrics': [[
                           # keras.metrics.TruePositives(name='tp'), keras.metrics.FalsePositives(name='fp'),
                           #keras.metrics.TrueNegatives(name='tn'),
                           #keras.metrics.FalseNegatives(name='fn'),
                           keras.metrics.BinaryAccuracy(name='accuracy'),
                           #keras.metrics.Precision(name='precision'),
                           #keras.metrics.Recall(name='recall'),
                           #keras.metrics.AUC(name='auc')
                           ]],
               'learning_rate': [0.001, 0.01, 0.1, 0.3],
               'momentum': [0.0, 0.4, 0.8],
               'epochs': [15, 30, 50],
               'batch_size': [2, 4, 8, 16, 32],
               'dropout': [0.3, 0.5, 0.7],
               'n_conv_layers': [1, 2, 3],
               'n_filters_layer_1': [16, 32, 64],
               'size_filters_layer_1': [(5, 5), (7, 7)],
               'maxpool_size_1': [(2, 2), (3, 3)],
               'n_filters_layer_n': [16, 32, 64],
               'size_filters_layer_n': [(2, 2), (3, 3)],
               'maxpool_size_n': [(2, 2)]
               }

# 1. Load dataset

In [15]:
# Load dataset
images, labels = load_dataset(train_folder, image_height, image_width, channels)
print("images shape: {} \nlabels shape: {}".format(images.shape, labels.shape))

images shape: (400, 256, 256, 3) 
labels shape: (400,)


# 2. Run Randomized Search and print metrics

In [17]:
keras_classifier = tf.keras.wrappers.scikit_learn.KerasClassifier(build_fn=get_model)
random_search = sklearn.model_selection.RandomizedSearchCV(keras_classifier, 
                                                           param_distribs, 
                                                           cv=CVfolds, 
                                                           scoring=scoring,
                                                           n_iter=n_iterations)

random_search.fit(images, labels)

print(random_search.best_params_)

means = random_search.cv_results_['mean_test_score']
stds = random_search.cv_results_['std_test_score']
params = random_search.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

#export results to CSV
with open(csv_results, 'a') as f:
  for mean, stdev, param in zip(means, stds, params):
    print("%f (%f), %r" % (mean, stdev, param), file=f)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 256, 256, 32)      4736      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 128, 128, 32)      0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 128, 128, 16)      2064      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 64, 64, 16)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 64, 64, 16)        1040      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 32, 32, 16)        0         
_________________________________________________________________
flatten (Flatten)            (None, 16384)             0

  return (proba > 0.5).astype('int32')


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 256, 256, 64)      9472      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 85, 85, 64)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 85, 85, 32)        8224      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 42, 42, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 42, 42, 32)        4128      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 21, 21, 32)        0         
_________________________________________________________________
flatten (Flatten)            (None, 14112)             0

In [11]:
random_search.best_params_

{'batch_size': 32,
 'dropout': 0.5,
 'epochs': 15,
 'input_shape': (128, 128, 3),
 'learning_rate': 0.01,
 'loss': 'binary_crossentropy',
 'maxpool_size_1': (3, 3),
 'maxpool_size_n': (2, 2),
 'metrics': [<tensorflow.python.keras.metrics.BinaryAccuracy at 0x7f245de06588>],
 'momentum': 0.4,
 'n_conv_layers': 2,
 'n_filters_layer_1': 16,
 'n_filters_layer_n': 16,
 'optimizer': tensorflow.python.keras.optimizer_v2.adam.Adam,
 'size_filters_layer_1': (7, 7),
 'size_filters_layer_n': (2, 2)}

In [18]:
means = random_search.cv_results_['mean_test_score']
stds = random_search.cv_results_['std_test_score']
params = random_search.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.935000 (0.012247) with: {'size_filters_layer_n': (2, 2), 'size_filters_layer_1': (7, 7), 'optimizer': <class 'tensorflow.python.keras.optimizer_v2.adam.Adam'>, 'n_filters_layer_n': 16, 'n_filters_layer_1': 32, 'n_conv_layers': 3, 'momentum': 0.4, 'metrics': [<tensorflow.python.keras.metrics.BinaryAccuracy object at 0x7f24be57c668>], 'maxpool_size_n': (2, 2), 'maxpool_size_1': (2, 2), 'loss': 'binary_crossentropy', 'learning_rate': 0.001, 'input_shape': (256, 256, 3), 'epochs': 50, 'dropout': 0.5, 'batch_size': 32}
0.557500 (0.056236) with: {'size_filters_layer_n': (2, 2), 'size_filters_layer_1': (7, 7), 'optimizer': <class 'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD'>, 'n_filters_layer_n': 32, 'n_filters_layer_1': 64, 'n_conv_layers': 3, 'momentum': 0.4, 'metrics': [<tensorflow.python.keras.metrics.BinaryAccuracy object at 0x7f24be57c668>], 'maxpool_size_n': (2, 2), 'maxpool_size_1': (3, 3), 'loss': 'binary_crossentropy', 'learning_rate': 0.01, 'input_shape': (256, 256