The aim of this IPython notebook is to show some features of the Python Theano library in the field of machine learning. Theano has been developed by the LISA group at the University of Montreal (see: http://deeplearning.net/software/theano/). The notebook also relies on other standard Python libraries such as numpy, pandas and matplotlib.
To exemplify the use of Theano, this notebook solves the assignments of the Machine Learning MOOC provided by Coursera (see: https://www.coursera.org/learn/machine-learning) and taught at Stanford University by Andrew Ng (see: http://www.andrewng.org/).
The original MOOC assignments are meant to be programmed in the Octave language (see: https://www.gnu.org/software/octave/). The idea with this notebook is to provide Python developers with interesting examples programmed using Theano.
This notebook has been developed using the Anaconda Python 3.4 distribution provided by Continuum Analytics (see: https://www.continuum.io/). It requires the Jupyter Notebook (see: http://jupyter.org/).
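As a minimal taste of Theano's programming model before diving into the exercises: symbolic variables are assembled into an expression graph, which theano.function then compiles into an ordinary Python callable. This is a generic illustration, not part of the assignment:
import theano
import theano.tensor as T
# Declare two symbolic scalars, build an expression, compile it, call it.
a = T.dscalar('a')
b = T.dscalar('b')
f = theano.function([a, b], a + b)
print(f(2, 3))  # 5.0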
import theano
import numpy as np
import theano.tensor as T
from theano.tensor.nnet import sigmoid
import scipy.io as sio
# Load the dataset of the course's exercise 5 (change in water level vs. dam outflow)
data = sio.loadmat('data/ex5data1.mat')
X, Y = data['X'], data['y'].T[0]
Xval, Yval = data['Xval'], data['yval'].T[0]
Xtest, Ytest = data['Xtest'], data['ytest'].T[0]
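The .mat file stores each target as an (m, 1) column vector, which is why .T[0] is used above to flatten it into the 1-D arrays that the Theano graphs below expect. A quick shape check:
# Inspect the shapes of the loaded arrays (features stay 2-D, targets are 1-D).
for name, arr in [('X', X), ('Y', Y), ('Xval', Xval), ('Yval', Yval), ('Xtest', Xtest), ('Ytest', Ytest)]:
    print(name, arr.shape)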
# Global mean and standard deviation of the training set, reused later
# when plotting fitted curves on normalized inputs
mu = X.mean()
sigma = X.std()
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(8,6))
plt.scatter(X, Y, s=40, marker='x', color='r')
plt.xlabel('Change in water level (x)', fontsize=9)
plt.ylabel('Water flowing out of the dam (y)', fontsize=9)
plt.axis((-50, 40, 0, 40));
m = X.shape[0]
X1 = np.c_[np.ones(m), X]  # add the intercept term as a first column
X1 = X1.T                  # transpose so each column is one example
# Choose some alpha and lambda values
alpha = 0.1
lambda_ = 1.0
# Init theta and build the symbolic graph (each call to train performs one gradient descent step)
t = np.ones(2)
theta = theano.shared(t, name='theta')
x = T.matrix('x')
y = T.vector('y')
prediction = T.dot(theta, x)                  # hypothesis h(x) = theta^T x
reg = lambda_ * T.sum(theta[1:]**2) / (2*m)   # regularization term (skips the intercept)
cost = T.sum(T.pow(prediction - y, 2))/(2*m) + reg
grad = T.grad(cost, theta)                    # symbolic gradient of the cost
train = theano.function([x, y], [cost, grad], updates=[(theta, theta - alpha*grad)])
costM, gradM = train(X1, Y)
print('Cost at theta = [1 ; 1]: %f\n' % costM)
print('Gradient at theta = [1 ; 1]:\n')
print(gradM)
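The symbolic graph above encodes the regularized linear regression cost from the assignment,

$$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(\theta^T x^{(i)} - y^{(i)}\right)^2 + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2,$$

and T.grad derives $\nabla_\theta J$ symbolically, so the update $\theta \leftarrow \theta - \alpha\,\nabla_\theta J(\theta)$ requires no hand-coded gradient.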
def linear_fit(X, Y, alpha, lambda_, iterations):
    """Fit regularized linear regression by batch gradient descent.

    Returns the initial cost, the final cost, the learned theta and
    a compiled prediction function."""
    m = Y.shape[0]
    # normalize data
    mu = X.mean()
    sigma = X.std()
    if sigma == 0:
        sigma = 1
    Xnorm = np.c_[np.ones(m), (X - mu)/sigma]  # Add intercept term to X
    Xnorm = Xnorm.T
    # Init theta and run gradient descent
    t = np.ones(Xnorm.shape[0])
    theta = theano.shared(t, name='theta')
    x = T.matrix('x')
    y = T.vector('y')
    prediction = T.dot(theta, x)
    reg = lambda_ * T.sum(theta[1:]**2) / (2*m)
    cost = T.sum(T.pow(prediction - y, 2))/(2*m) + reg
    grad = T.grad(cost, theta)
    train = theano.function([x, y], cost, updates=[(theta, theta - alpha*grad)])
    test = theano.function([x], prediction)
    for i in range(iterations):
        costM = train(Xnorm, Y)
        if i == 0:
            costM0 = costM
    return costM0, costM, theta.get_value(), test
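As an optional cross-check, not part of the original assignment, regularized linear regression also has a closed-form solution. The normal_equation helper below is hypothetical and introduced here for illustration only; it assumes the same feature normalization as linear_fit:
def normal_equation(X, Y, lambda_=0.0):
    # Solve (Xn^T Xn + lambda*L) theta = Xn^T y, where L is the identity
    # matrix with a zero in the intercept position (the intercept is not
    # regularized, matching theta[1:] in the cost above).
    m = Y.shape[0]
    mu, sigma = X.mean(), X.std()
    if sigma == 0:
        sigma = 1
    Xn = np.c_[np.ones(m), (X - mu)/sigma]  # same normalization as linear_fit
    L = np.eye(Xn.shape[1])
    L[0, 0] = 0
    return np.linalg.solve(Xn.T.dot(Xn) + lambda_ * L, Xn.T.dot(Y))
With enough iterations, the theta returned by linear_fit should approach normal_equation(X, Y, lambda_).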
lambda_ = 0.0
costM0, costM, theta, test = linear_fit(X, Y, 0.1, lambda_, 5000)
print("Cost(%i):" % 0)
print(costM0)
print("\nCost(%i):" % 5000)
print(costM)
print(theta)
# Plot the linear fit
a = np.linspace(-48,38,2)
b = test(np.vstack([np.ones(a.shape), (a - mu)/sigma]))  # predictions on normalized inputs
plt.figure(figsize=(8,6))
plt.plot(a,b)
plt.scatter(X, Y, s=40, marker='x', color='r')
plt.xlabel('Change in water level (x)', fontsize=9)
plt.ylabel('Water flowing out of the dam (y)', fontsize=9)
plt.axis((-50, 40, -5, 40));
m_val = Xval.shape[0]
error_train = np.zeros(m)
error_val = np.zeros(m)
# Normalize the validation data. Note this uses the validation set's own
# statistics; the original exercise normalizes with the training mu/sigma.
mu_val = Xval.mean()
sigma_val = Xval.std()
Xnorm_val = np.c_[np.ones(m_val), (Xval - mu_val)/sigma_val] # Add intercept term to X
Xnorm_val = Xnorm_val.T
print('# Training Examples\tTrain Error\tCross Validation Error')
for j in range(0, m):
    costM0, costM, theta, test = linear_fit(X[:j+1], Y[:j+1], 0.1, 0.0, 5000)
    error_train[j] = costM
    error_val[j] = np.sum((test(Xnorm_val) - Yval)**2)/(2*m_val)
    print('\t%i\t\t%f\t%f' % (j+1, costM, error_val[j]))
# Plot the learning curve
fig = plt.figure(figsize=(8,6))
plt.plot(range(1,m+1),error_train, label='Train')
plt.plot(range(1,m+1),error_val, label='Cross Validation')
fig.suptitle('Learning curve for linear regression')
plt.xlabel('Number of training examples', fontsize=9)
plt.ylabel('Error', fontsize=9)
plt.legend(loc=1, fontsize=9)
plt.yticks(range(0,200,50))
plt.axis((0, 13, 0, 150));
# Map X to X, X^2, X^3, ..., X^p (one polynomial feature per column)
def poly_features(X, p):
    poly = np.concatenate([np.power(X, i) for i in range(1, p+1)])
    return poly.reshape(p, X.shape[0]).T
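A quick sanity check of poly_features on a tiny, made-up input: each row of the result holds the powers of one sample.
# For x = [1, 2, 3] and p = 3, the rows are [x, x**2, x**3]:
# [[1, 1, 1], [2, 4, 8], [3, 9, 27]]
print(poly_features(np.array([1.0, 2.0, 3.0]), 3))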
def poly_fit(X, Y, power, alpha, lambda_, iterations):
    """Fit regularized polynomial regression by batch gradient descent.

    Like linear_fit, but expands X into polynomial features first."""
    m = Y.shape[0]
    # normalize data
    mu = X.mean()
    sigma = X.std()
    if sigma == 0:
        sigma = 1
    # apply polynomial features
    Xpoly = poly_features((X - mu)/sigma, power)
    Xnorm = np.c_[np.ones(m), Xpoly]  # Add intercept term to X
    Xnorm = Xnorm.T
    # Init theta and run gradient descent
    t = np.ones(Xnorm.shape[0])
    theta = theano.shared(t, name='theta')
    x = T.matrix('x')
    y = T.vector('y')
    prediction = T.dot(theta, x)
    reg = lambda_ * T.sum(theta[1:]**2) / (2*m)
    cost = T.sum(T.pow(prediction - y, 2))/(2*m) + reg
    grad = T.grad(cost, theta)
    train = theano.function([x, y], cost, updates=[(theta, theta - alpha*grad)])
    test = theano.function([x], prediction)
    for i in range(iterations):
        costM = train(Xnorm, Y)
        if i == 0:
            costM0 = costM
    return costM0, costM, theta.get_value(), test
lambda_ = 0.0
costM0, costM, theta, test = poly_fit(X, Y, 8, 0.01, lambda_, 5000)
print("Cost(%i):" % 0)
print(costM0)
print("\nCost(%i):" % 5000)
print(costM)
print(theta)
# Plot the polynomial fit
a = np.linspace(-60,60,30)
a_poly = poly_features((a - mu)/sigma, 8)
b = test(np.c_[np.ones(a_poly.shape[0]), a_poly].T)
fig = plt.figure(figsize=(8,6))
plt.plot(a,b, ls='--')
plt.scatter(X, Y, s=40, marker='x', color='r')
fig.suptitle('Polynomial Regression Fit (lambda = %f)' % lambda_)
plt.xlabel('Change in water level (x)', fontsize=9)
plt.ylabel('Water flowing out of the dam (y)', fontsize=9)
plt.axis((-80, 80, -60, 40));
def learning_curve(X, Y, Xval, Yval, power, alpha, lambda_, iterations):
    """Train on the first 1..m examples and record train/validation errors."""
    m = Y.shape[0]
    m_val = Yval.shape[0]
    error_train = np.zeros(m)
    error_val = np.zeros(m)
    # normalize the validation data (with its own statistics, as above)
    mu_val = Xval.mean()
    sigma_val = Xval.std()
    # apply polynomial features
    Xpoly_val = poly_features((Xval - mu_val)/sigma_val, power)
    Xnorm_val = np.c_[np.ones(m_val), Xpoly_val]  # Add intercept term to X
    Xnorm_val = Xnorm_val.T
    print('# Training Examples\tTrain Error\tCross Validation Error')
    for j in range(0, m):
        costM0, costM, theta, test = poly_fit(X[:j+1], Y[:j+1], power, alpha, lambda_, iterations)
        error_train[j] = costM
        error_val[j] = np.sum((test(Xnorm_val) - Yval)**2)/(2*m_val)
        print('\t%i\t\t%f\t%f' % (j+1, costM, error_val[j]))
    return error_train, error_val
lambda_ = 0.0
error_train, error_val = learning_curve(X, Y, Xval, Yval, 8, 0.001, lambda_, 5000)
# Plot the learning curve
fig = plt.figure(figsize=(8,6))
plt.plot(range(1,m+1),error_train, label='Train')
plt.plot(range(1,m+1),error_val, label='Cross Validation')
fig.suptitle('Polynomial Regression Learning Curve (lambda =%f)' % lambda_)
plt.xlabel('Number of training examples', fontsize=9)
plt.ylabel('Error', fontsize=9)
plt.legend(loc=1, fontsize=9)
plt.yticks(range(0,200,50))
plt.axis((0, 13, 0, 150));
$\lambda = 5.0$
lambda_ = 5
costM0, costM, theta, test = poly_fit(X, Y, 8, 0.01, lambda_, 5000)
print("Cost(%i):" % 0)
print(costM0)
print("\nCost(%i):" % 5000)
print(costM)
print(theta)
# Plot the polynomial fit
a = np.linspace(-80,60,40)
a_poly = poly_features((a - mu)/sigma, 8)
b = test(np.c_[np.ones(a_poly.shape[0]), a_poly].T)
fig = plt.figure(figsize=(8,6))
plt.plot(a,b, ls='--')
plt.scatter(X, Y, s=40, marker='x', color='r')
fig.suptitle('Polynomial Regression Fit (lambda = %f)' % lambda_)
plt.xlabel('Change in water level (x)', fontsize=9)
plt.ylabel('Water flowing out of the dam (y)', fontsize=9)
plt.axis((-80, 80, 0, 160));
error_train, error_val = learning_curve(X, Y, Xval, Yval, 8, 0.001, lambda_, 5000)
# Plot the learning curve
fig = plt.figure(figsize=(8,6))
plt.plot(range(1,m+1),error_train, label='Train')
plt.plot(range(1,m+1),error_val, label='Cross Validation')
fig.suptitle('Polynomial Regression Learning Curve (lambda =%f)' % lambda_)
plt.xlabel('Number of training examples', fontsize=9)
plt.ylabel('Error', fontsize=9)
plt.legend(loc=1, fontsize=9)
plt.yticks(range(0,200,50))
plt.axis((0, 13, 0, 150));
$\lambda = 100.0$
lambda_ = 100
costM0, costM, theta, test = poly_fit(X, Y, 8, 0.001, lambda_, 5000)
print("Cost(%i):" % 0)
print(costM0)
print("\nCost(%i):" % 5000)
print(costM)
print(theta)
# Plot the polynomial fit
a = np.linspace(-80,60,40)
a_poly = poly_features((a - mu)/sigma, 8)
b = test(np.c_[np.ones(a_poly.shape[0]), a_poly].T)
fig = plt.figure(figsize=(8,6))
plt.plot(a,b, ls='--')
plt.scatter(X, Y, s=40, marker='x', color='r')
fig.suptitle('Polynomial Regression Fit (lambda = %f)' % lambda_)
plt.xlabel('Change in water level (x)', fontsize=9)
plt.ylabel('Water flowing out of the dam (y)', fontsize=9)
plt.axis((-80, 80, -10, 40));
lambda_set = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0]
m = len(lambda_set)
error_train = np.zeros(m)
error_val = np.zeros(m)
# normalize val data
mu_val = Xval.mean()
sigma_val = Xval.std()
# applying polynomial features
Xpoly_val = poly_features((Xval - mu_val)/sigma_val, 8)
Xnorm_val = np.c_[np.ones(m_val), Xpoly_val] # Add intercept term to X
Xnorm_val = Xnorm_val.T
print('lambda\t\tTrain Error\tValidation Error')
for j, lambda_ in enumerate(lambda_set):
    costM0, costM, theta, test = poly_fit(X, Y, 8, 0.001, lambda_, 5000)
    error_train[j] = costM
    error_val[j] = np.sum((test(Xnorm_val) - Yval)**2)/(2*m_val)
    print('%f\t%f\t%f' % (lambda_, costM, error_val[j]))
# Plot the learning curve
fig = plt.figure(figsize=(8,6))
plt.plot(lambda_set,error_train, label='Train')
plt.plot(lambda_set,error_val, label='Cross Validation')
plt.xlabel('lambda', fontsize=9)
plt.ylabel('Error', fontsize=9)
plt.legend(loc=1, fontsize=9)
plt.xticks(range(0,11,1))
plt.yticks(range(0,51,5))
plt.axis((0, 10, 0, 50));
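To select $\lambda$ programmatically rather than by eye, one can simply take the value that minimizes the cross-validation error computed above:
# Pick the lambda with the lowest cross-validation error.
best = np.argmin(error_val)
print('Best lambda: %f (validation error: %f)' % (lambda_set[best], error_val[best]))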
lambda_set = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0]
m = len(lambda_set)
m_test = Ytest.shape[0]
error_train = np.zeros(m)
error_test = np.zeros(m)
# normalize test data (again with its own statistics)
mu_test = Xtest.mean()
sigma_test = Xtest.std()
# applying polynomial features
Xpoly_test = poly_features((Xtest - mu_test)/sigma_test, 8)
Xnorm_test = np.c_[np.ones(m_test), Xpoly_test] # Add intercept term to X
Xnorm_test = Xnorm_test.T
print('lambda\t\tTrain Error\tTest Error')
for j, lambda_ in enumerate(lambda_set):
    costM0, costM, theta, test = poly_fit(X, Y, 8, 0.001, lambda_, 5000)
    error_train[j] = costM
    error_test[j] = np.sum((test(Xnorm_test) - Ytest)**2)/(2*m_test)
    print('%f\t%f\t%f' % (lambda_, costM, error_test[j]))
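Finally, assuming the validation-error cell above has been run with the same lambda_set, the test error at the $\lambda$ selected on the validation set can be reported:
# Report the test error at the lambda that minimized the validation error.
best = np.argmin(error_val)
print('Selected lambda: %f, test error: %f' % (lambda_set[best], error_test[best]))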