Homework 7

Author: Daniel McNeela

Problem 1

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from scipy.io import loadmat

%matplotlib inline
In [2]:
data = loadmat('data/mnist_data/images.mat')
X = data['images']

Here I implement Lloyd's algorithm for $k$-means clustering of the MNIST images.

In [3]:
def lloyds(X, K):
    # Create a dictionary to hold the clusters and one to hold the cluster means
    cluster_nums = [j for j in range(K)]
    clusters = dict((j, []) for j in cluster_nums)
    means = dict((j, np.zeros((28, 28))) for j in cluster_nums)
    # Assign each image to a random initial cluster
    for i in range(X.shape[2]):
        rand_cluster = np.random.randint(K)
        clusters[rand_cluster].append(X[:, :, i])
    # Begin algorithm: alternate between updating means and reassigning points
    iteration = 0
    while True:
        # Update each cluster mean (skip empty clusters to avoid dividing by zero)
        for k in clusters:
            if not clusters[k]:
                continue
            mean = np.zeros((28, 28))
            for point in clusters[k]:
                mean += point
            mean /= len(clusters[k])
            means[k] = mean
        # Reassign every image to the cluster whose mean it is closest to
        new_clusts = dict((j, []) for j in cluster_nums)
        for i in range(X.shape[2]):
            k = compute_MSE(X[:, :, i], means)
            new_clusts[k].append(X[:, :, i])
        # Converged once no assignment changes
        truth_vals = [np.array_equal(clusters[k], new_clusts[k]) for k in clusters]
        if all(truth_vals):
            return clusters, means
        clusters = new_clusts
        iteration += 1

def compute_MSE(X, means):
    # Return the index of the cluster mean with the smallest squared error to X
    min_diff = (0, np.sum((X - means[0]) ** 2))
    for m in means:
        diff = (m, np.sum((X - means[m]) ** 2))
        if diff[1] < min_diff[1]:
            min_diff = diff
    return min_diff[0]

Now I run Lloyd's algorithm for $k = 5, 10, 20$ and visualize the cluster centers. For $k = 10$ we get more or less one cluster per digit; each digit clusters with itself. With $k = 5$ we have only half as many representatives, so each cluster consists mainly of two digits. For $k = 20$ we have roughly two clusters per digit, and each image is placed in whichever of the two it resembles more closely.

In [4]:
clusters5, means5 = lloyds(X, 5)
In [11]:
for center in means5:
    print("Center for cluster %d" % center)
    plt.figure()
    plt.imshow(means5[center])
Center for cluster 0
Center for cluster 1
Center for cluster 2
Center for cluster 3
Center for cluster 4
In [12]:
clusters10, means10 = lloyds(X, 10)
In [16]:
for center in means10:
    print("Center for cluster %d" % center)
    plt.figure()
    plt.imshow(means10[center])
Center for cluster 0
Center for cluster 1
Center for cluster 2
Center for cluster 3
Center for cluster 4
Center for cluster 5
Center for cluster 6
Center for cluster 7
Center for cluster 8
Center for cluster 9
In [14]:
clusters20, means20 = lloyds(X, 20)
In [15]:
for center in means20:
    print("Center for cluster %d" % center)
    plt.figure()
    plt.imshow(means20[center])
Center for cluster 0
Center for cluster 1
Center for cluster 2
Center for cluster 3
Center for cluster 4
Center for cluster 5
Center for cluster 6
Center for cluster 7
Center for cluster 8
Center for cluster 9
Center for cluster 10
Center for cluster 11
Center for cluster 12
Center for cluster 13
Center for cluster 14
Center for cluster 15
Center for cluster 16
Center for cluster 17
Center for cluster 18
Center for cluster 19

Problem 2

In [17]:
from scipy.misc import imread
from sklearn.decomposition import TruncatedSVD
In [18]:
face = imread('data/low-rank_data/face.jpg')

Here is the original face.

In [19]:
plt.imshow(face);

Here is the rank-5 approximation.

In [20]:
U, s, V = np.linalg.svd(face)
s[5:] = 0
S = np.zeros((330, 280))
S[:280, :280] = np.diag(s)
face5 = U @ S @ V
plt.imshow(face5);

Here is the rank-20 approximation.

In [21]:
U, s, V = np.linalg.svd(face)
s[20:] = 0
S = np.zeros((330, 280))
S[:280, :280] = np.diag(s)
face20 = U @ S @ V
plt.imshow(face20);

And here is the rank-100 approximation.

In [22]:
U, s, V = np.linalg.svd(face)
s[100:] = 0
S = np.zeros((330, 280))
S[:280, :280] = np.diag(s)
face100 = U @ S @ V
plt.imshow(face100);
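
The three cells above repeat the same truncation pattern, so a small helper makes it reusable (a sketch, not part of the original run; the name `rank_approx` is mine):

In [ ]:
def rank_approx(img, k):
    # Keep only the k largest singular values and reconstruct
    U, s, V = np.linalg.svd(img, full_matrices=False)
    s[k:] = 0
    return (U * s) @ V  # equivalent to U @ np.diag(s) @ V

# e.g. plt.imshow(rank_approx(face, 50));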

Now I plot the mean squared error of the rank-$i$ approximation, where $i$ ranges from 0 to 99.

In [23]:
def plotMSE(face, num_approx):
    # Plot the MSE of the rank-i approximation for i = 0, ..., num_approx - 1
    fig = plt.figure()
    ax = fig.add_subplot(111)
    mses = []
    xs = [i for i in range(num_approx)]
    for i in range(num_approx):
        mses.append(MSE(face, i))
    ax.plot(xs, mses, 'b-')

def MSE(face, k):
    # Mean squared error between the image and its rank-k approximation
    U, s, V = np.linalg.svd(face)
    s[k:] = 0
    S = np.zeros(face.shape)
    S[:len(s), :len(s)] = np.diag(s)
    facek = U @ S @ V
    mse = np.mean((face - facek) ** 2)
    return mse
In [24]:
plotMSE(face, 100)

Now, I compute the same rank approximations for the sky photo.

In [25]:
img = imread('data/low-rank_data/sky.jpg')
plt.imshow(img);
In [26]:
U, s, V = np.linalg.svd(img)
s[5:] = 0
S = np.zeros((330, 280))
S[:280, :280] = np.diag(s)
img5 = U @ S @ V
plt.imshow(img5);
In [27]:
U, s, V = np.linalg.svd(img)
s[20:] = 0
S = np.zeros((330, 280))
S[:280, :280] = np.diag(s)
img20 = U @ S @ V
plt.imshow(img20);
In [28]:
U, s, V = np.linalg.svd(img)
s[100:] = 0
S = np.zeros((330, 280))
S[:280, :280] = np.diag(s)
img100 = U @ S @ V
plt.imshow(img100);

I cannot distinguish the rank-20 sky photo from the original, whereas for the face I need to go up to the rank-100 approximation before it becomes indistinguishable from the original. This is probably because the colors in the sky photo run together and its level of detail is significantly lower than in the face photo.
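
One way to quantify this (a sketch, assuming `face` and `img` are still in scope) is the fraction of each image's squared Frobenius norm captured by its top 20 singular values; a value closer to 1 means the rank-20 approximation loses very little:

In [ ]:
# Energy captured by the top 20 singular values of each image
for name, im in [('face', face), ('sky', img)]:
    s = np.linalg.svd(im, compute_uv=False)
    print(name, np.sum(s[:20] ** 2) / np.sum(s ** 2))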

Problem 3

In [29]:
# Load in joke data
joke_data = loadmat('data/joke_data/joke_train.mat')

# Replace NaNs with 0
normalized_data = np.nan_to_num(joke_data['train'])

Here I compute the user and joke representations $U'$ and $V'$ in the following way. Let $U \Sigma V^T$ be the SVD of the data matrix. Then we have

$$U' = U \Sigma^{1/2}$$
$$V' = \Sigma^{1/2} V^T$$
In [30]:
# Compute SVD of R
U, s, V = np.linalg.svd(normalized_data, full_matrices=False)

# Fix up s into matrix form
S = np.zeros((100, 100))
S[:100, :100] = np.diag(s)

# Compute User Representation
users = U @ np.sqrt(S)
print(users.shape)

# Compute Joke Representation
jokes = np.sqrt(S) @ V
print(jokes.shape)
(24983, 100)
(100, 100)
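
As a sanity check (a sketch, assuming the variables above are still in scope), the product of the two representations reconstructs the zero-filled rating matrix, since $U'V' = U\Sigma^{1/2}\Sigma^{1/2}V^T = U\Sigma V^T$:

In [ ]:
# The product of the user and joke representations recovers the
# NaN-replaced rating matrix, up to floating-point error
print(np.allclose(users @ jokes, normalized_data))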
In [31]:
def plotMSE(joke, num_approx):
    # Plot the MSE of the rank-i approximation for i = 0, ..., num_approx - 1
    fig = plt.figure()
    ax = fig.add_subplot(111)
    mses = []
    xs = [i for i in range(num_approx)]
    for i in range(num_approx):
        mses.append(MSE(joke, i))
    ax.plot(xs, mses, 'b-')

def MSE(joke, k):
    # Mean squared error between the rating matrix and its rank-k approximation
    U, s, V = np.linalg.svd(joke, full_matrices=False)
    s[k:] = 0
    S = np.diag(s)
    jokek = U @ S @ V
    mse = np.mean((joke - jokek) ** 2)
    return mse

Now I plot the MSE as a function of $d$.

In [32]:
plotMSE(normalized_data, 100)
In [33]:
valid_set = np.loadtxt('data/joke_data/validation.txt', delimiter=',')
valid_set = valid_set.astype(int)
def validation_error(jokes, d):
    # Rank-d reconstruction of the rating matrix
    U, s, V = np.linalg.svd(jokes, full_matrices=False)
    s[d:] = 0
    S = np.zeros((100, 100))
    S[:100, :100] = np.diag(s)
    R = U @ S @ V
    # Predict "like" (1) when the reconstructed rating is nonnegative
    R = np.where(R < 0, 0, 1)
    # Validation rows are (user, joke, label) with 1-indexed user and joke ids
    valid_preds = valid_set[:, 2]
    preds = R[valid_set[:, 0] - 1, valid_set[:, 1] - 1]
    diff = preds - valid_preds
    num_wrong = np.count_nonzero(diff)
    return num_wrong / len(diff)

def plot_validation_error(jokes):
    errors = []
    xs = [i for i in range(100)]
    for d in range(100):
        err = validation_error(jokes, d)
        errors.append(err)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(xs, errors, 'b-');

Below, I give the validation error for $d = 2, 5, 10, 20$. I also give a plot of the error as a function of $d$. $d = 9$ gave the best accuracy.

In [34]:
validation_error(normalized_data, 2)
Out[34]:
0.2948509485094851
In [35]:
validation_error(normalized_data, 5)
Out[35]:
0.2845528455284553
In [36]:
validation_error(normalized_data, 10)
Out[36]:
0.2834688346883469
In [37]:
validation_error(normalized_data, 20)
Out[37]:
0.3140921409214092
In [38]:
validation_error(normalized_data, 9)
Out[38]:
0.2785907859078591
In [39]:
plot_validation_error(normalized_data)
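
The best value of $d$ can also be found programmatically rather than read off the plot (a sketch using the `validation_error` helper above; it simply searches $d = 1, \dots, 100$):

In [ ]:
# Exhaustive search for the rank d with the lowest validation error
best_d = min(range(1, 101), key=lambda d: validation_error(normalized_data, d))
print(best_d, validation_error(normalized_data, best_d))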

Derivation

Define the loss function $$L(\{u_i\}, \{v_j\}) := \sum_{(i, j) \in S} (\langle u_i, v_j \rangle - R_{ij})^2 + \lambda \sum_{i=1}^n \|u_i\|_2^2 + \lambda \sum_{j=1}^m \|v_j\|_2^2$$

First we take the derivative with respect to $u_i$, keeping only the single term for entry $(i, j)$ (as in a stochastic update), to get $$\frac{\partial L}{\partial u_i} = 2 (u_i^T v_j - R_{ij})v_j + 2 \lambda u_i$$

To minimize, we set the derivative to zero to get
\begin{align*}
0 &= 2(u_i^T v_j - R_{ij})v_j + 2\lambda u_i \\
R_{ij} v_j &= u_i (v_j^T v_j) + \lambda u_i \\
u_i^* &= \frac{R_{ij} v_j}{\|v_j\|_2^2 + \lambda}
\end{align*}

The equation is symmetric with respect to $u$ and $v$, so we get
\begin{align*}
v_j^* &= \frac{R_{ij} u_i}{\|u_i\|_2^2 + \lambda}
\end{align*}

For computational purposes, I prefer to use the stochastic gradient descent updates: $$e_{ij} = R_{ij} - u_i^Tv_j$$ giving $$v_j^{(t+1)} = v_j^{(t)} + \eta(e_{ij} u_i - \lambda v_j)$$ $$u_i^{(t+1)} = u_i^{(t)} + \eta(e_{ij} v_j - \lambda u_i)$$

In [ ]:
from sklearn import preprocessing
data = joke_data['train']
indices = np.isnan(data)
data[indices] = 0
#mms = preprocessing.MinMaxScaler(feature_range=(-1, 1))
jokes = preprocessing.normalize(data)
#jokes = mms.fit_transform(data)
In [ ]:
def SGD(jokes, d, eta=.0001, lda=.0001, num_epochs=1):
    # Initialize the factors from the rank-d SVD so that u[i, :] is user i's
    # vector, v[j, :] is joke j's vector, and the predicted rating is u[i] @ v[j]
    U, s, V = np.linalg.svd(jokes, full_matrices=False)
    s[d:] = 0
    S = np.diag(s)
    u = U @ np.sqrt(S)
    v = (np.sqrt(S) @ V).T
    for k in range(num_epochs):
        # Sweep over every entry (missing ratings were zero-filled above),
        # first updating the joke vectors, then the user vectors
        for j in range(jokes.shape[1]):
            for i in range(jokes.shape[0]):
                e = jokes[i, j] - (u[i, :] @ v[j, :])
                v[j, :] += eta * (e * u[i, :] - lda * v[j, :])
        for i in range(jokes.shape[0]):
            for j in range(jokes.shape[1]):
                e = jokes[i, j] - (u[i, :] @ v[j, :])
                u[i, :] += eta * (e * v[j, :] - lda * u[i, :])
    return u @ v.T
In [ ]:
r = SGD(jokes, 9, num_epochs=2)
In [ ]:
# Leftover from testing: requires the MinMaxScaler that is commented out above
# rp = mms.fit_transform(r)
In [ ]:
# Threshold the SGD reconstruction at 0.5 to get like/dislike predictions
R = np.where(r < .5, 0, 1)
valid_preds = valid_set[:, 2]
preds = R[valid_set[:, 0] - 1, valid_set[:, 1] - 1]
diff = preds - valid_preds
num_wrong = np.count_nonzero(diff)
# Fraction of validation examples predicted incorrectly (the error rate)
print(num_wrong / len(diff))

Using SGD I get a validation accuracy of approximately 70%, quite similar to that obtained in part (a), but possibly with less overfitting. Below is the code I used for testing and to generate my predictions for Kaggle. My Kaggle submission scored 71.749% (username Machine Learning King Jr.).

In [ ]:
valid_set = np.loadtxt('data/joke_data/query.txt', delimiter=',')
valid_set = valid_set.astype(int)
U, s, V = np.linalg.svd(jokes, full_matrices=False)
s[4:] = 0
S = np.zeros((100, 100))
S[:100, :100] = np.diag(s)
R = U @ S @ V
R = np.where(R < .5, 0, 1)
preds = R[valid_set[:, 1] - 1, valid_set[:, 2] - 1]
ids = valid_set[:, 0]
predictions = np.hstack((ids[:, None], preds[:, None]))
print(predictions)
np.savetxt('data/joke_preds.csv', predictions, fmt='%d', delimiter=',')