Project 3

B-IT Pattern Recognition

Presented on 4-Feb-2016 by:

  • Abdullah Abdullah

  • Can Güney Aksakallı

  • Kang Cifong

  • Umut Hatipoğlu


In [1]:
import numpy as np
import timeit
import time
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

from sklearn.cluster import KMeans
from scipy.cluster.vq import kmeans, kmeans2

import pattrex.plotting_mpl as plt_rex
import pattrex.dimred as dim_rex
import pattrex.fun_with_k_means as km_rex
import pattrex.SpectralClustering as sc_rex
import pattrex.SpectralClustering_AndrewNg as scan_rex
from pattrex.demo_helper import read_whdata
import pattrex.preprocessing as pre_rex

Task 3.1

Fun with k-means clustering


In [3]:
def demo_1_data():
    data = np.genfromtxt("./data/data-clustering-1.csv", delimiter=',')
    print("{} samples of {} dimensional data".format(*(data.T).shape))
    
    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111)
    
    plt_rex.plot2d(data, colwise_data=True, show=False, hatch='k.', 
                   title="Data", axs=ax)
    
    return data.T

The Data

In [6]:
mydata = demo_1_data()
200 samples of 2 dimensional data
In [4]:
def demo_1_init(data, k, seed=9000):
    nX, mX = data.shape
    np.random.seed(seed)
    
    # k random choices from the data
    m1 = np.copy(data[np.random.choice(np.arange(nX), size=k)])
    
    np.random.seed(seed + seed)
    
    # another k random choices, under a different seed
    m2 = np.copy(data[np.random.choice(np.arange(nX), size=k)])
    
    # explicit init
    m3 = np.array([
            [2, 2],
            [0, 2],
            [2, 0]
        ])
    
    return m1, m2, m3
In [5]:
def demo_1_lloyd(data, k):
    inits = demo_1_init(data, k, seed=800)
    titles = [
        "Lloyd's - {} random choices to init 1".format(k),
        "Lloyd's - {} random choices to init 2".format(k),
        "Lloyd's - human init",
    ]
    fig = plt.figure(figsize=(15, 5))
    sp = [1, 3, 0]
    
    for i, t in zip(inits, titles):
        try:
            m, l = km_rex.lloyd2(data, i, verbose=True)
        except UserWarning:
            print("Did not converge for {}".format(t))
            continue  # m, l would be undefined or stale; skip plotting this panel
        
        # plotting
        sp[-1] += 1
        ax = fig.add_subplot(*sp)
        
        h_d = ['r.', 'g.', 'b.']
        h_m = ['rs', 'gs', 'bs']
        h_i = ['ko', 'ko', 'ko']
        for c, hd, hm, hi in zip(range(k), h_d, h_m, h_i):
            plt_rex.plot2d(data[l == c], False, show=False, axs=ax,
                           hatch=hd, 
                           title=t)
            plt_rex.plot2d(m[c, :].reshape(1, m.shape[1]), False, show=False, 
                           axs=ax, hatch=hm)
            plt_rex.plot2d(i[c, :].reshape(1, m.shape[1]), False, show=False, 
                           axs=ax, hatch=hi)
            
def demo_1_hart(data, k, seeds):
    titles = ["Hartigan's - Random Seed {}".format(s) for s in seeds]
    fig = plt.figure(figsize=(15, 5))
    sp = [1, 3, 0]
    
    for s, t in zip(seeds, titles):
        try:
            m, l = km_rex.hartigan2(data, k, seed=s)
        except UserWarning:
            print("Did not converge for {}".format(t))
            continue  # m, l would be undefined or stale; skip plotting this panel
        
        # plotting
        sp[-1] += 1
        ax = fig.add_subplot(*sp)
        
        h_d = ['r.', 'g.', 'b.']
        h_m = ['rs', 'gs', 'bs']
        for c, hd, hm in zip(range(k), h_d, h_m):
            plt_rex.plot2d(data[l == c], False, show=False, axs=ax,
                           hatch=hd, 
                           title=t)
            plt_rex.plot2d(m[c, :].reshape(1, m.shape[1]), False, show=False, 
                           axs=ax, hatch=hm)

def demo_1_macqueen(data, k, seeds):
    titles = ["MacQueen's"] + \
        ["MacQueen's - Random shuffle {}".format(s) for s in seeds]
    fig = plt.figure(figsize=(15, 5))
    sp = [1, 3, 0]
    
    datas = [np.copy(data)]
    data = np.copy(data)  # shuffle a copy, not the caller's array
    for s in seeds:
        np.random.seed(s)
        np.random.shuffle(data)
        datas.append(np.copy(data))
        
    
    for d, t in zip(datas, titles):
        try:
            m, l = km_rex.mcqueen2(d, k)
        except UserWarning:
            print("Did not converge for {}".format(t))
            continue  # m, l would be undefined or stale; skip plotting this panel
        
        # plotting
        sp[-1] += 1
        ax = fig.add_subplot(*sp)
        
        h_d = ['r.', 'g.', 'b.']
        h_m = ['rs', 'gs', 'bs']
        h_i = ['ko', 'ko', 'ko']
        i = d[:k, :]
        for c, hd, hm, hi in zip(range(k), h_d, h_m, h_i):
            plt_rex.plot2d(d[l == c], False, show=False, axs=ax,
                           hatch=hd, 
                           title=t)
            plt_rex.plot2d(m[c, :].reshape(1, m.shape[1]), False, show=False, 
                           axs=ax, hatch=hm)
            plt_rex.plot2d(i[c, :].reshape(1, m.shape[1]), False, show=False, 
                           axs=ax, hatch=hi)

Lloyd's Algorithm

  • Very sensitive to initialization values

  • Converges, but with no guarantees (especially for bad initializations)

  • No guarantee about the quality of the results either

  • Really fast (if nothing catastrophic happens); a minimal sketch follows below
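
A minimal sketch of the two alternating steps, assuming Euclidean distance and ignoring empty clusters (illustrative only; the actual implementation used here is km_rex.lloyd2):

import numpy as np
from scipy.spatial.distance import cdist

def lloyd_sketch(data, means, max_iter=100):
    """Lloyd's k-means on (n, d) data with (k, d) initial means."""
    for _ in range(max_iter):
        # assignment step: each sample goes to its nearest mean
        labels = np.argmin(cdist(data, means), axis=1)
        # update step: each mean becomes the centroid of its samples
        new_means = np.array([data[labels == c].mean(axis=0)
                              for c in range(means.shape[0])])
        if np.allclose(new_means, means):
            return new_means, labels
        means = new_means
    raise UserWarning("Did not converge")  # matches how lloyd2 is used above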

In [6]:
demo_1_lloyd(mydata, 3)
Converged after 4 iterations
Converged after 9 iterations
Converged after 3 iterations
In [7]:
def demo_1_lloyd2(data, k, dist, seed):
    i = demo_1_init(data, k, seed=seed)[0]
    titles = [
        "Lloyd's - random choices to init - {}".format(d) for d in dist]
    fig = plt.figure(figsize=(15, 5))
    sp = [1, 3, 0]
    
    for d, t in zip(dist, titles):
        try:
            m, l = km_rex.lloyd2(data, i, verbose=True, metric=d)
        except UserWarning:
            print("Did not converge for {}".format(t))
            continue  # m, l would be undefined or stale; skip plotting this panel
        
        # plotting
        sp[-1] += 1
        ax = fig.add_subplot(*sp)
        
        h_d = ['r.', 'g.', 'b.']
        h_m = ['rs', 'gs', 'bs']
        h_i = ['ko', 'ko', 'ko']
        for c, hd, hm, hi in zip(range(k), h_d, h_m, h_i):
            plt_rex.plot2d(data[l == c], False, show=False, axs=ax,
                           hatch=hd, 
                           title=t)
            plt_rex.plot2d(m[c, :].reshape(1, m.shape[1]), False, show=False, 
                           axs=ax, hatch=hm)
            plt_rex.plot2d(i[c, :].reshape(1, m.shape[1]), False, show=False, 
                           axs=ax, hatch=hi)

Different Similarity Measures

  • The data does seem to consist of Gaussian blobs

    • The problem with this data set lies elsewhere
  • A different similarity metric will probably not give different results (sketched below)

    • Except for relatively badly chosen similarity metrics
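
Assuming the metric argument is simply passed through to scipy.spatial.distance.cdist (a sketch reusing the lloyd_sketch names above, not lloyd2's actual internals), only the assignment step changes; VI is cdist's inverse-covariance parameter for the Mahalanobis case:

from scipy.spatial.distance import cdist

# same nearest-mean assignment, different notion of "closest"
labels_euc = np.argmin(cdist(data, means, metric='euclidean'), axis=1)
labels_cb = np.argmin(cdist(data, means, metric='cityblock'), axis=1)
labels_mah = np.argmin(cdist(data, means, metric='mahalanobis',
                             VI=np.linalg.inv(np.cov(data.T))), axis=1)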
In [50]:
demo_1_lloyd2(mydata, 3, ['euclidean', 'cityblock', 'mahalanobis'], 800)
demo_1_lloyd2(mydata, 3, ['euclidean', 'cityblock', 'mahalanobis'], 10)
Converged after 5 iterations
Converged after 6 iterations
Converged after 5 iterations
Converged after 5 iterations
Converged after 4 iterations
Converged after 5 iterations

Hartigan's Algorithm

  • Converges quickly

  • Relatively robust

  • Still sensitive to the initialization of classes; see the sketch below
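
A naive sketch of Hartigan's scheme, assuming the objective is the summed distance of each sample to its class mean (illustrative only; km_rex.hartigan2 uses the incremental shortcut discussed below, and empty classes are not handled):

import numpy as np

def cost(data, labels, means):
    # total within-cluster cost: summed distance of each sample to its mean
    return sum(np.linalg.norm(data[labels == c] - means[c], axis=1).sum()
               for c in range(len(means)))

def hartigan_sketch(data, k, seed=0, max_sweeps=50):
    rng = np.random.RandomState(seed)
    labels = rng.randint(k, size=len(data))  # random initialization of classes
    recompute = lambda l: np.array([data[l == c].mean(axis=0) for c in range(k)])
    means = recompute(labels)
    for _ in range(max_sweeps):
        changed = False
        for i in range(len(data)):
            # try every class for sample i and keep the cheapest assignment
            costs = []
            for c in range(k):
                trial = labels.copy()
                trial[i] = c
                costs.append(cost(data, trial, recompute(trial)))
            best = int(np.argmin(costs))
            if best != labels[i]:
                labels[i] = best
                means = recompute(labels)
                changed = True
        if not changed:
            return means, labels
    raise UserWarning("Did not converge")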

In [10]:
demo_1_hart(mydata, 3, [12, 42, 999])  # These took some time to choose

Smarter Way?

  • We could not figure out anything smarter than the following:

    • only recalculate the objective function for the current class

      • Not reliable, especially when the data is spread disproportionately among the classes
    • calculate the change in the objective function incrementally

    • This halves the number of data points for which distances are computed, compared to the naive version

      • It still does not fully utilize the potential, e.g. vectorization
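
The fragment below shows that incremental core (norm is numpy.linalg.norm; Xki holds the members of class ki including the sample x, n the class counts, e the per-class objectives): pull x out of its current class ki, update that mean in closed form, and evaluate the change in the objective for every candidate class at once.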

m[ki] = (n[ki] / (n[ki] - 1)) * (m[ki] - x / n[ki])  # mean of class ki without x

normxkk = norm(x - m, axis=1)  # distance of x to every class mean
eki = np.sum(norm(Xki - m[ki], axis=1)) - normxkk[ki]  # objective of ki without x
ediffki = eki - e[ki]  # change in the objective from removing x from ki

ediff = ediffki + normxkk  # total change for every candidate class of x

kw = np.argmin(ediff)  # the winning class for x

MacQueen's Algorithm

  • Convenient for streams

  • Sensitive to the order of the data

    • Essentially, still sensitive to initialization; see the sketch below
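
A minimal sketch of the online update, assuming the first k samples seed the means (illustrative; km_rex.mcqueen2 may do more work, hence the convergence warning handled above):

import numpy as np

def macqueen_sketch(data, k):
    means = data[:k].astype(float).copy()  # the first k samples seed the means
    counts = np.ones(k)
    labels = np.empty(len(data), dtype=int)
    labels[:k] = np.arange(k)
    for i in range(k, len(data)):
        # assign the incoming sample to its nearest mean ...
        c = int(np.argmin(np.linalg.norm(means - data[i], axis=1)))
        # ... then pull that mean towards it (running-mean update)
        counts[c] += 1
        means[c] += (data[i] - means[c]) / counts[c]
        labels[i] = c
    return means, labels

Because every sample moves a mean as it arrives, a shuffled copy of the same data can settle in a different local optimum, which is exactly what the shuffle panels below demonstrate.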
In [51]:
demo_1_macqueen(mydata, 3, [12, 556])
In [17]:
def demo_1_t():
    print("Mac OSX - 10.11.3")
    print("2,9 GHz Intel Core i7 (On Battery)")
    print("Python 3.4")
    print("\nLloyd - Naive")
    %timeit km_rex.kmeans_Lloyd(mydata, 3, init_c, save_plot=False)
    
    print("\nLloyd - 2")
    %timeit km_rex.lloyd2(mydata, init_c, verbose=False, metric='c')
    
    print("\nLloyd - sklearn.cluster.KMeans")
    %timeit KMeans(n_clusters=3, init=init_c).fit(mydata)
    
    print("\n?? - scipy.cluster.vq.kmeans")
    %timeit kmeans(mydata, init_c, check_finite=False)
    
    print("\n?? - scipy.cluster.vq.kmeans2")
    %timeit kmeans2(mydata, init_c, minit='points', check_finite=False)
    
    print("\nHartigan - Naive")
    %timeit km_rex.kmeans_hartigans(mydata, 3, save_plot=False, show_plot=False)
    
    print("\nHartigan - 2")
    %timeit km_rex.hartigan2(mydata, 3, seed=9000)
    
    print("\nMacQueen - Naive")
    %timeit km_rex.kmeans_macqueen(mydata, 3, save_plot=False)
    
    print("\nMacQueen - 2 (numpy-ed)")
    %timeit km_rex.mcqueen2(mydata, 3)
    
    t = [
        (58e-3, "LN"),
        (964e-6, "L2"),
        (1.87e-3, "Lsk"),
        (693e-6, "?sp"),
        (827e-6, "?sp2"),
        (352e-3, "HN"),
        (50.8e-3, "H2"),
        (17.8e-3, "MN"),
        (9.13e-3, "M2")
    ]
    
    tt = [t_[0] for t_ in t]
    tl = [t_[1] for t_ in t]
    
    fig, ax = plt.subplots()
    ax.bar(np.arange(len(tt)), tt)
    ax.set_xticks(np.arange(len(tt)))  # align the tick labels with the bars
    ax.set_xticklabels(tl)
    

Run times!

In [18]:
np.random.seed(9000)
init_c = np.copy(mydata[np.random.choice(np.arange(mydata.shape[0]), size=3)])
demo_1_t()
Mac OSX - 10.11.3
2,9 GHz Intel Core i7 (On Battery)
Python 3.4

Lloyd - Naive
10 loops, best of 3: 57.9 ms per loop

Lloyd - 2
1000 loops, best of 3: 972 µs per loop

Lloyd - sklearn.cluster.KMeans
1000 loops, best of 3: 1.82 ms per loop

?? - scipy.cluster.vq.kmeans
1000 loops, best of 3: 683 µs per loop

?? - scipy.cluster.vq.kmeans2
1000 loops, best of 3: 796 µs per loop

Hartigan - Naive
1 loops, best of 3: 332 ms per loop

Hartigan - 2
10 loops, best of 3: 52 ms per loop

MacQueen - Naive
100 loops, best of 3: 17.2 ms per loop

MacQueen - 2 (numpy-ed)
100 loops, best of 3: 8.88 ms per loop
/Users/myrmidon/.conda/envs/pattrex/lib/python3.4/site-packages/sklearn/cluster/k_means_.py:794: RuntimeWarning: Explicit initial center position passed: performing only one init in k-means instead of n_init=10
  n_jobs=self.n_jobs)

Task 3.2

Spectral Clustering


Syllabus

  • Apply K-means

  • Apply Spectral Clustering

  • Apply Spectral Clustering using Andrew Ng's Algorithm

In [19]:
def demo_2_1():
    my_data = np.genfromtxt('data/data-clustering-2.csv', delimiter=',')

    # plotting
    fig = plt.figure(figsize=(12, 8))
    axs = fig.add_subplot(111)

    plt_rex.plot2d(my_data, colwise_data=True, hatch='bo', 
                  show=False, axs=axs, set_aspect_equal=False, 
                   title="data")
    
    return my_data
In [20]:
my_data = demo_2_1()

Apply K-means on Data

In [21]:
def demo_2_2(my_data):
    k = 2
    data = my_data.T
    centroidsInit = np.array([[1, 2], [3, 4]])
    centroids, idx = km_rex.lloyd2(data, centroidsInit, verbose=False)
    km_rex.show_plotted_cluster(data, idx, centroids, "Lloyd's algorithm", k)

    # Hartigan's algorithm
    centroids, idx = km_rex.hartigan2(data, k)
    km_rex.show_plotted_cluster(data, idx, centroids, "Hartigan's algorithm", k)

    # MacQueen's algorithm
    centroids, idx = km_rex.mcqueen2(data, k)
    km_rex.show_plotted_cluster(data, idx, centroids, "MacQueen's algorithm", k)
In [22]:
demo_2_2(my_data)

Apply Spectral Clustering on Data

  • We get a good result at beta = 11
  • By observation, some edge points get misjudged as beta grows from 1 to 15
  • The upper half contains 100 points, and so does the lower half

Play around with the halves

  • See how the number of points in each half changes with beta
In [23]:
def demo_2_4(my_data):
    for i in np.arange(1, 20):
        ur, index, u_idx_pos, u_idx_neg = sc_rex.SpectralClustering(my_data, i)
        # report the size of the larger half for this beta
        if len(u_idx_pos[0]) >= len(u_idx_neg[0]):
            print(i, len(u_idx_pos[0]))
        else:
            print(i, len(u_idx_neg[0]))
        if len(u_idx_pos[0]) == len(u_idx_neg[0]):
            print("Got", i, " with 100 each")
            sc_rex.plot(my_data, ur, index, u_idx_pos, u_idx_neg)
In [24]:
demo_2_4(my_data)
1 103
2 103
3 101
4 100
Got 4  with 100 each
5 100
Got 5  with 100 each
6 101
7 101
8 109
9 124
10 106
11 103
12 104
13 114
14 118
15 120
16 122
17 122
18 122
19 107

Examine the Laplacian Matrix

  • S_ij = exp(-beta |x_i - x_j|^2), which is independent of the data order
  • D is diagonal with D_ii = Sum_j S_ij, which is dependent on the data order
  • L = D - S (sketched below)
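
A minimal sketch of this construction, splitting by the sign of the Fiedler vector, i.e. the eigenvector for the second-smallest eigenvalue of L (names are ours; data is column-wise (2, n) as above):

import numpy as np
from scipy.spatial.distance import pdist, squareform

def spectral_bipartition(data, beta):
    X = data.T                                               # (n, d) samples
    S = np.exp(-beta * squareform(pdist(X, 'sqeuclidean')))  # similarity matrix
    D = np.diag(S.sum(axis=1))                               # degree matrix
    L = D - S                                                # unnormalized Laplacian
    w, V = np.linalg.eigh(L)                                 # eigenvalues ascending
    u = V[:, 1]                                              # Fiedler vector
    return np.where(u >= 0), np.where(u < 0)                 # the two halves

The two index sets correspond to the u_idx_pos / u_idx_neg pair returned by sc_rex.SpectralClustering above; note that the eigenvector's sign is arbitrary, so which half comes out "positive" can flip between runs.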

Shuffle the data order to see the result

  • We get a different beta, sometimes none at all, and sometimes many
  • But we see that the upper half gathers close to the y = 0 line, while the lower half spreads around
In [25]:
def demo_2_5(my_data):
    idx = np.arange(0, 200)
    np.random.shuffle(idx)  # permute the sample order
    
    for i in np.arange(1, 20):
        ur, index, u_idx_pos, u_idx_neg = sc_rex.SpectralClustering(my_data[:, idx], i)
        if len(u_idx_pos[0]) == len(u_idx_neg[0]):
            print("Got", i)
        sc_rex.plot(my_data[:, idx], ur, index, u_idx_pos, u_idx_neg)
In [26]:
demo_2_5(my_data)
Got 3
Got 4