K-means Clustering

Features of dataset:

  • eruptions - eruption time in minutes
  • waiting - waiting time to next eruption in minutes.

  • Task: given this eruption data, group the eruptions into clusters and predict the cluster of a particular eruption.

    Import required libraries

    # For mathematical calculation
    import numpy as np
    # For handling datasets
    import pandas as pd
    # For plotting graphs
    from matplotlib import pyplot as plt
    # Import the sklearn library for KMeans Clustering
    from sklearn.cluster import KMeans

    Import dataset

    # Load the eruption data from the csv file into a DataFrame
    df = pd.read_csv('data.csv')
    # Preview the first rows; the parenthesised form works on Python 2 and 3
    print(df.head())
       eruptions  waiting
    0      3.600       79
    1      1.800       54
    2      3.333       74
    3      2.283       62
    4      4.533       85

    Train the model

    # Assign the number of clusters
    k = 2
    kmeans = KMeans(n_clusters=k)
    # Train the model on the loaded dataset (fit returns the fitted estimator)
    kmeans = kmeans.fit(df)
    # Array that contains the cluster number assigned to each training row
    labels = kmeans.labels_
    # Array of size k with the co-ordinates of the centroids
    centroids = kmeans.cluster_centers_

    Test the model

    # Prepare the test data: each row is [eruptions, waiting].
    # NOTE: only the first three points of the original listing survive; the
    # six-element output shown below implies three more rows that were lost.
    x_test = [[4.671, 67], [2.885, 61], [1.666, 90]]
    # Test the model (returns the cluster number for each point)
    prediction = kmeans.predict(x_test)
    print(prediction)
    [0 0 1 0 1 0]
    As the value of k is 2,
    there are only two clusters: 0 and 1.

    Plot the clusters.

    # Plot every data point, coloured by the cluster it was assigned to
    colors = ['blue', 'red', 'green', 'black']
    for row, cluster in enumerate(labels):
        # colors is indexed by cluster number, so k must not exceed len(colors)
        plt.scatter(df.iloc[row, 0], df.iloc[row, 1], color=colors[cluster])
    for x in range(k):
        # Plot the centroid as a black 'x' marker
        lines = plt.plot(centroids[x, 0], centroids[x, 1], 'kx')
        # Make the centroid larger and its strokes thicker
        plt.setp(lines, ms=15.0, mew=2.0)
    title = ('No of clusters (k) = {}').format(k)
    # Apply the title and axis labels, then display the figure
    plt.title(title)
    plt.xlabel('eruptions (mins)')
    plt.ylabel('waiting (mins)')
    plt.show()


