In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
# Load the iris table. header=None tells pandas the file has no header row,
# so the five column names are supplied explicitly.
df = pd.read_csv(
    "iris.csv",
    header=None,
    names=['sepal length', 'sepal width', 'petal length', 'petal width', 'species'],
)
# Preview the first rows as the cell output.
df.head()
Out[1]:
sepal length sepal width petal length petal width species
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa

Scatter plot matrix with different markers

In [2]:
# Pairwise scatter plots of every numeric column, coloured by species and drawn with one marker shape per species.
sns.pairplot(data=df, hue="species", markers=['D', 'o', 's'])
Out[2]:
<seaborn.axisgrid.PairGrid at 0x1be314fadd8>

Covariance matrix and its eigenvalues

In [3]:
from sklearn.preprocessing import StandardScaler

# Take the four measurement columns as a NumPy array and standardize them
# with StandardScaler in a single step.
X = StandardScaler().fit_transform(
    df.loc[:, ['sepal length', 'sepal width', 'petal length', 'petal width']].values
)

# Covariance between features: rowvar=False treats each column of X as a
# variable and each row as an observation.
cov_mat = np.cov(X, rowvar=False)
cov_mat
Out[3]:
array([[ 1.00671141, -0.11010327,  0.87760486,  0.82344326],
       [-0.11010327,  1.00671141, -0.42333835, -0.358937  ],
       [ 0.87760486, -0.42333835,  1.00671141,  0.96921855],
       [ 0.82344326, -0.358937  ,  0.96921855,  1.00671141]])
In [4]:
# Eigen-decomposition of the covariance matrix.
# cov_mat comes from np.cov and is symmetric, so use eigh (the routine for
# symmetric/Hermitian matrices): it always returns real eigenvalues, whereas
# the general-purpose eig makes no such guarantee.
eigenvals, eigenvecs = np.linalg.eigh(cov_mat)

# Reorder to decreasing eigenvalue; the same permutation is applied to the
# columns of eigenvecs so each eigenvector stays paired with its eigenvalue.
idx = np.argsort(eigenvals)[::-1]
eigenvals = eigenvals[idx]
eigenvecs = eigenvecs[:, idx]

# Plot eigenvalues in decreasing order. The x positions are derived from the
# number of eigenvalues instead of a hard-coded 1..4 range.
plt.plot(np.arange(1, len(eigenvals) + 1), eigenvals, 'ro-')
plt.title('Eigenvalues in sorted order')
plt.xlabel('Component number')
plt.ylabel('Eigenvalue')

# Show the sorted eigenvalues as the cell output.
eigenvals
Out[4]:
array([2.93035378, 0.92740362, 0.14834223, 0.02074601])

Projection onto the first two principal components

In [5]:
from sklearn.decomposition import PCA

# Fit a two-component PCA on X and project the samples onto those components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)

# Wrap the projected coordinates in a labelled DataFrame for joining/plotting.
df_pca = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)
df_pca.head()
Out[5]:
principal component 1 principal component 2
0 -2.264542 0.505704
1 -2.086426 -0.655405
2 -2.367950 -0.318477
3 -2.304197 -0.575368
4 -2.388777 0.674767
In [6]:
# Place the species labels next to the projected coordinates.
# Rows are matched on the index of df_pca and df.
df_joined = pd.concat([df_pca, df['species']], axis="columns")
df_joined.head()
Out[6]:
principal component 1 principal component 2 species
0 -2.264542 0.505704 Iris-setosa
1 -2.086426 -0.655405 Iris-setosa
2 -2.367950 -0.318477 Iris-setosa
3 -2.304197 -0.575368 Iris-setosa
4 -2.388777 0.674767 Iris-setosa
In [8]:
# Scatter of the samples on the first two principal components, coloured by
# species with one marker shape per species; fit_reg=False disables the
# regression line so this is a plain scatter plot.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so positional column names raise a TypeError there.
sns.lmplot(x='principal component 1', y='principal component 2', data=df_joined, fit_reg=False, hue="species", markers=['D','o','s'])
Out[8]:
<seaborn.axisgrid.FacetGrid at 0x1be36cf6240>