In [1]:
import matplotlib.pyplot as plt
%matplotlib notebook
import pandas as pd
import numpy as np
# Read only the spreadsheet columns at 0-based positions 4, 5 and 6; the preview
# below shows these are WEIGHT, HEIGHT and ADIPOSITY.
df = pd.read_excel(
    "BodyFat.xls",
    usecols=[4, 5, 6],
)
df.head()
Out[1]:
WEIGHT HEIGHT ADIPOSITY
0 154.25 67.75 23.7
1 173.25 72.25 23.4
2 154.00 66.25 24.7
3 184.75 72.25 24.9
4 184.25 71.25 25.6

Interactive 3D scatter plot

In [11]:
from mpl_toolkits.mplot3d import Axes3D
%matplotlib notebook
# 3D scatter of the three measurements, one point per row of df.
# Figure.gca() no longer accepts keyword arguments (removed in Matplotlib 3.6),
# so the 3D axes are created with add_subplot, which works on old and new releases.
scatter3d = plt.figure().add_subplot(111, projection='3d')
scatter3d.scatter(df.WEIGHT, df.HEIGHT, df.ADIPOSITY)
scatter3d.set_xlabel('Weight')
scatter3d.set_ylabel('Height')
scatter3d.set_zlabel('Adiposity')
plt.show()

Scatter plot matrix

In [12]:
import seaborn as sns
# Scatter plot matrix over every pair of columns in df (WEIGHT, HEIGHT, ADIPOSITY).
# The call returns a seaborn PairGrid, which is what the cell output echoes.
sns.pairplot(df)
Out[12]:
<seaborn.axisgrid.PairGrid at 0x2c336a81358>

Mean vector, covariance matrix and correlation matrix

In [13]:
# Per-column mean of WEIGHT, HEIGHT and ADIPOSITY.
# Computed on the full df, i.e. before the height outlier is filtered out for PCA.
df.mean()
Out[13]:
WEIGHT       178.924405
HEIGHT        70.148810
ADIPOSITY     25.436905
dtype: float64
In [14]:
# Pairwise covariance matrix of the three columns (full df, outlier included).
# WEIGHT's variance is far larger than the others', which is why the PCA below
# is run on standardized data.
df.cov()
Out[14]:
WEIGHT HEIGHT ADIPOSITY
WEIGHT 863.722719 33.185647 95.137383
HEIGHT 33.185647 13.416513 -0.332605
ADIPOSITY 95.137383 -0.332605 13.308712
In [15]:
# Pairwise correlation matrix of the three columns (full df, outlier included).
df.corr()
Out[15]:
WEIGHT HEIGHT ADIPOSITY
WEIGHT 1.000000 0.308279 0.887352
HEIGHT 0.308279 1.000000 -0.024891
ADIPOSITY 0.887352 -0.024891 1.000000

Principal components analysis on standardized data

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Exclude the height outlier: keep only rows whose HEIGHT is above 40.
df_no_outlier = df.loc[df.HEIGHT > 40]

# Standardize every column (zero mean, unit variance) so that WEIGHT's much
# larger variance does not dominate the principal components.
data_standardized = StandardScaler().fit_transform(df_no_outlier)

# Keep all three components; this is a rotation of the standardized data,
# not yet a dimensionality reduction.
pca = PCA(n_components=3)
principal_components = pca.fit_transform(data_standardized)

# fit_transform returns a bare array, so re-attach the row labels of the
# filtered frame. Without index=..., the scores would be renumbered 0..n-1 and
# would no longer line up with df / df_no_outlier after the removed row.
df_principal = pd.DataFrame(
    data=principal_components,
    index=df_no_outlier.index,
    columns=['pc1', 'pc2', 'pc3'],
)
%matplotlib notebook
# 3D scatter of the data expressed in principal-component coordinates.
# Figure.gca() no longer accepts keyword arguments (removed in Matplotlib 3.6),
# so the 3D axes are created with add_subplot, which works on old and new releases.
scatter3d = plt.figure().add_subplot(111, projection='3d')
scatter3d.scatter(df_principal.pc1, df_principal.pc2, df_principal.pc3)
scatter3d.set_xlabel('principal component 1')
scatter3d.set_ylabel('principal component 2')
scatter3d.set_zlabel('principal component 3')
plt.show()
In [18]:
# Covariance matrix of the principal-component scores. The off-diagonal entries
# are zero up to floating-point noise (~1e-17), i.e. the components are
# uncorrelated, and the diagonal shows pc3 carries almost no variance.
df_principal.cov()
Out[18]:
pc1 pc2 pc3
pc1 2.038862e+00 -2.079594e-17 4.709144e-18
pc2 -2.079594e-17 9.688595e-01 5.254716e-18
pc3 4.709144e-18 5.254716e-18 4.278847e-03

Projection of data to 2 dimensions

In [19]:
# Build the 2-D projection as a NEW frame. A plain `df_projected = df_principal`
# only creates a second name for the same object, so zeroing pc3 would also
# erase it from df_principal and make the earlier cells that use df_principal
# (its scatter plot and .cov()) give different results when re-run.
df_projected = df_principal.assign(pc3=0)  # zero out principal component 3
%matplotlib notebook
# 3D scatter of the projected data: pc3 is zero for every point, so the cloud
# lies flat in the pc1-pc2 plane.
# Figure.gca() no longer accepts keyword arguments (removed in Matplotlib 3.6),
# so the 3D axes are created with add_subplot, which works on old and new releases.
scatter3d = plt.figure().add_subplot(111, projection='3d')
scatter3d.scatter(df_projected.pc1, df_projected.pc2, df_projected.pc3)
scatter3d.set_xlabel('principal component 1')
scatter3d.set_ylabel('principal component 2')
scatter3d.set_zlabel('principal component 3')
plt.show()