In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import scipy.stats as stats
df = pd.read_csv("121278.csv")

CS 361 students answered the survey below.

Q1: What's your name? (student names removed for privacy)

Q2: We will roll a 6-sided die. Guess how many rolls will it take to see the first 6.

Q3: If the prize were $100, what's the most you would bet (in dollars) to play this game?

Q4: Have you ever been to Las Vegas?

Mean, standard deviation and variance

In [2]:
plt.subplot(1, 2, 1)
plt.hist(df["Q2:Rolls"], bins=np.arange(0, 101, 10), color="r", ec="k")
plt.xlabel("Rolls")
plt.ylabel("Count")
plt.title("Histogram of rolls")
plt.subplot(1, 2, 2)
plt.hist(df["Q3:Bet"], bins=np.arange(0, 100000001, 10000000), color="b", ec="k")
plt.xlabel("Bet")
plt.ylabel("Count")
plt.title("Histogram of bet")
plt.subplots_adjust(bottom=0, top=1, left=-0.5, right=1.5)
In [3]:
print("ROLLS")
print("Mean:", df["Q2:Rolls"].mean())
print("Standard deviation:", df["Q2:Rolls"].std(ddof=0))
print("Variance:", df["Q2:Rolls"].var())
print("BETS")
print("Mean:", df["Q3:Bet"].mean())
print("Standard deviation:", df["Q3:Bet"].std(ddof=0))
print("Variance:", df["Q3:Bet"].var())
ROLLS
Mean: 6.603305785123967
Standard deviation: 11.555456897635292
Variance: 134.64132231404963
BETS
Mean: 826927.3379889808
Standard deviation: 9053222.917558927
Variance: 82643852238305.94

Standard coordinates

In [4]:
rolls_standardized = (df["Q2:Rolls"]-df["Q2:Rolls"].mean())/df["Q2:Rolls"].std(ddof=0)
bets_standardized = (df["Q3:Bet"]-df["Q3:Bet"].mean())/df["Q3:Bet"].std(ddof=0)
plt.subplot(1, 2, 1)
plt.hist(rolls_standardized, bins=10, color="r", ec="k")
plt.xlabel("Rolls")
plt.ylabel("Count")
plt.title("Histogram of standardized rolls")
plt.subplot(1, 2, 2)
plt.hist(bets_standardized, bins=10, color="b", ec="k")
plt.xlabel("Bet")
plt.ylabel("Count")
plt.title("Histogram of standardized bet")
plt.subplots_adjust(bottom=0, top=1, left=-0.5, right=1.5)

Median and interquartile range

In [5]:
print("ROLLS")
print("Median:", df["Q2:Rolls"].median())
print("Interquartile range:", stats.iqr(df["Q2:Rolls"]))
print("BETS")
print("Median:", df["Q3:Bet"].median())
print("Interquartile range:", stats.iqr(df["Q3:Bet"]))
ROLLS
Median: 4.0
Interquartile range: 3.0
BETS
Median: 10.0
Interquartile range: 10.0

Box plots and outliers

In [6]:
plt.subplot(1, 2, 1)
plt.boxplot(df["Q2:Rolls"], labels=["rolls"])
plt.title("Box plot of rolls")
plt.subplot(1, 2, 2)
plt.boxplot(df["Q3:Bet"], labels=["bet"])
plt.title("Box plot of bet")
plt.subplots_adjust(bottom=0, top=1, left=-0.5, right=1.5)

Removing outliers

In [7]:
df_no_outliers = df[df["Q2:Rolls"] <= np.percentile(df["Q2:Rolls"],75)+1.5*stats.iqr(df["Q2:Rolls"])]
df_no_outliers = df_no_outliers[df_no_outliers["Q3:Bet"] <= np.percentile(df["Q3:Bet"],75)+1.5*stats.iqr(df["Q3:Bet"])]
df_no_outliers.count()
Out[7]:
Q2:Rolls    98
Q3:Bet      98
Q4:Vegas    98
dtype: int64
In [8]:
plt.subplot(1, 2, 1)
plt.hist(df_no_outliers["Q2:Rolls"], bins=np.arange(0.5, 9.5, 1), color="r", ec="k")
plt.xlabel("Rolls")
plt.ylabel("Count")
plt.title("Histogram of rolls, outliers removed")
plt.subplot(1, 2, 2)
plt.hist(df_no_outliers["Q3:Bet"], bins=8, color="b", ec="k")
plt.xlabel("Bet")
plt.ylabel("Count")
plt.title("Histogram of bet, outliers removed")
plt.subplots_adjust(bottom=0, top=1, left=-0.5, right=1.5)

Sensitivity of summary statistics to outliers

In [9]:
print("ROLLS")
print("Mean:", df_no_outliers["Q2:Rolls"].mean(), "(used to be", df["Q2:Rolls"].mean(), ")")
print("Standard deviation:", df_no_outliers["Q2:Rolls"].std(ddof=0), "(used to be", df["Q2:Rolls"].std(ddof=0), ")")
print("Median:", df_no_outliers["Q2:Rolls"].median(), "(used to be", df["Q2:Rolls"].median(), ")")
print("Interquartile range:", stats.iqr(df_no_outliers["Q2:Rolls"]), "(used to be", stats.iqr(df["Q2:Rolls"]), ")")
print("BETS")
print("Mean:", df_no_outliers["Q3:Bet"].mean(), "(used to be", df["Q3:Bet"].mean(), ")")
print("Standard deviation:", df_no_outliers["Q3:Bet"].std(ddof=0), "(used to be", df["Q3:Bet"].std(ddof=0), ")")
print("Median:", df_no_outliers["Q3:Bet"].median(), "(used to be", df["Q3:Bet"].median(), ")")
print("Interquartile range:", stats.iqr(df_no_outliers["Q3:Bet"]), "(used to be", stats.iqr(df["Q3:Bet"]), ")")
ROLLS
Mean: 4.153061224489796 (used to be 6.603305785123967 )
Standard deviation: 2.071931174573598 (used to be 11.555456897635292 )
Median: 4.0 (used to be 4.0 )
Interquartile range: 3.0 (used to be 3.0 )
BETS
Mean: 10.825476190510201 (used to be 826927.3379889808 )
Standard deviation: 6.640052305133587 (used to be 9053222.917558927 )
Median: 10.0 (used to be 10.0 )
Interquartile range: 11.0 (used to be 10.0 )

Visualizing relationships in data

Stock trading

Data from Yahoo Finance (https://finance.yahoo.com/quote/FDX/history and https://finance.yahoo.com/quote/UPS/history)

In [10]:
fdx = pd.read_csv("FDX.csv").rename(index=str, columns={"Adj Close": "FDX"}).filter(['Date','FDX'])
ups = pd.read_csv("UPS.csv").rename(index=str, columns={"Adj Close": "UPS"}).filter(['UPS'])
stock_data = pd.concat([fdx, ups], axis=1, sort=False)
stock_data['Date'] = pd.to_datetime(stock_data['Date'], infer_datetime_format=True)
stock_data.tail()
Out[10]:
Date FDX UPS
247 2019-01-09 170.589996 97.919998
248 2019-01-10 170.119995 98.910004
249 2019-01-11 170.990005 97.910004
250 2019-01-14 171.550003 99.139999
251 2019-01-15 170.630005 97.559998

Plotting time series data

In [11]:
plt.figure(figsize=(12,5))
plt.plot(stock_data.FDX, color="mediumorchid", linestyle="--", label='FDX')
plt.plot(stock_data.UPS, color="chocolate", linestyle="-", label='UPS')
plt.legend()
plt.xticks([0, 251], ["Jan 16 2018", "Jan 15 2019"])
plt.ylabel("Price in USD")
plt.title("Daily stock prices 1/16/2018-1/15/2019: FDX and UPS")
Out[11]:
Text(0.5, 1.0, 'Daily stock prices 1/16/2018-1/15/2019: FDX and UPS')

Standardization

In [12]:
stock_data_standardized = stock_data.copy()
stock_data_standardized.FDX = (stock_data.FDX-stock_data.FDX.mean())/stock_data.FDX.std(ddof=0)
stock_data_standardized.UPS = (stock_data.UPS-stock_data.UPS.mean())/stock_data.UPS.std(ddof=0)
In [13]:
plt.figure(figsize=(12,5))
plt.plot(stock_data_standardized.FDX, color="mediumorchid", linestyle="--", label='FDX')
plt.plot(stock_data_standardized.UPS, color="chocolate", linestyle="-", label='UPS')
plt.legend()
plt.xticks([0, 251], ["Jan 16 2018", "Jan 15 2019"])
plt.ylabel("Standardized Price")
plt.title("Standardized daily stock prices 1/16/2018-1/15/2019: FDX and UPS")
Out[13]:
Text(0.5, 1.0, 'Standardized daily stock prices 1/16/2018-1/15/2019: FDX and UPS')

Scatter plots

In [14]:
plt.subplot(1, 2, 1)
plt.scatter(stock_data.FDX, stock_data.UPS, color="red")
plt.title("Scatter plot of UPS vs FDX stock prices 1/16/2018-1/15/2019")
plt.xlabel("FDX stock price (USD)")
plt.ylabel("UPS stock price (USD)")
plt.subplot(1, 2, 2)
plt.scatter(stock_data_standardized.FDX, stock_data_standardized.UPS, color="blue")
plt.title("Scatter plot of standardized UPS vs FDX stock prices 1/16/2018-1/15/2019")
plt.xlabel("FDX stock price (USD)")
plt.ylabel("UPS stock price (USD)")
plt.subplots_adjust(bottom=0, top=1, left=-0.5, right=1.5)

Correlation coefficient

In [15]:
print(stock_data.corr())
          FDX       UPS
FDX  1.000000  0.693917
UPS  0.693917  1.000000

Back to Rolls and Bets

In [16]:
plt.subplot(1, 2, 1)
plt.scatter(df["Q2:Rolls"], df["Q3:Bet"], color="red")
plt.title("Scatter plot of Bet vs Rolls")
plt.xlabel("Rolls")
plt.ylabel("Bet")
plt.subplot(1, 2, 2)
plt.scatter(df_no_outliers["Q2:Rolls"], df_no_outliers["Q3:Bet"], color="blue")
plt.title("Scatter plot of Bet vs Rolls, outliers removed")
plt.xlabel("Rolls")
plt.ylabel("Bet")
plt.subplots_adjust(bottom=0, top=1, left=-0.5, right=1.5)
In [17]:
print("WITH OUTLIERS")
print(df.corr())
print("WITHOUT OUTLIERS")
print(df_no_outliers.corr())
WITH OUTLIERS
          Q2:Rolls    Q3:Bet
Q2:Rolls  1.000000  0.737814
Q3:Bet    0.737814  1.000000
WITHOUT OUTLIERS
          Q2:Rolls    Q3:Bet
Q2:Rolls  1.000000 -0.125096
Q3:Bet   -0.125096  1.000000