In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import scipy.stats as stats
# Load the per-team season table and replace the abbreviated stat columns
# with readable names. `index=str` additionally converts the row labels to
# strings, so rows are labelled '0', '1', ... rather than 0, 1, ...
column_labels = {"RS": "Runs Scored", "RA": "Runs Against", "W": "Wins"}
df = pd.read_csv("baseball.csv").rename(index=str, columns=column_labels)
df.head()
Out[1]:
Team League Year Runs Scored Runs Against Wins OBP SLG BA Playoffs RankSeason RankPlayoffs G OOBP OSLG
0 ARI NL 2012 734 688 81 0.328 0.418 0.259 0 NaN NaN 162 0.317 0.415
1 ATL NL 2012 700 600 94 0.320 0.389 0.247 1 4.0 5.0 162 0.306 0.378
2 BAL AL 2012 712 705 93 0.311 0.417 0.247 1 5.0 4.0 162 0.315 0.403
3 BOS AL 2012 734 806 69 0.315 0.415 0.260 0 NaN NaN 162 0.331 0.428
4 CHC NL 2012 613 759 61 0.302 0.378 0.240 0 NaN NaN 162 0.335 0.424
In [2]:
# Side-by-side histograms of season win totals: teams that missed the
# playoffs (left, red) versus teams that qualified (right, blue).
# Both panels share the same 5-win-wide bins so their shapes are comparable.
win_bins = np.arange(40, 125, 5)
panels = [
    (0, "r", "Histogram of wins for teams that didn't go to playoffs"),
    (1, "b", "Histogram of wins for teams that did go to playoffs"),
]

# Size the figure explicitly instead of stretching the axes beyond the canvas
# with subplots_adjust(left=-0.5, right=1.5); axes placed outside the figure
# are presumably only shown in full when the backend crops to a tight
# bounding box, and would be cut off otherwise.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, (playoff_flag, color, title) in zip(axes, panels):
    ax.hist(df.loc[df["Playoffs"] == playoff_flag, "Wins"],
            bins=win_bins, color=color, ec="k")
    ax.set_xlabel("Wins")
    ax.set_ylabel("Count")
    ax.set_title(title)

# Keep the long titles and axis labels of the two panels from overlapping.
fig.tight_layout()

Correlations

In [3]:
# Narrow the table to the run and win totals, then derive each team's
# run difference (runs scored minus runs against) as a new column.
kept_columns = ['Runs Scored', 'Runs Against', 'Wins']
df = df.filter(items=kept_columns)
df = df.assign(**{'Run Difference': df['Runs Scored'] - df['Runs Against']})
df.head()
Out[3]:
Runs Scored Runs Against Wins Run Difference
0 734 688 81 46
1 700 600 94 100
2 712 705 93 7
3 734 806 69 -72
4 613 759 61 -146
In [4]:
# Pairwise Pearson correlation between all columns of the table.
df.corr(method="pearson")
Out[4]:
Runs Scored Runs Against Wins Run Difference
Runs Scored 1.000000 0.380139 0.511745 0.546294
Runs Against 0.380139 1.000000 -0.532394 -0.567048
Wins 0.511745 -0.532394 1.000000 0.937851
Run Difference 0.546294 -0.567048 0.937851 1.000000

Scatter plot

In [5]:
# Scatter of season wins against run difference on a square 8x8 figure.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df["Run Difference"], df["Wins"])
ax.set_xlabel("Run Difference")
ax.set_ylabel("Wins")
ax.set_title("Scatter plot of wins vs. run difference")
Out[5]:
Text(0.5, 1.0, 'Scatter plot of wins vs. run difference')

Prediction

In [6]:
# Overlay a prediction line on the scatter using the correlation form of
# simple linear regression: in standard units, predicted wins = r * run
# difference; the prediction is then converted back to win units.
run_diff = df["Run Difference"]
wins = df["Wins"]

# The two extreme run differences are enough to define a straight line.
x_vals = np.array([run_diff.min(), run_diff.max()])
x_vals_standardized = (x_vals - run_diff.mean()) / run_diff.std(ddof=0)

# Correlation coefficient between wins and run difference.
r = df.corr().loc["Wins", "Run Difference"]
y_predictions_standardized = r * x_vals_standardized
y_predictions = y_predictions_standardized * wins.std(ddof=0) + wins.mean()

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(run_diff, wins)
ax.set_xlabel("Run Difference")
ax.set_ylabel("Wins")
ax.set_title("Scatter plot of wins vs. run difference with prediction line")
ax.plot(x_vals, y_predictions, 'r', linewidth=5)
Out[6]:
[<matplotlib.lines.Line2D at 0x1fbdc029898>]