# Data from Kaggle: Moneyball MLB stats, 1962-2012 (https://www.kaggle.com/wduckett/moneyball-mlb-stats-19622012)
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import scipy.stats as stats
# Load the team statistics table and give the abbreviated columns that the
# rest of the analysis uses readable names.
COLUMN_LABELS = {'RS': 'Runs Scored', 'RA': 'Runs Against', 'W': 'Wins'}
raw_stats = pd.read_csv("baseball.csv")
# index=str passes every row label through str(), so the index holds strings.
df = raw_stats.rename(index=str, columns=COLUMN_LABELS)
# Preview the first rows (rendered when this ends a notebook cell).
df.head()
# Side-by-side histograms of win totals, split on the Playoffs flag (0 / 1).
win_bins = np.arange(40, 125, 5)  # bin edges 40, 45, ..., 120 shared by both panels
panels = [
    (0, "r", "Histogram of wins for teams that didn't go to playoffs"),
    (1, "b", "Histogram of wins for teams that did go to playoffs"),
]
for position, (playoff_flag, bar_color, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(df.loc[df["Playoffs"] == playoff_flag, "Wins"],
             bins=win_bins, color=bar_color, ec="k")
    plt.xlabel("Wins")
    plt.ylabel("Count")
    plt.title(panel_title)
# left=-0.5 / right=1.5 put the subplot edges outside the figure's 0-1 range;
# presumably this widens the two panels in inline notebook output.
plt.subplots_adjust(bottom=0, top=1, left=-0.5, right=1.5)
# Narrow the table to the three columns the run-difference analysis needs.
analysis_columns = ['Runs Scored', 'Runs Against', 'Wins']
df = df.filter(items=analysis_columns)
# Run difference: runs scored minus runs against, per row.
df['Run Difference'] = df['Runs Scored'].sub(df['Runs Against'])
df.head()
# Pairwise correlation matrix of the remaining columns.
df.corr()
# Scatter plot of wins against run difference on a new 8x8 figure.
plt.figure(figsize=(8, 8))
plt.scatter(df['Run Difference'], df['Wins'])
# Label both axes in one call on the current Axes.
plt.gca().set(xlabel="Run Difference", ylabel="Wins")
plt.title("Scatter plot of wins vs. run difference")
# Endpoints of the straight prediction line for Wins given Run Difference,
# built from the correlation coefficient r: in standardized units the
# predicted value is z_wins = r * z_run_difference.
run_diff_col = df["Run Difference"]
wins_col = df["Wins"]
# Two x values (the observed extremes) are enough to draw a straight line.
x_vals = np.array([run_diff_col.min(), run_diff_col.max()])
x_vals_standardized = (x_vals - run_diff_col.mean()) / run_diff_col.std(ddof=0)
# Correlate the two columns directly rather than computing the whole
# df.corr() matrix and chain-indexing a single entry out of it.
y_predictions_standardized = run_diff_col.corr(wins_col) * x_vals_standardized
# Convert back to win units. Both std() calls use ddof=0, so the x and y
# scalings are consistent with each other.
y_predictions = y_predictions_standardized * wins_col.std(ddof=0) + wins_col.mean()
# Wins vs. run difference scatter on a new 8x8 figure, with the prediction
# line drawn over the points.
plt.figure(figsize=(8, 8))
plt.scatter(df['Run Difference'], df['Wins'])
plt.gca().set(xlabel="Run Difference", ylabel="Wins")
plt.title("Scatter plot of wins vs. run difference with prediction line")
# x_vals / y_predictions are the line's two endpoints; draw it thick and red.
plt.plot(x_vals, y_predictions, color='r', linewidth=5)