import random
import math

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation, rc
from IPython.display import HTML

# Synthesize a small, noisy linear dataset for the SGD demo.

def f(x, a=0, b=0):
    """Linear model a*x + b.

    Works for scalar x as well as numpy arrays (elementwise). Defaults of
    a=0, b=0 give the constant-zero model.
    """
    return np.multiply(a, x) + b

# Ground-truth parameters that gradient descent should recover.
real_a = 0.7
real_b = 0.3

# 25 feature values drawn uniformly from (0.1, 4.9); labels are the true
# line plus a small amount of uniform noise.
x = np.random.uniform(0.1, 4.9, size=25)
y = f(x, a=real_a, b=real_b) + np.random.uniform(-0.025, 0.025, size=25)
# x: feature vector
# y: true label values
# a, b: current parameter estimates
#
# This is plain linear-regression (mean-squared-error) cost, NOT the SVM
# cost function — but gradient descent works the same way conceptually.
# The code assumes a & x are scalars (a single feature); handling multiple
# features would require small rewrites below.
def cost_function(x, y, a, b):
    """Mean squared error of the prediction a*x + b against labels y."""
    residual = y - f(x, a, b)
    return np.mean(np.square(residual))


def gradient_of_cost(x, y, a, b, lam=0):
    """Gradient of the (optionally L2-regularized) MSE w.r.t. a and b.

    lam is the regularization weight lambda applied to `a` only.
    Returns [gradient_a, gradient_b].

    NOTE(review): d/da of lam * a^2 is 2*lam*a; this follows the notebook's
    stated formula, which uses lam*a (the factor of 2 can be viewed as
    absorbed into lam). Unused in practice since callers pass lam=0.
    """
    residual = y - f(x, a, b)
    gradient_a = np.mean(-2 * residual * x) + np.multiply(lam, a)
    gradient_b = np.mean(-2 * residual)

    return [gradient_a, gradient_b]
#@title Animation code. Totally unrelated to SGD and SVMs.
#
# Builds a 3-panel animation of an SGD run:
#   panel 0: the (a, b) parameter-space trajectory,
#   panel 1: the (log) training cost per iteration,
#   panel 2: the current fitted line ax+b over the data points.

def animate(all_a_est, all_b_est, all_costs):
    # Uses globals x, y, f, cost_function, etc. Bad.
    #
    # all_a_est / all_b_est: per-iteration parameter estimates.
    # all_costs: per-iteration cost values (plotted as-is on panel 1).
    # Returns a matplotlib FuncAnimation (renders inline via jshtml).

    # Pad axis limits slightly so the trajectory endpoints are visible.
    a_range = (np.min(all_a_est)-0.1, np.max(all_a_est)+0.1)
    b_range = (np.min(all_b_est)-0.1, np.max(all_b_est)+0.1)

    costs_range = (np.min(all_costs), np.max(all_costs)*1.05)

    # First set up the figure, the axes, and the plot element.
    # plt.close() suppresses the static figure; only the animation is shown.
    fig, all_ax = plt.subplots(1,3,figsize=(18,5))
    plt.close()

    # Parameter Space graph
    ax = all_ax[0]

    ax.set_xlim(a_range)
    ax.set_ylim(b_range)
    ax.set_xlabel("a")
    ax.set_ylabel("b")
    ax.set_title("Parameter space (a & b)")

    # Static artists drawn once by init(): the true (a, b) as a green dot.
    init_lines = []
    init_lines.append(ax.plot(real_a, real_b, 'go'))

    # line2: the animated (a, b) trajectory.
    line2, = ax.plot([], [], lw=2)

    # Cost Graph
    ax = all_ax[1]

    ax.set_xlim((0, len(all_costs)))
    ax.set_ylim(costs_range)

    ax.set_xlabel("Iteration")
    # NOTE(review): label says log-cost; the caller is expected to pass
    # log-scaled values — this function plots all_costs unchanged.
    ax.set_ylabel("Log(Cost(Training))")
    ax.set_title("Cost Function")

    cost_line, = ax.plot([], [])

    # ax+b and data
    ax = all_ax[2]

    # Static scatter of the training data (blue dots).
    init_lines.append(ax.plot(x, y, 'bo'))
    # line3: the animated current-estimate line.
    line3, = ax.plot([], [], lw=2)

    # Hardcode data limits cause I am lazy.
    ax.set_xlim((0, 5))
    ax.set_ylim((0,4))

    ax.set_xlabel("x")
    ax.set_ylabel("y (i.e. label)")
    ax.set_title("Estimate = ax+b")


    # initialization function: plot the background of each frame
    def init():
        return init_lines

    # animation function: this is called sequentially with the frame index.
    # Note the [0:i] slices mean frame i shows history up to (not including)
    # iteration i, so the very last estimate is never traced in panels 0-1.
    def update_frame(i):
        line2.set_data(all_a_est[0:i], all_b_est[0:i])
        cost_line.set_data(np.arange(0,i), all_costs[0:i])
        # Draw the line a*x+b across x in [0, 5] using its two endpoints.
        line3.set_data([0,5], [all_b_est[i], all_a_est[i]*5+all_b_est[i]])
        # line3 is omitted from the returned artists; harmless since
        # blit=False redraws the whole figure each frame anyway.
        return (line2,cost_line,)

    anim = animation.FuncAnimation(fig, update_frame, init_func=init, frames=len(all_a_est), interval=100, blit=False)

    # Render the animation as interactive JavaScript/HTML in the notebook.
    rc('animation', html='jshtml')
    return anim

#@title Animation Test Code. Ignore.
animate([0,1,0.5,.6,.3], [1,.3,.4,.2,.9],[0,1,2,1,3]) # Test case for animation code.
def step(x_batch, y_batch, a_est, b_est, lam=0, eta=1):
    """Performs one gradient-descent update on (a, b).

    Args:
      x_batch, y_batch: the (mini-)batch of features / labels to use.
      a_est, b_est: current parameter estimates.
      lam: L2 regularization weight (lambda), forwarded to the gradient.
      eta: learning rate.

    Returns:
      (a_new, b_new): updated parameter estimates.
    """
    g = gradient_of_cost(x_batch, y_batch, a=a_est, b=b_est, lam=lam)

    # Move each parameter one step against its gradient.
    a_new = a_est - eta * g[0]
    b_new = b_est - eta * g[1]

    # Debugging aid — uncomment to trace the optimization:
    # c = cost_function(x_batch, y_batch, a=a_est, b=b_est)
    # print ("a,b = %f, %f. C = %f. Ga,b = %f, %f. New a,b = %f, %f."
    #        % (a_est, b_est, c, g[0], g[1], a_new, b_new))

    return (a_new, b_new)


def stochastic_gradient_descent(x, y, initial_a, initial_b, eta, num_iterations=50):
    """Runs SGD (batch size 1) to fit y ~= a*x + b.

    Args:
      x, y: full training features / labels.
      initial_a, initial_b: starting parameter estimates.
      eta: learning rate.
      num_iterations: number of single-example update steps (default 50,
        matching the original hard-coded loop).

    Returns:
      (a_est, b_est, all_a_est, all_b_est, all_costs) where the `all_*`
      lists record the per-iteration history (costs are log-scaled).
    """
    a_est = initial_a
    b_est = initial_b

    # Lambda (L2 regularization weight); 0 disables regularization.
    lam = 0

    all_a_est = [a_est]
    all_b_est = [b_est]
    # Bug fix: the initial entry previously stored the RAW cost while every
    # later entry stored log(cost), putting the first plotted point on the
    # wrong scale. Log-scale the initial cost too.
    all_costs = [math.log(cost_function(x, y, a=a_est, b=b_est))]

    # `range` instead of the Python-2-only `xrange` so the notebook also
    # runs on Python 3 (materializing 50 ints is cheap either way).
    for i in range(num_iterations):
        n = random.randint(0, len(x)-1)  # Batch size of 1.
        a_est, b_est = step(x[n], y[n], a_est, b_est, lam, eta)

        # Normally we wouldn't collect cost and parameters every iteration,
        # but this is a very simple function to learn.
        all_a_est.append(a_est)
        all_b_est.append(b_est)
        # This is the batch training cost... this should ideally be computed
        # on a held-out validation set instead.
        all_costs.append(math.log(cost_function(x, y, a=a_est, b=b_est)))

    print("Final a,b = %f, %f." % (a_est, b_est))

    return a_est, b_est, all_a_est, all_b_est, all_costs
# Learning rate too large: single-example gradients overshoot the minimum,
# so the parameter trajectory oscillates / diverges instead of settling.
_, _, all_a_est, all_b_est, all_costs = stochastic_gradient_descent(x, y, 0, 0, eta=0.11) # eta too big.
animate(all_a_est, all_b_est, all_costs)

# Learning rate too small: updates are tiny, so after the fixed number of
# iterations the estimates are still far from (real_a, real_b).
_, _, all_a_est, all_b_est, all_costs = stochastic_gradient_descent(x, y, 0, 0, eta=0.001) # eta too low.
animate(all_a_est, all_b_est, all_costs)