|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "code", |
5 | | - "execution_count": 1, |
| 5 | + "execution_count": 42, |
6 | 6 | "metadata": { |
7 | | - "collapsed": true |
| 7 | + "collapsed": false |
8 | 8 | }, |
9 | 9 | "outputs": [], |
10 | 10 | "source": [ |
|
15 | 15 | "from pyspark.sql import SQLContext\n", |
16 | 16 | "from pyspark.sql import Row\n", |
17 | 17 | "from pyspark.sql.types import *\n", |
18 | | - "import pyspark.sql.functions as func" |
| 18 | + "import pyspark.sql.functions as func\n", |
| 19 | + "from pyspark import SparkContext\n", |
| 20 | + "import json" |
| 21 | + ] |
| 22 | + }, |
| 23 | + { |
| 24 | + "cell_type": "markdown", |
| 25 | + "metadata": { |
| 26 | + "collapsed": false |
| 27 | + }, |
| 28 | + "source": [ |
| 29 | + "If the notebook is run locally, then sc (SparkContext) would be pre-configured. If running using binder, we need to create SparkContext." |
19 | 30 | ] |
20 | 31 | }, |
21 | 32 | { |
|
28 | 39 | { |
29 | 40 | "data": { |
30 | 41 | "text/plain": [ |
31 | | - "<pyspark.context.SparkContext at 0x7f1084022f50>" |
| 42 | + "<pyspark.context.SparkContext at 0x7fa3ec13d090>" |
32 | 43 | ] |
33 | 44 | }, |
34 | 45 | "execution_count": 2, |
|
38 | 49 | ], |
39 | 50 | "source": [ |
40 | 51 | "#This notebook comes with a pre-configured sparkContext called sc\n", |
| 52 | + "try:\n", |
| 53 | + " sc\n", |
| 54 | + "except NameError:\n", |
| 55 | + " sc = SparkContext(master='spark://master:7077')\n", |
| 56 | + " with open(\"data/sequence.txt\") as f:\n", |
| 57 | + " sequence = [x.strip('\\n') for x in f.readlines()]\n", |
| 58 | + " file_rdd = sc.parallelize(sequence)\n", |
| 59 | + " with open(\"data/people.json\") as f:\n", |
| 60 | + " json_data = [x.strip('\\n') for x in f.readlines()]\n", |
| 61 | + " json_rdd = sc.parallelize(json_data)\n", |
| 62 | + "else:\n", |
| 63 | + " file_rdd = sc.textFile(\"data/sequence.txt\")\n", |
| 64 | + " json_rdd = sc.textFile(\"data/people.json\")\n", |
41 | 65 | "sc" |
42 | 66 | ] |
43 | 67 | }, |
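This hunk makes the first code cell work in both environments described in the new markdown cell: it probes for an existing `sc` and, when none is defined (the Binder case), creates one against the `spark://master:7077` master and loads the data with `parallelize`; otherwise it falls back to `textFile`. A minimal standalone sketch of the same probe, assuming the master URL and the `data/` files from the diff:

    from pyspark import SparkContext

    try:
        sc  # already defined when the notebook runs against a local Spark install
    except NameError:
        # Binder case: no pre-configured SparkContext, so build one ourselves
        sc = SparkContext(master='spark://master:7077')
        with open("data/sequence.txt") as f:
            # ship the locally read lines to the executors via parallelize
            file_rdd = sc.parallelize([line.strip('\n') for line in f])
    else:
        # local case: a pre-configured sc exists, so textFile is enough
        file_rdd = sc.textFile("data/sequence.txt")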
|
76 | 100 | "outputs": [], |
77 | 101 | "source": [ |
78 | 102 | "#More RDDs\n", |
79 | | - "file_path = \"data/sequence.txt\"\n", |
80 | | - "file_rdd = sc.textFile(file_path)\n", |
81 | | - "json_rdd = sc.textFile(\"data/people.json\")\n", |
82 | | - "# print json_rdd.collect()\n", |
83 | | - "# print type(json_rdd.collect())\n", |
| 103 | + "print json_rdd.collect()\n", |
| 104 | + "print type(json_rdd.collect())\n", |
84 | 105 | "#Spark supports text files, SequenceFiles, and any other Hadoop InputFormat." |
85 | 106 | ] |
86 | 107 | }, |
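With the commented-out prints restored, the cell above simply dumps the raw JSON strings held in `json_rdd`. A small follow-on sketch using the `json` module imported at the top of the notebook to parse each line into a dict on the workers; the `name` field is only an assumption about what `data/people.json` contains:

    import json

    # turn each raw JSON line into a Python dict on the executors
    parsed_rdd = json_rdd.map(json.loads)

    # peek at one assumed field from the first few records
    print(parsed_rdd.map(lambda record: record.get('name')).take(5))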
|
95 | 116 | "name": "stdout", |
96 | 117 | "output_type": "stream", |
97 | 118 | "text": [ |
98 | | - "Time taken (in seconds) = 0.00355696678162\n" |
| 119 | + "Time taken (in seconds) = 9.91821289062e-05\n" |
99 | 120 | ] |
100 | 121 | } |
101 | 122 | ], |
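The updated timings in this and the following hunks come from re-running cells that bracket a piece of work between two `time()` calls, the same pattern shown later with `start_time = time()`. The near-zero readings belong to cells that only define transformations (which are lazy), while the ~1 s readings belong to cells that trigger an action. A sketch of the pattern, assuming `file_rdd` from above:

    from time import time

    start_time = time()
    file_rdd.count()  # an action, so the RDD is actually evaluated here
    print("Time taken (in seconds) = %s" % (time() - start_time))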
|
117 | 138 | "name": "stdout", |
118 | 139 | "output_type": "stream", |
119 | 140 | "text": [ |
120 | | - "Time taken (in seconds) = 5.95632791519\n" |
| 141 | + "Time taken (in seconds) = 1.08630800247\n" |
121 | 142 | ] |
122 | 143 | } |
123 | 144 | ], |
|
139 | 160 | "name": "stdout", |
140 | 161 | "output_type": "stream", |
141 | 162 | "text": [ |
142 | | - "Time taken (in seconds) = 0.00350093841553\n" |
| 163 | + "Time taken (in seconds) = 6.79492950439e-05\n" |
143 | 164 | ] |
144 | 165 | } |
145 | 166 | ], |
|
161 | 182 | "name": "stdout", |
162 | 183 | "output_type": "stream", |
163 | 184 | "text": [ |
164 | | - "Time taken (in seconds) = 2.28691196442\n" |
| 185 | + "Time taken (in seconds) = 1.19471502304\n" |
165 | 186 | ] |
166 | 187 | } |
167 | 188 | ], |
|
211 | 232 | "output_type": "stream", |
212 | 233 | "text": [ |
213 | 234 | "10\n", |
214 | | - "Time taken (in seconds) = 0.129802942276\n" |
| 235 | + "Time taken (in seconds) = 0.0378739833832\n" |
215 | 236 | ] |
216 | 237 | } |
217 | 238 | ], |
|
234 | 255 | "output_type": "stream", |
235 | 256 | "text": [ |
236 | 257 | "100000\n", |
237 | | - "Time taken (in seconds) = 0.553807973862\n" |
| 258 | + "Time taken (in seconds) = 0.257553100586\n" |
238 | 259 | ] |
239 | 260 | } |
240 | 261 | ], |
|
257 | 278 | "output_type": "stream", |
258 | 279 | "text": [ |
259 | 280 | "1000000\n", |
260 | | - "Time taken (in seconds) = 2.25160479546\n" |
| 281 | + "Time taken (in seconds) = 1.15057992935\n" |
261 | 282 | ] |
262 | 283 | } |
263 | 284 | ], |
|
300 | 321 | "text": [ |
301 | 322 | "We want to count the number of 1, 2, ... digit numbers.\n", |
302 | 323 | "[(1, 9), (2, 90), (3, 900), (4, 9000), (5, 90000), (6, 900000), (7, 1)]\n", |
303 | | - "Time taken (in seconds) = 1.79352688789\n" |
| 324 | + "Time taken (in seconds) = 1.0940117836\n" |
304 | 325 | ] |
305 | 326 | } |
306 | 327 | ], |
307 | 328 | "source": [ |
308 | 329 | "start_time = time()\n", |
309 | 330 | "print \"We want to count the number of 1, 2, ... digit numbers.\"\n", |
310 | | - "file_path = \"data/sequence.txt\"\n", |
311 | | - "file_rdd = sc.textFile(file_path) \n", |
| 331 | + "# file_path = \"data/sequence.txt\"\n", |
| 332 | + "# file_rdd = sc.textFile(file_path) \n", |
312 | 333 | "mapped_rdd = file_rdd.map(lambda a: (len(a), 1))\n", |
313 | 334 | "count_rdd = mapped_rdd.reduceByKey(lambda a, b: a+b).sortByKey()\n", |
314 | 335 | "print count_rdd.collect()\n", |
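The cell above keys every line of `sequence.txt` by its length, sums the ones per key, and sorts, which yields the digit-length histogram shown in the output. A self-contained version of the same map, reduceByKey, sortByKey pipeline on a toy in-memory RDD (assumes a live `sc`):

    # same pipeline as the cell above, on a tiny hand-made dataset
    toy_rdd = sc.parallelize(["7", "42", "365", "1000", "12"])
    counts = (toy_rdd
              .map(lambda line: (len(line), 1))   # key by number of digits
              .reduceByKey(lambda a, b: a + b)    # add up the 1s per key
              .sortByKey())                       # order by digit length
    print(counts.collect())  # [(1, 1), (2, 2), (3, 1), (4, 1)]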
|
328 | 349 | "output_type": "stream", |
329 | 350 | "text": [ |
330 | 351 | "We want to count the number of 1, 2, ... digit numbers.\n", |
331 | | - "Time taken (in seconds) = 7.90541195869\n" |
| 352 | + "Time taken (in seconds) = 4.63194608688\n" |
332 | 353 | ] |
333 | 354 | } |
334 | 355 | ], |
|
415 | 436 | { |
416 | 437 | "data": { |
417 | 438 | "text/plain": [ |
418 | | - "<pyspark.sql.context.SQLContext at 0x7f10669611d0>" |
| 439 | + "<pyspark.sql.context.SQLContext at 0x7fa3c1f88690>" |
419 | 440 | ] |
420 | 441 | }, |
421 | 442 | "execution_count": 18, |
|
507 | 528 | }, |
508 | 529 | { |
509 | 530 | "cell_type": "code", |
510 | | - "execution_count": 21, |
| 531 | + "execution_count": null, |
511 | 532 | "metadata": { |
512 | 533 | "collapsed": false |
513 | 534 | }, |
514 | | - "outputs": [ |
515 | | - { |
516 | | - "ename": "TypeError", |
517 | | - "evalue": "Can not infer schema for type: <type 'unicode'>", |
518 | | - "output_type": "error", |
519 | | - "traceback": [ |
520 | | - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", |
521 | | - "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", |
522 | | - "\u001b[1;32m<ipython-input-21-dc35359ffec1>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mrdd_df\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfile_rdd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoDF\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", |
523 | | - "\u001b[1;32m/home/shagun/devsetup/spark-1.5.1-bin-hadoop2.6/python/pyspark/sql/context.pyc\u001b[0m in \u001b[0;36mtoDF\u001b[1;34m(self, schema, sampleRatio)\u001b[0m\n\u001b[0;32m 60\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mRow\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34mu'Alice'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mage\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 61\u001b[0m \"\"\"\n\u001b[1;32m---> 62\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msqlContext\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcreateDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mschema\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msampleRatio\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 63\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 64\u001b[0m \u001b[0mRDD\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoDF\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtoDF\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", |
524 | | - "\u001b[1;32m/home/shagun/devsetup/spark-1.5.1-bin-hadoop2.6/python/pyspark/sql/context.pyc\u001b[0m in \u001b[0;36mcreateDataFrame\u001b[1;34m(self, data, schema, samplingRatio)\u001b[0m\n\u001b[0;32m 402\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 403\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mRDD\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 404\u001b[1;33m \u001b[0mrdd\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mschema\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_createFromRDD\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mschema\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msamplingRatio\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 405\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 406\u001b[0m \u001b[0mrdd\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mschema\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_createFromLocal\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mschema\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", |
525 | | - "\u001b[1;32m/home/shagun/devsetup/spark-1.5.1-bin-hadoop2.6/python/pyspark/sql/context.pyc\u001b[0m in \u001b[0;36m_createFromRDD\u001b[1;34m(self, rdd, schema, samplingRatio)\u001b[0m\n\u001b[0;32m 283\u001b[0m \"\"\"\n\u001b[0;32m 284\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mschema\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mschema\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mlist\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtuple\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 285\u001b[1;33m \u001b[0mstruct\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_inferSchema\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrdd\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msamplingRatio\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 286\u001b[0m \u001b[0mconverter\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_create_converter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstruct\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 287\u001b[0m \u001b[0mrdd\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrdd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconverter\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", |
526 | | - "\u001b[1;32m/home/shagun/devsetup/spark-1.5.1-bin-hadoop2.6/python/pyspark/sql/context.pyc\u001b[0m in \u001b[0;36m_inferSchema\u001b[1;34m(self, rdd, samplingRatio)\u001b[0m\n\u001b[0;32m 236\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 237\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0msamplingRatio\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 238\u001b[1;33m \u001b[0mschema\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_infer_schema\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfirst\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 239\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0m_has_nulltype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mschema\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 240\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrdd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m100\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", |
527 | | - "\u001b[1;32m/home/shagun/devsetup/spark-1.5.1-bin-hadoop2.6/python/pyspark/sql/types.pyc\u001b[0m in \u001b[0;36m_infer_schema\u001b[1;34m(row)\u001b[0m\n\u001b[0;32m 829\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 830\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 831\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Can not infer schema for type: %s\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 832\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 833\u001b[0m \u001b[0mfields\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mStructField\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_infer_type\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mv\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mTrue\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mv\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mitems\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", |
528 | | - "\u001b[1;31mTypeError\u001b[0m: Can not infer schema for type: <type 'unicode'>" |
529 | | - ] |
530 | | - } |
531 | | - ], |
532 | | - "source": [ |
533 | | - "rdd_df = file_rdd.toDF()" |
534 | | - ] |
| 535 | + "outputs": [], |
| 536 | + "source": [] |
535 | 537 | }, |
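The cell deleted here failed with `TypeError: Can not infer schema for type: <type 'unicode'>` because `toDF()` cannot infer a schema from an RDD of bare strings. A hedged sketch of the usual fix, wrapping each line in a `Row` so inference has a named field to work with (assumes the `sqlContext` created a few cells earlier, since instantiating it is what attaches `toDF` to RDDs):

    from pyspark.sql import Row

    # bare unicode strings carry no field names, so give each line one
    row_rdd = file_rdd.map(lambda line: Row(value=line))
    rdd_df = row_rdd.toDF()   # inference now sees a single string column 'value'
    rdd_df.printSchema()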
536 | 538 | { |
537 | 539 | "cell_type": "code", |
538 | | - "execution_count": 22, |
| 540 | + "execution_count": 21, |
539 | 541 | "metadata": { |
540 | 542 | "collapsed": false |
541 | 543 | }, |
|
584 | 586 | }, |
585 | 587 | { |
586 | 588 | "cell_type": "code", |
587 | | - "execution_count": 23, |
| 589 | + "execution_count": 22, |
588 | 590 | "metadata": { |
589 | 591 | "collapsed": false |
590 | 592 | }, |
|
597 | 599 | }, |
598 | 600 | { |
599 | 601 | "cell_type": "code", |
600 | | - "execution_count": 24, |
| 602 | + "execution_count": 23, |
601 | 603 | "metadata": { |
602 | 604 | "collapsed": false |
603 | 605 | }, |
|
641 | 643 | }, |
642 | 644 | { |
643 | 645 | "cell_type": "code", |
644 | | - "execution_count": 25, |
| 646 | + "execution_count": 24, |
645 | 647 | "metadata": { |
646 | 648 | "collapsed": false |
647 | 649 | }, |
|
662 | 664 | }, |
663 | 665 | { |
664 | 666 | "cell_type": "code", |
665 | | - "execution_count": 26, |
| 667 | + "execution_count": 25, |
666 | 668 | "metadata": { |
667 | 669 | "collapsed": false |
668 | 670 | }, |
|
706 | 708 | }, |
707 | 709 | { |
708 | 710 | "cell_type": "code", |
709 | | - "execution_count": 27, |
| 711 | + "execution_count": 26, |
710 | 712 | "metadata": { |
711 | 713 | "collapsed": false |
712 | 714 | }, |
|
731 | 733 | }, |
732 | 734 | { |
733 | 735 | "cell_type": "code", |
734 | | - "execution_count": 28, |
| 736 | + "execution_count": 27, |
735 | 737 | "metadata": { |
736 | 738 | "collapsed": false |
737 | 739 | }, |
|
753 | 755 | "|dmatthewsb7@image...| 1|\n", |
754 | 756 | |
755 | 757 | |
| 758 | + |
| 759 | + |
| 760 | + "|ladamsgf@hubpages...| 1|\n", |
| 761 | + |
756 | 762 | |
757 | 763 | |
758 | 764 | "|mjoneslf@wootheme...| 1|\n", |
759 | 765 | "|mfranklinn9@hao12...| 1|\n", |
760 | 766 | "|aandrewspf@redcro...| 1|\n", |
761 | | - |
762 | | - |
763 | | - "|ladamsgf@hubpages...| 1|\n", |
764 | | - |
765 | 767 | "|abrown36@yellowpa...| 1|\n", |
766 | 768 | "+--------------------+-----+\n", |
767 | 769 | "only showing top 20 rows\n", |
|
775 | 777 | }, |
776 | 778 | { |
777 | 779 | "cell_type": "code", |
778 | | - "execution_count": 29, |
| 780 | + "execution_count": 28, |
779 | 781 | "metadata": { |
780 | 782 | "collapsed": false |
781 | 783 | }, |
|
795 | 797 | }, |
796 | 798 | { |
797 | 799 | "cell_type": "code", |
798 | | - "execution_count": 30, |
| 800 | + "execution_count": 29, |
799 | 801 | "metadata": { |
800 | 802 | "collapsed": false |
801 | 803 | }, |
|
839 | 841 | }, |
840 | 842 | { |
841 | 843 | "cell_type": "code", |
842 | | - "execution_count": 31, |
| 844 | + "execution_count": 30, |
843 | 845 | "metadata": { |
844 | 846 | "collapsed": false |
845 | 847 | }, |
|
853 | 855 | "| domain|count|\n", |
854 | 856 | "+--------------------+-----+\n", |
855 | 857 | "| alibaba.com| 8|\n", |
856 | | - "| examiner.com| 7|\n", |
857 | 858 | "| 163.com| 7|\n", |
858 | | - "| woothemes.com| 6|\n", |
859 | | - "| mlb.com| 6|\n", |
| 859 | + "| examiner.com| 7|\n", |
860 | 860 | "| friendfeed.com| 6|\n", |
861 | | - "| fda.gov| 6|\n", |
| 861 | + "| lulu.com| 6|\n", |
862 | 862 | "| free.fr| 6|\n", |
| 863 | + "| fda.gov| 6|\n", |
863 | 864 | "| apple.com| 6|\n", |
864 | | - "| sourceforge.net| 6|\n", |
| 865 | + "| woothemes.com| 6|\n", |
865 | 866 | "| cornell.edu| 6|\n", |
866 | | - "| lulu.com| 6|\n", |
| 867 | + "| sourceforge.net| 6|\n", |
| 868 | + "| mlb.com| 6|\n", |
| 869 | + "| wikia.com| 5|\n", |
| 870 | + "| engadget.com| 5|\n", |
| 871 | + "|pagesperso-orange.fr| 5|\n", |
867 | 872 | "| usa.gov| 5|\n", |
| 873 | + "| wordpress.org| 5|\n", |
868 | 874 | "| cbslocal.com| 5|\n", |
869 | | - "|pagesperso-orange.fr| 5|\n", |
870 | | - "| pbs.org| 5|\n", |
871 | | - "| gravatar.com| 5|\n", |
872 | 875 | "| ucla.edu| 5|\n", |
873 | | - "| nyu.edu| 5|\n", |
874 | | - "| webeden.co.uk| 5|\n", |
| 876 | + "| pbs.org| 5|\n", |
875 | 877 | "+--------------------+-----+\n", |
876 | 878 | "only showing top 20 rows\n", |
877 | 879 | "\n" |
|
884 | 886 | "df.select(domain(df.email).alias('domain'))\\\n", |
885 | 887 | ".groupBy('domain').agg(func.count('domain').alias(\"count\")).orderBy(\"count\", ascending=False).show()" |
886 | 888 | ] |
| 889 | + }, |
| 890 | + { |
| 891 | + "cell_type": "code", |
| 892 | + "execution_count": null, |
| 893 | + "metadata": { |
| 894 | + "collapsed": true |
| 895 | + }, |
| 896 | + "outputs": [], |
| 897 | + "source": [] |
887 | 898 | } |
888 | 899 | ], |
889 | 900 | "metadata": { |
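The last code cell extracts the domain of each address with a `domain` UDF, groups by it, and orders by the per-domain count; the reshuffled rows in the output hunk above are just ties (equal counts), whose relative order is not guaranteed between runs. The UDF itself is defined in an unchanged cell, so the split-on-'@' body below is an assumption for illustration, as is the `df` built from `people.json`:

    from pyspark.sql.functions import udf
    from pyspark.sql.types import StringType
    import pyspark.sql.functions as func

    # hypothetical definition: keep everything after the '@' of an e-mail address
    domain = udf(lambda email: email.split('@')[-1], StringType())

    df.select(domain(df.email).alias('domain')) \
      .groupBy('domain') \
      .agg(func.count('domain').alias("count")) \
      .orderBy("count", ascending=False) \
      .show()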
|