|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "code", |
5 | | - "execution_count": 1, |
| 5 | + "execution_count": 42, |
6 | 6 | "metadata": { |
7 | | - "collapsed": true |
| 7 | + "collapsed": false |
8 | 8 | }, |
9 | 9 | "outputs": [], |
10 | 10 | "source": [ |
|
15 | 15 | "from pyspark.sql import SQLContext\n", |
16 | 16 | "from pyspark.sql import Row\n", |
17 | 17 | "from pyspark.sql.types import *\n", |
18 | | - "import pyspark.sql.functions as func" |
| 18 | + "import pyspark.sql.functions as func\n", |
| 19 | + "from pyspark import SparkContext\n", |
| 20 | + "import json" |
| 21 | + ] |
| 22 | + }, |
| 23 | + { |
| 24 | + "cell_type": "markdown", |
| 25 | + "metadata": { |
| 26 | + "collapsed": false |
| 27 | + }, |
| 28 | + "source": [ |
| 29 | + "If the notebook is run locally, then sc (SparkContext) would be pre-configured. If running using binder, we need to create SparkContext." |
19 | 30 | ] |
20 | 31 | }, |
21 | 32 | { |
|
28 | 39 | { |
29 | 40 | "data": { |
30 | 41 | "text/plain": [ |
31 | | - "<pyspark.context.SparkContext at 0x7f1084022f50>" |
| 42 | + "<pyspark.context.SparkContext at 0x7fa3ec13d090>" |
32 | 43 | ] |
33 | 44 | }, |
34 | 45 | "execution_count": 2, |
|
38 | 49 | ], |
39 | 50 | "source": [ |
40 | 51 | "#This notebook comes with a pre-configured sparkContext called sc\n", |
| 52 | + "try:\n", |
| 53 | + " sc\n", |
| 54 | + "except NameError:\n", |
| 55 | + " sc = SparkContext(master='spark://master:7077')\n", |
| 56 | + " with open(\"data/sequence.txt\") as f:\n", |
| 57 | + " sequence = [x.strip('\\n') for x in f.readlines()]\n", |
| 58 | + " file_rdd = sc.parallelize(sequence)\n", |
| 59 | + " with open(\"data/people.json\") as f:\n", |
| 60 | + " json_data = [x.strip('\\n') for x in f.readlines()]\n", |
| 61 | + " json_rdd = sc.parallelize(json_data)\n", |
| 62 | + "else:\n", |
| 63 | + " file_rdd = sc.textFile(\"data/sequence.txt\")\n", |
| 64 | + " json_rdd = sc.textFile(\"data/people.json\")\n", |
41 | 65 | "sc" |
42 | 66 | ] |
43 | 67 | }, |
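This hunk makes the first code cell work in both environments described in the new markdown cell: it probes for an existing `sc` and, when none is defined (the Binder case), creates one against the `spark://master:7077` master and loads the data with `parallelize`; otherwise it falls back to `textFile`. A minimal standalone sketch of the same probe, assuming the master URL and the `data/` files from the diff:

    from pyspark import SparkContext

    try:
        sc  # already defined when the notebook runs against a local Spark install
    except NameError:
        # Binder case: no pre-configured SparkContext, so build one ourselves
        sc = SparkContext(master='spark://master:7077')
        with open("data/sequence.txt") as f:
            # ship the locally read lines to the executors via parallelize
            file_rdd = sc.parallelize([line.strip('\n') for line in f])
    else:
        # local case: a pre-configured sc exists, so textFile is enough
        file_rdd = sc.textFile("data/sequence.txt")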
|
76 | 100 | "outputs": [], |
77 | 101 | "source": [ |
78 | 102 | "#More RDDs\n", |
79 | | - "file_path = \"data/sequence.txt\"\n", |
80 | | - "file_rdd = sc.textFile(file_path)\n", |
81 | | - "json_rdd = sc.textFile(\"data/people.json\")\n", |
82 | | - "# print json_rdd.collect()\n", |
83 | | - "# print type(json_rdd.collect())\n", |
| 103 | + "print json_rdd.collect()\n", |
| 104 | + "print type(json_rdd.collect())\n", |
84 | 105 | "#Spark supports text files, SequenceFiles, and any other Hadoop InputFormat." |
85 | 106 | ] |
86 | 107 | }, |
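With the commented-out prints restored, the cell above simply dumps the raw JSON strings held in `json_rdd`. A small follow-on sketch using the `json` module imported at the top of the notebook to parse each line into a dict on the workers; the `name` field is only an assumption about what `data/people.json` contains:

    import json

    # turn each raw JSON line into a Python dict on the executors
    parsed_rdd = json_rdd.map(json.loads)

    # peek at one assumed field from the first few records
    print(parsed_rdd.map(lambda record: record.get('name')).take(5))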
|
95 | 116 | "name": "stdout", |
96 | 117 | "output_type": "stream", |
97 | 118 | "text": [ |
98 | | - "Time taken (in seconds) = 0.00355696678162\n" |
| 119 | + "Time taken (in seconds) = 9.91821289062e-05\n" |
99 | 120 | ] |
100 | 121 | } |
101 | 122 | ], |
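The updated timings in this and the following hunks come from re-running cells that bracket a piece of work between two `time()` calls, the same pattern shown later with `start_time = time()`. The near-zero readings belong to cells that only define transformations (which are lazy), while the ~1 s readings belong to cells that trigger an action. A sketch of the pattern, assuming `file_rdd` from above:

    from time import time

    start_time = time()
    file_rdd.count()  # an action, so the RDD is actually evaluated here
    print("Time taken (in seconds) = %s" % (time() - start_time))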
|
117 | 138 | "name": "stdout", |
118 | 139 | "output_type": "stream", |
119 | 140 | "text": [ |
120 | | - "Time taken (in seconds) = 5.95632791519\n" |
| 141 | + "Time taken (in seconds) = 1.08630800247\n" |
121 | 142 | ] |
122 | 143 | } |
123 | 144 | ], |
|
139 | 160 | "name": "stdout", |
140 | 161 | "output_type": "stream", |
141 | 162 | "text": [ |
142 | | - "Time taken (in seconds) = 0.00350093841553\n" |
| 163 | + "Time taken (in seconds) = 6.79492950439e-05\n" |
143 | 164 | ] |
144 | 165 | } |
145 | 166 | ], |
|
161 | 182 | "name": "stdout", |
162 | 183 | "output_type": "stream", |
163 | 184 | "text": [ |
164 | | - "Time taken (in seconds) = 2.28691196442\n" |
| 185 | + "Time taken (in seconds) = 1.19471502304\n" |
165 | 186 | ] |
166 | 187 | } |
167 | 188 | ], |
|
211 | 232 | "output_type": "stream", |
212 | 233 | "text": [ |
213 | 234 | "10\n", |
214 | | - "Time taken (in seconds) = 0.129802942276\n" |
| 235 | + "Time taken (in seconds) = 0.0378739833832\n" |
215 | 236 | ] |
216 | 237 | } |
217 | 238 | ], |
|
234 | 255 | "output_type": "stream", |
235 | 256 | "text": [ |
236 | 257 | "100000\n", |
237 | | - "Time taken (in seconds) = 0.553807973862\n" |
| 258 | + "Time taken (in seconds) = 0.257553100586\n" |
238 | 259 | ] |
239 | 260 | } |
240 | 261 | ], |
|
257 | 278 | "output_type": "stream", |
258 | 279 | "text": [ |
259 | 280 | "1000000\n", |
260 | | - "Time taken (in seconds) = 2.25160479546\n" |
| 281 | + "Time taken (in seconds) = 1.15057992935\n" |
261 | 282 | ] |
262 | 283 | } |
263 | 284 | ], |
|
300 | 321 | "text": [ |
301 | 322 | "We want to count the number of 1, 2, ... digit numbers.\n", |
302 | 323 | "[(1, 9), (2, 90), (3, 900), (4, 9000), (5, 90000), (6, 900000), (7, 1)]\n", |
303 | | - "Time taken (in seconds) = 1.79352688789\n" |
| 324 | + "Time taken (in seconds) = 1.0940117836\n" |
304 | 325 | ] |
305 | 326 | } |
306 | 327 | ], |
307 | 328 | "source": [ |
308 | 329 | "start_time = time()\n", |
309 | 330 | "print \"We want to count the number of 1, 2, ... digit numbers.\"\n", |
310 | | - "file_path = \"data/sequence.txt\"\n", |
311 | | - "file_rdd = sc.textFile(file_path) \n", |
| 331 | + "# file_path = \"data/sequence.txt\"\n", |
| 332 | + "# file_rdd = sc.textFile(file_path) \n", |
312 | 333 | "mapped_rdd = file_rdd.map(lambda a: (len(a), 1))\n", |
313 | 334 | "count_rdd = mapped_rdd.reduceByKey(lambda a, b: a+b).sortByKey()\n", |
314 | 335 | "print count_rdd.collect()\n", |
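The cell above keys every line of `sequence.txt` by its length, sums the ones per key, and sorts, which yields the digit-length histogram shown in the output. A self-contained version of the same map, reduceByKey, sortByKey pipeline on a toy in-memory RDD (assumes a live `sc`):

    # same pipeline as the cell above, on a tiny hand-made dataset
    toy_rdd = sc.parallelize(["7", "42", "365", "1000", "12"])
    counts = (toy_rdd
              .map(lambda line: (len(line), 1))   # key by number of digits
              .reduceByKey(lambda a, b: a + b)    # add up the 1s per key
              .sortByKey())                       # order by digit length
    print(counts.collect())  # [(1, 1), (2, 2), (3, 1), (4, 1)]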
|
328 | 349 | "output_type": "stream", |
329 | 350 | "text": [ |
330 | 351 | "We want to count the number of 1, 2, ... digit numbers.\n", |
331 | | - "Time taken (in seconds) = 7.90541195869\n" |
| 352 | + "Time taken (in seconds) = 4.63194608688\n" |
332 | 353 | ] |
333 | 354 | } |
334 | 355 | ], |
|
415 | 436 | { |
416 | 437 | "data": { |
417 | 438 | "text/plain": [ |
418 | | - "<pyspark.sql.context.SQLContext at 0x7f10669611d0>" |
| 439 | + "<pyspark.sql.context.SQLContext at 0x7fa3c1f88690>" |
419 | 440 | ] |
420 | 441 | }, |
421 | 442 | "execution_count": 18, |
|
507 | 528 | }, |
508 | 529 | { |
509 | 530 | "cell_type": "code", |
510 | | - "execution_count": 21, |
| 531 | + "execution_count": null, |
511 | 532 | "metadata": { |
512 | 533 | "collapsed": false |
513 | 534 | }, |
514 | | - "outputs": [ |
515 | | - { |
516 | | - "ename": "TypeError", |
517 | | - "evalue": "Can not infer schema for type: <type 'unicode'>", |
518 | | - "output_type": "error", |
519 | | - "traceback": [ |
520 | | - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", |
521 | | - "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", |
522 | | - "\u001b[1;32m<ipython-input-21-dc35359ffec1>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mrdd_df\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfile_rdd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoDF\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", |
523 | | - "\u001b[1;32m/home/shagun/devsetup/spark-1.5.1-bin-hadoop2.6/python/pyspark/sql/context.pyc\u001b[0m in \u001b[0;36mtoDF\u001b[1;34m(self, schema, sampleRatio)\u001b[0m\n\u001b[0;32m 60\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mRow\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34mu'Alice'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mage\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 61\u001b[0m \"\"\"\n\u001b[1;32m---> 62\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msqlContext\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcreateDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mschema\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msampleRatio\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 63\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 64\u001b[0m \u001b[0mRDD\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoDF\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtoDF\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", |
524 | | - "\u001b[1;32m/home/shagun/devsetup/spark-1.5.1-bin-hadoop2.6/python/pyspark/sql/context.pyc\u001b[0m in \u001b[0;36mcreateDataFrame\u001b[1;34m(self, data, schema, samplingRatio)\u001b[0m\n\u001b[0;32m 402\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 403\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mRDD\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 404\u001b[1;33m \u001b[0mrdd\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mschema\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_createFromRDD\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mschema\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msamplingRatio\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 405\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 406\u001b[0m \u001b[0mrdd\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mschema\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_createFromLocal\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mschema\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", |
525 | | - "\u001b[1;32m/home/shagun/devsetup/spark-1.5.1-bin-hadoop2.6/python/pyspark/sql/context.pyc\u001b[0m in \u001b[0;36m_createFromRDD\u001b[1;34m(self, rdd, schema, samplingRatio)\u001b[0m\n\u001b[0;32m 283\u001b[0m \"\"\"\n\u001b[0;32m 284\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mschema\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mschema\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mlist\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtuple\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 285\u001b[1;33m \u001b[0mstruct\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_inferSchema\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrdd\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msamplingRatio\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 286\u001b[0m \u001b[0mconverter\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_create_converter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstruct\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 287\u001b[0m \u001b[0mrdd\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrdd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconverter\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", |
526 | | - "\u001b[1;32m/home/shagun/devsetup/spark-1.5.1-bin-hadoop2.6/python/pyspark/sql/context.pyc\u001b[0m in \u001b[0;36m_inferSchema\u001b[1;34m(self, rdd, samplingRatio)\u001b[0m\n\u001b[0;32m 236\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 237\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0msamplingRatio\u001b[0m \u001b[1;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 238\u001b[1;33m \u001b[0mschema\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_infer_schema\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfirst\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 239\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0m_has_nulltype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mschema\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 240\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrdd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m100\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", |
527 | | - "\u001b[1;32m/home/shagun/devsetup/spark-1.5.1-bin-hadoop2.6/python/pyspark/sql/types.pyc\u001b[0m in \u001b[0;36m_infer_schema\u001b[1;34m(row)\u001b[0m\n\u001b[0;32m 829\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 830\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 831\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Can not infer schema for type: %s\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 832\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 833\u001b[0m \u001b[0mfields\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mStructField\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_infer_type\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mv\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mTrue\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mv\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mitems\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", |
528 | | - "\u001b[1;31mTypeError\u001b[0m: Can not infer schema for type: <type 'unicode'>" |
529 | | - ] |
530 | | - } |
531 | | - ], |
532 | | - "source": [ |
533 | | - "rdd_df = file_rdd.toDF()" |
534 | | - ] |
| 535 | + "outputs": [], |
| 536 | + "source": [] |
535 | 537 | }, |
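The cell deleted here failed with `TypeError: Can not infer schema for type: <type 'unicode'>` because `toDF()` cannot infer a schema from an RDD of bare strings. A hedged sketch of the usual fix, wrapping each line in a `Row` so inference has a named field to work with (assumes the `sqlContext` created a few cells earlier, since instantiating it is what attaches `toDF` to RDDs):

    from pyspark.sql import Row

    # bare unicode strings carry no field names, so give each line one
    row_rdd = file_rdd.map(lambda line: Row(value=line))
    rdd_df = row_rdd.toDF()   # inference now sees a single string column 'value'
    rdd_df.printSchema()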
536 | 538 | { |
537 | 539 | "cell_type": "code", |
538 | | - "execution_count": 22, |
| 540 | + "execution_count": 21, |
539 | 541 | "metadata": { |
540 | 542 | "collapsed": false |
541 | 543 | }, |
|
584 | 586 | }, |
585 | 587 | { |
586 | 588 | "cell_type": "code", |
587 | | - "execution_count": 23, |
| 589 | + "execution_count": 22, |
588 | 590 | "metadata": { |
589 | 591 | "collapsed": false |
590 | 592 | }, |
|
597 | 599 | }, |
598 | 600 | { |
599 | 601 | "cell_type": "code", |
600 | | - "execution_count": 24, |
| 602 | + "execution_count": 23, |
601 | 603 | "metadata": { |
602 | 604 | "collapsed": false |
603 | 605 | }, |
|
641 | 643 | }, |
642 | 644 | { |
643 | 645 | "cell_type": "code", |
644 | | - "execution_count": 25, |
| 646 | + "execution_count": 24, |
645 | 647 | "metadata": { |
646 | 648 | "collapsed": false |
647 | 649 | }, |
|
662 | 664 | }, |
663 | 665 | { |
664 | 666 | "cell_type": "code", |
665 | | - "execution_count": 26, |
| 667 | + "execution_count": 25, |
666 | 668 | "metadata": { |
667 | 669 | "collapsed": false |
668 | 670 | }, |
|
706 | 708 | }, |
707 | 709 | { |
708 | 710 | "cell_type": "code", |
709 | | - "execution_count": 27, |
| 711 | + "execution_count": 26, |
710 | 712 | "metadata": { |
711 | 713 | "collapsed": false |
712 | 714 | }, |
|
731 | 733 | }, |
732 | 734 | { |
733 | 735 | "cell_type": "code", |
734 | | - "execution_count": 28, |
| 736 | + "execution_count": 27, |
735 | 737 | "metadata": { |
736 | 738 | "collapsed": false |
737 | 739 | }, |
|
753 | 755 | "|dmatthewsb7@image...| 1|\n", |
754 | 756 | |
755 | 757 | |
| 758 | + |
| 759 | + |
| 760 | + "|ladamsgf@hubpages...| 1|\n", |
| 761 | + |
756 | 762 | |
757 | 763 | |
758 | 764 | "|mjoneslf@wootheme...| 1|\n", |
759 | 765 | "|mfranklinn9@hao12...| 1|\n", |
760 | 766 | "|aandrewspf@redcro...| 1|\n", |
761 | | - |
762 | | - |
763 | | - "|ladamsgf@hubpages...| 1|\n", |
764 | | - |
765 | 767 | "|abrown36@yellowpa...| 1|\n", |
766 | 768 | "+--------------------+-----+\n", |
767 | 769 | "only showing top 20 rows\n", |
|
775 | 777 | }, |
776 | 778 | { |
777 | 779 | "cell_type": "code", |
778 | | - "execution_count": 29, |
| 780 | + "execution_count": 28, |
779 | 781 | "metadata": { |
780 | 782 | "collapsed": false |
781 | 783 | }, |
|
795 | 797 | }, |
796 | 798 | { |
797 | 799 | "cell_type": "code", |
798 | | - "execution_count": 30, |
| 800 | + "execution_count": 29, |
799 | 801 | "metadata": { |
800 | 802 | "collapsed": false |
801 | 803 | }, |
|
839 | 841 | }, |
840 | 842 | { |
841 | 843 | "cell_type": "code", |
842 | | - "execution_count": 31, |
| 844 | + "execution_count": 30, |
843 | 845 | "metadata": { |
844 | 846 | "collapsed": false |
845 | 847 | }, |
|
853 | 855 | "| domain|count|\n", |
854 | 856 | "+--------------------+-----+\n", |
855 | 857 | "| alibaba.com| 8|\n", |
856 | | - "| examiner.com| 7|\n", |
857 | 858 | "| 163.com| 7|\n", |
858 | | - "| woothemes.com| 6|\n", |
859 | | - "| mlb.com| 6|\n", |
| 859 | + "| examiner.com| 7|\n", |
860 | 860 | "| friendfeed.com| 6|\n", |
861 | | - "| fda.gov| 6|\n", |
| 861 | + "| lulu.com| 6|\n", |
862 | 862 | "| free.fr| 6|\n", |
| 863 | + "| fda.gov| 6|\n", |
863 | 864 | "| apple.com| 6|\n", |
864 | | - "| sourceforge.net| 6|\n", |
| 865 | + "| woothemes.com| 6|\n", |
865 | 866 | "| cornell.edu| 6|\n", |
866 | | - "| lulu.com| 6|\n", |
| 867 | + "| sourceforge.net| 6|\n", |
| 868 | + "| mlb.com| 6|\n", |
| 869 | + "| wikia.com| 5|\n", |
| 870 | + "| engadget.com| 5|\n", |
| 871 | + "|pagesperso-orange.fr| 5|\n", |
867 | 872 | "| usa.gov| 5|\n", |
| 873 | + "| wordpress.org| 5|\n", |
868 | 874 | "| cbslocal.com| 5|\n", |
869 | | - "|pagesperso-orange.fr| 5|\n", |
870 | | - "| pbs.org| 5|\n", |
871 | | - "| gravatar.com| 5|\n", |
872 | 875 | "| ucla.edu| 5|\n", |
873 | | - "| nyu.edu| 5|\n", |
874 | | - "| webeden.co.uk| 5|\n", |
| 876 | + "| pbs.org| 5|\n", |
875 | 877 | "+--------------------+-----+\n", |
876 | 878 | "only showing top 20 rows\n", |
877 | 879 | "\n" |
|
884 | 886 | "df.select(domain(df.email).alias('domain'))\\\n", |
885 | 887 | ".groupBy('domain').agg(func.count('domain').alias(\"count\")).orderBy(\"count\", ascending=False).show()" |
886 | 888 | ] |
| 889 | + }, |
| 890 | + { |
| 891 | + "cell_type": "code", |
| 892 | + "execution_count": null, |
| 893 | + "metadata": { |
| 894 | + "collapsed": true |
| 895 | + }, |
| 896 | + "outputs": [], |
| 897 | + "source": [] |
887 | 898 | } |
888 | 899 | ], |
889 | 900 | "metadata": { |
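The last code cell extracts the domain of each address with a `domain` UDF, groups by it, and orders by the per-domain count; the reshuffled rows in the output hunk above are just ties (equal counts), whose relative order is not guaranteed between runs. The UDF itself is defined in an unchanged cell, so the split-on-'@' body below is an assumption for illustration, as is the `df` built from `people.json`:

    from pyspark.sql.functions import udf
    from pyspark.sql.types import StringType
    import pyspark.sql.functions as func

    # hypothetical definition: keep everything after the '@' of an e-mail address
    domain = udf(lambda email: email.split('@')[-1], StringType())

    df.select(domain(df.email).alias('domain')) \
      .groupBy('domain') \
      .agg(func.count('domain').alias("count")) \
      .orderBy("count", ascending=False) \
      .show()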
|