|
64 | 64 | }
|
65 | 65 | ],
|
66 | 66 | "source": [
|
| 67 | + "# Listing 2.1: Load the data from the CSV files\n", |
67 | 68 | "import pandas as pd\n",
|
68 | 69 | "\n",
|
69 | 70 | "train = pd.read_csv('./data/train.csv')\n",
|
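The hunk cuts Listing 2.1 off after the first read, but the shape check in Listing 2.2 uses both train and test, so the cell presumably loads both files. A minimal sketch, assuming test.csv sits next to train.csv:

import pandas as pd

train = pd.read_csv('./data/train.csv')
# the test set is presumably loaded the same way (path assumed)
test = pd.read_csv('./data/test.csv')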
|
86 | 87 | }
|
87 | 88 | ],
|
88 | 89 | "source": [
|
| 90 | + "# Listing 2.2: The shapes of the Titanic datasets\n", |
89 | 91 | "print('train has {} rows and {} columns'.format(*train.shape))\n",
|
90 | 92 | "print('test has {} rows and {} columns'.format(*test.shape))"
|
91 | 93 | ]
|
|
106 | 108 | }
|
107 | 109 | ],
|
108 | 110 | "source": [
|
| 111 | + "# Listing 2.3: The structure of the train dataset\n", |
109 | 112 | "train.info()"
|
110 | 113 | ]
|
111 | 114 | },
|
|
125 | 128 | }
|
126 | 129 | ],
|
127 | 130 | "source": [
|
| 131 | + "# Listing 2.4: The structure of the test dataset\n", |
128 | 132 | "test.info()"
|
129 | 133 | ]
|
130 | 134 | },
|
|
170 | 174 | }
|
171 | 175 | ],
|
172 | 176 | "source": [
|
| 177 | + "# Listing 2.5: Look at the data\n", |
173 | 178 | "train.head()"
|
174 | 179 | ]
|
175 | 180 | },
|
|
196 | 201 | }
|
197 | 202 | ],
|
198 | 203 | "source": [
|
| 204 | + "# Listing 2.6: Cope with missing values\n", |
199 | 205 | "# option 1\n",
|
200 | 206 | "# Only two passengers are missing the Embarked value. This is bearable\n",
|
201 | 207 | "train = train.dropna(subset=[\"Embarked\"]) \n",
|
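The diff truncates Listing 2.6 after option 1. Listing 2.12 later slices eight columns out of the array, which implies Cabin is dropped and Age is filled somewhere in between; a hedged sketch of how the cell presumably continues (the mean-imputation choice is an assumption):

# option 2
# Cabin is missing for most passengers; drop the whole column (assumed here)
train = train.drop("Cabin", axis=1)

# option 3
# fill the missing ages; using the column mean is an assumed strategy
mean = train["Age"].mean()
train["Age"] = train["Age"].fillna(mean)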
|
234 | 240 | }
|
235 | 241 | ],
|
236 | 242 | "source": [
|
| 243 | + "# Listing 2.7: Unique values in columns\n", |
237 | 244 | "print('There are {} different (unique) PassengerIds in the data'\n",
|
238 | 245 | " .format(train[\"PassengerId\"].nunique()))\n",
|
239 | 246 | "print('There are {} different (unique) names in the data'\n",
|
|
258 | 265 | }
|
259 | 266 | ],
|
260 | 267 | "source": [
|
| 268 | + "# Listing 2.8: Remove identifying data\n", |
261 | 269 | "train = train.drop(\"PassengerId\", axis=1)\n",
|
262 | 270 | "train = train.drop(\"Name\", axis=1)\n",
|
263 | 271 | "train = train.drop(\"Ticket\", axis=1)\n",
|
|
293 | 301 | }
|
294 | 302 | ],
|
295 | 303 | "source": [
|
| 304 | + "# Listing 2.9: Transforming textual data into numbers\n", |
296 | 305 | "from sklearn.preprocessing import LabelEncoder\n",
|
297 | 306 | "le = LabelEncoder()\n",
|
298 | 307 | "\n",
|
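Only the import and the encoder construction survive in the hunk. After Listing 2.8 the remaining textual columns are Sex and Embarked, so the cell presumably encodes those two; the loop below is a sketch, not necessarily the cell's exact code:

# turn the two remaining textual columns into integer codes
for col in ["Sex", "Embarked"]:
    train[col] = le.fit_transform(train[col])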
|
319 | 328 | }
|
320 | 329 | ],
|
321 | 330 | "source": [
|
| 331 | + "# Listing 2.10: The maximum values\n", |
322 | 332 | "print('The maximum age is {}'.format(train[\"Age\"].max()))\n",
|
323 | 333 | "print('The maximum fare is {}'.format(train[\"Fare\"].max()))"
|
324 | 334 | ]
|
|
339 | 349 | }
|
340 | 350 | ],
|
341 | 351 | "source": [
|
| 352 | + "# Listing 2.11: Normalization of the data\n", |
342 | 353 | "from sklearn.preprocessing import MinMaxScaler\n",
|
343 | 354 | "\n",
|
344 | 355 | "scaler = MinMaxScaler()\n",
|
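The scaler is constructed but the hunk hides its use. Listing 2.12 then indexes train with NumPy-style train[:, 1:8], so by that point train must already be the array that MinMaxScaler returns; a sketch of the presumed missing line:

# scale every column into [0, 1]; fit_transform returns a NumPy array
train = scaler.fit_transform(train)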
|
365 | 376 | }
|
366 | 377 | ],
|
367 | 378 | "source": [
|
| 379 | + "# Listing 2.12: Separating input from labels and training from testing sets\n", |
368 | 380 | "from sklearn.model_selection import train_test_split\n",
|
369 | 381 | "\n",
|
370 | 382 | "input_data = train[:, 1:8]\n",
|
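The cell is cut after the input slice. The later cells consume train_input and train_labels (plus the test halves saved in Listing 2.13), so the remainder presumably looks like this; the label column index and the split ratio are assumptions:

# column 0 holds Survived once PassengerId is dropped (assumed)
labels = train[:, 0]

train_input, test_input, train_labels, test_labels = train_test_split(
    input_data, labels, test_size=0.2)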
|
383 | 395 | "metadata": {},
|
384 | 396 | "outputs": [],
|
385 | 397 | "source": [
|
| 398 | + "# Listing 2.13: Save the data to the filesystem\n", |
386 | 399 | "import numpy as np\n",
|
387 | 400 | "\n",
|
388 | 401 | "with open('data/train.npy', 'wb') as f:\n",
|
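Only the opening of the first with block is visible. np.save accepts an open file handle, so the cell presumably persists both halves of the split; the save order inside each file is an assumption:

with open('data/train.npy', 'wb') as f:
    np.save(f, train_input)
    np.save(f, train_labels)

with open('data/test.npy', 'wb') as f:
    np.save(f, test_input)
    np.save(f, test_labels)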
|
408 | 421 | "metadata": {},
|
409 | 422 | "outputs": [],
|
410 | 423 | "source": [
|
| 424 | + "# Listing 2.14: A random classifier\n", |
411 | 425 | "import random\n",
|
412 | 426 | "random.seed(a=None, version=2)\n",
|
413 | 427 | " \n",
|
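The hunk shows only the seeding. Listings 2.16 and 2.23 call run(classify, train_input), so this cell must define classify; a uniform coin flip per passenger is the assumed implementation:

# guess survival at random, ignoring the passenger data entirely
def classify(passenger):
    return random.randint(0, 1)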
|
421 | 435 | "metadata": {},
|
422 | 436 | "outputs": [],
|
423 | 437 | "source": [
|
| 438 | + "# Listing 2.15: The classification runner\n", |
424 | 439 | "def run(f_classify, x):\n",
|
425 | 440 | " return list(map(f_classify, x))"
|
426 | 441 | ]
|
|
433 | 448 | },
|
434 | 449 | "outputs": [],
|
435 | 450 | "source": [
|
| 451 | + "# Listing 2.16: Run the classifier\n", |
436 | 452 | "result = run(classify, train_input)"
|
437 | 453 | ]
|
438 | 454 | },
|
|
461 | 477 | }
|
462 | 478 | ],
|
463 | 479 | "source": [
|
| 480 | + "# Listing 2.17: Evaluate the classifier\n", |
464 | 481 | "def evaluate(predictions, actual):\n",
|
465 | 482 | " correct = list(filter(\n",
|
466 | 483 | " lambda item: item[0] == item[1],\n",
|
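The function body breaks off inside the filter. A sketch of the presumed complete function, pairing predictions with actual labels via zip (the report wording is an assumption):

def evaluate(predictions, actual):
    # keep only the pairs where prediction and label agree
    correct = list(filter(
        lambda item: item[0] == item[1],
        list(zip(predictions, actual))
    ))
    return '{} correct predictions out of {}. Accuracy {:.0f} %'.format(
        len(correct), len(actual), 100*len(correct)/len(actual))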
|
488 | 505 | }
|
489 | 506 | ],
|
490 | 507 | "source": [
|
| 508 | + "# Listing 2.18: Always predict a passenger died\n", |
491 | 509 | "def predict_death(item):\n",
|
492 | 510 | " return 0\n",
|
493 | 511 | "\n",
|
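After defining the constant classifier, the cell presumably feeds it through the same run/evaluate pipeline as the random one; the exact call is an assumption:

# evaluate the baseline that declares every passenger dead
print(evaluate(run(predict_death, train_input), train_labels))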
|
519 | 537 | }
|
520 | 538 | ],
|
521 | 539 | "source": [
|
| 540 | + "# Listing 2.19: Confusion matrix of the predict_death classifier\n", |
522 | 541 | "from sklearn.metrics import confusion_matrix\n",
|
523 | 542 | "\n",
|
524 | 543 | "predictions = run(predict_death, train_input)\n",
|
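The cut-off remainder presumably computes the matrix itself; keeping it in a variable (cm is an assumed name) lets the score cells below reuse it:

# rows are the actual classes, columns the predicted ones
cm = confusion_matrix(train_labels, predictions)
cm  # a bare name on the last line displays the matrix in a notebook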
|
541 | 560 | }
|
542 | 561 | ],
|
543 | 562 | "source": [
|
| 563 | + "# Listing 2.20: The precision score\n", |
544 | 564 | "from sklearn.metrics import precision_score\n",
|
545 | 565 | "print('The precision score of the predict_death classifier is {}'\n",
|
546 | 566 | " .format(precision_score(train_labels, predictions)))"
|
|
562 | 582 | }
|
563 | 583 | ],
|
564 | 584 | "source": [
|
| 585 | + "# Listing 2.21: The recall score\n", |
565 | 586 | "from sklearn.metrics import recall_score\n",
|
566 | 587 | "print('The recall score of the predict_death classifier is {}'\n",
|
567 | 588 | " .format(recall_score(train_labels, predictions)))"
|
|
583 | 604 | }
|
584 | 605 | ],
|
585 | 606 | "source": [
|
| 607 | + "# Listing 2.22: The specificity and the NPV\n", |
586 | 608 | "def specificity(matrix):\n",
|
587 | 609 | " return matrix[0][0]/(matrix[0][0]+matrix[0][1]) if (matrix[0][0]+matrix[0][1] > 0) else 0\n",
|
588 | 610 | "\n",
|
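Only specificity survives the truncation. The NPV (negative predictive value) mirrors it on the first column of the matrix, TN/(TN+FN); a sketch of the presumed rest of the cell, reusing the cm name assumed above:

def npv(matrix):
    # true negatives over all negative predictions
    return matrix[0][0]/(matrix[0][0]+matrix[1][0]) if (matrix[0][0]+matrix[1][0] > 0) else 0

print('The specificity score of the predict_death classifier is {:.2f}'.format(specificity(cm)))
print('The npv score of the predict_death classifier is {:.2f}'.format(npv(cm)))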
|
611 | 633 | }
|
612 | 634 | ],
|
613 | 635 | "source": [
|
| 636 | + "# Listing 2.23: The scores of the random classifier\n", |
614 | 637 | "random_predictions = run(classify, train_input)\n",
|
615 | 638 | "random_cm = confusion_matrix(train_labels, random_predictions)\n",
|
616 | 639 | "\n",
|
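The hunk stops before any score is printed. The cell presumably reports the same four measures for the random classifier; the exact wording is an assumption:

print('The precision score of the random classifier is {:.2f}'.format(precision_score(train_labels, random_predictions)))
print('The recall score of the random classifier is {:.2f}'.format(recall_score(train_labels, random_predictions)))
print('The specificity score of the random classifier is {:.2f}'.format(specificity(random_cm)))
print('The npv score of the random classifier is {:.2f}'.format(npv(random_cm)))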
|
637 | 660 | "metadata": {},
|
638 | 661 | "outputs": [],
|
639 | 662 | "source": [
|
| 663 | + "# Listing 2.24: A hypocrite classifier\n", |
640 | 664 | "def hypocrite(passenger, weight):\n",
|
641 | 665 | " return round(min(1,max(0,weight*0.5+random.uniform(0, 1))))"
|
642 | 666 | ]
|
|
657 | 681 | }
|
658 | 682 | ],
|
659 | 683 | "source": [
|
| 684 | + "# Listing 2.25: The scores of the hypocrite classifier\n", |
660 | 685 | "w_predictions = run(lambda passenger: hypocrite(passenger, -0.5), train_input)\n",
|
661 | 686 | "w_cm = confusion_matrix(train_labels, w_predictions)\n",
|
662 | 687 | "\n",
|
|
678 | 703 | },
|
679 | 704 | "outputs": [],
|
680 | 705 | "source": [
|
| 706 | + "# Listing 2.26: Run the hypocrite classifiers\n", |
681 | 707 | "import numpy as np\n",
|
682 | 708 | "\n",
|
683 | 709 | "# number of steps to consider between -1 and 1\n",
|
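Only the step-count comment survives. The later cells index l_predictions[step] and l_cm[step] with integer steps and plot against weights between -1 and 1, so the sweep presumably looks like this (the names are taken from the consuming cells; the construction itself is a sketch):

cnt_steps = 40  # number of hypocrite weights to try between -1 and 1

steps = list(range(cnt_steps))
weights = [-1 + 2*step/(cnt_steps - 1) for step in steps]

# one prediction list and one confusion matrix per weight
l_predictions = [run(lambda passenger: hypocrite(passenger, w), train_input) for w in weights]
l_cm = [confusion_matrix(train_labels, p) for p in l_predictions]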
|
736 | 762 | }
|
737 | 763 | ],
|
738 | 764 | "source": [
|
| 765 | + "# Listing 2.27: Plot the distribution of predictions\n", |
739 | 766 | "import matplotlib.pyplot as plt\n",
|
740 | 767 | "import matplotlib\n",
|
741 | 768 | "\n",
|
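The plotting code is hidden by the hunk. Given the listing title, the figure presumably shows how the predictions shift from all-0 to all-1 as the weight grows; the sketch below plots the predicted survivor count per weight and is an assumption about the actual figure:

l_survivors = [sum(p) for p in l_predictions]

plt.plot(weights, l_survivors)
plt.xlabel('hypocrite weight')
plt.ylabel('passengers predicted to survive')
plt.show()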
|
768 | 795 | "metadata": {},
|
769 | 796 | "outputs": [],
|
770 | 797 | "source": [
|
| 798 | + "# Listing 2.28: Metrics of the hypocrite classifier\n", |
771 | 799 | "l_precision = list(map(lambda step: precision_score(train_labels, l_predictions[step]),steps))\n",
|
772 | 800 | "l_recall = list(map(lambda step: recall_score(train_labels, l_predictions[step]),steps))\n",
|
773 | 801 | "l_specificity = list(map(lambda step: specificity(l_cm[step]),steps))\n",
|
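Listing 2.30 consumes l_npv, so the truncated line presumably mirrors the three shown here:

l_npv = list(map(lambda step: npv(l_cm[step]),steps))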
|
808 | 836 | }
|
809 | 837 | ],
|
810 | 838 | "source": [
|
| 839 | + "# Listing 2.29: Plot the performance measures\n", |
811 | 840 | "m_precision, = plt.plot(weights, l_precision, 'pink', label=\"precision\")\n",
|
812 | 841 | "m_recall, = plt.plot(weights, l_recall, 'cyan', label=\"recall\")\n",
|
813 | 842 | "m_specificity, = plt.plot(weights, l_specificity, 'gold', label=\"specificity\")\n",
|
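The hunk ends before the fourth curve and the legend. A sketch of the presumed remainder (the color choice is an assumption):

m_npv, = plt.plot(weights, l_npv, 'red', label="npv")
plt.legend(handles=[m_precision, m_recall, m_specificity, m_npv])
plt.xlabel('hypocrite weight')
plt.show()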
|
851 | 880 | }
|
852 | 881 | ],
|
853 | 882 | "source": [
|
| 883 | + "# Listing 2.30: Calculating the mean of the measures\n", |
854 | 884 | "l_mean = list(map(lambda step: sum(step)*0.25, zip(l_precision, l_recall, l_specificity, l_npv)))\n",
|
855 | 885 | "m_mean, = plt.plot(weights, l_mean, 'pink', label=\"Mean of the measures\")\n",
|
856 | 886 | "\n",
|
|
866 | 896 | "metadata": {},
|
867 | 897 | "outputs": [],
|
868 | 898 | "source": [
|
| 899 | + "# Listing 2.31: A reusable function to unmask the hypocrite classifier\n", |
869 | 900 | "def classifier_report(name, run, classify, input, labels):\n",
|
870 | 901 | " cr_predictions = run(classify, input)\n",
|
871 | 902 | " cr_cm = confusion_matrix(labels, cr_predictions)\n",
|
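Only the first two lines of the helper are visible. Pulling the four measures and their mean together is the point of the function, so the rest presumably reads like this (the message wording and the "information level" phrasing are assumptions; input shadows the Python builtin, as in the original signature):

def classifier_report(name, run, classify, input, labels):
    cr_predictions = run(classify, input)
    cr_cm = confusion_matrix(labels, cr_predictions)

    cr_precision = precision_score(labels, cr_predictions)
    cr_recall = recall_score(labels, cr_predictions)
    cr_specificity = specificity(cr_cm)
    cr_npv = npv(cr_cm)
    cr_level = 0.25*(cr_precision + cr_recall + cr_specificity + cr_npv)

    print('The precision score of the {} classifier is {:.2f}'.format(name, cr_precision))
    print('The recall score of the {} classifier is {:.2f}'.format(name, cr_recall))
    print('The specificity score of the {} classifier is {:.2f}'.format(name, cr_specificity))
    print('The npv score of the {} classifier is {:.2f}'.format(name, cr_npv))
    print('The information level is: {:.2f}'.format(cr_level))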
|
904 | 935 | }
|
905 | 936 | ],
|
906 | 937 | "source": [
|
| 938 | + "# Listing 2.32: The report of the random classifier\n", |
907 | 939 | "classifier_report(\n",
|
908 | 940 | " \"Random PQC\", \n",
|
909 | 941 | " run,\n",
|
910 | 942 | " classify,\n",
|
911 | 943 | " train_input,\n",
|
912 |
| - " train_labels)\n", |
913 |
| - "#CAPTION The report of the random classifier" |
| 944 | + " train_labels)" |
914 | 945 | ]
|
915 | 946 | }
|
916 | 947 | ]
|
|