Skip to content

Commit 11d0e51

Browse files
authored
Add file
1 parent 20333e2 commit 11d0e51

File tree

1 file changed

+263
-0
lines changed

1 file changed

+263
-0
lines changed

Logistic_Regression_Titanic.ipynb

Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"name": "stdout",
10+
"output_type": "stream",
11+
"text": [
12+
"+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+\n",
13+
"|PassengerId|Survived|Pclass| Name| Sex| Age|SibSp|Parch| Ticket| Fare|Cabin|Embarked|\n",
14+
"+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+\n",
15+
"| 1| 0| 3|Braund, Mr. Owen ...| male|22.0| 1| 0| A/5 21171| 7.25| null| S|\n",
16+
"| 2| 1| 1|Cumings, Mrs. Joh...|female|38.0| 1| 0| PC 17599|71.2833| C85| C|\n",
17+
"| 3| 1| 3|Heikkinen, Miss. ...|female|26.0| 0| 0|STON/O2. 3101282| 7.925| null| S|\n",
18+
"+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+\n",
19+
"only showing top 3 rows\n",
20+
"\n"
21+
]
22+
}
23+
],
24+
"source": [
25+
"from pyspark.sql import SparkSession\n",
26+
"from pyspark.ml.classification import LogisticRegression\n",
27+
"\n",
28+
"spark = SparkSession.builder.appName('titanic_logreg').getOrCreate()\n",
29+
"df = spark.read.csv('titanic.csv', inferSchema = True, header = True)\n",
30+
"df.show(3)"
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": 7,
36+
"metadata": {},
37+
"outputs": [
38+
{
39+
"name": "stdout",
40+
"output_type": "stream",
41+
"text": [
42+
"root\n",
43+
" |-- PassengerId: integer (nullable = true)\n",
44+
" |-- Survived: integer (nullable = true)\n",
45+
" |-- Pclass: integer (nullable = true)\n",
46+
" |-- Name: string (nullable = true)\n",
47+
" |-- Sex: string (nullable = true)\n",
48+
" |-- Age: double (nullable = true)\n",
49+
" |-- SibSp: integer (nullable = true)\n",
50+
" |-- Parch: integer (nullable = true)\n",
51+
" |-- Ticket: string (nullable = true)\n",
52+
" |-- Fare: double (nullable = true)\n",
53+
" |-- Cabin: string (nullable = true)\n",
54+
" |-- Embarked: string (nullable = true)\n",
55+
"\n"
56+
]
57+
}
58+
],
59+
"source": [
60+
"df.printSchema()"
61+
]
62+
},
63+
{
64+
"cell_type": "code",
65+
"execution_count": 8,
66+
"metadata": {},
67+
"outputs": [
68+
{
69+
"data": {
70+
"text/plain": [
71+
"['PassengerId',\n",
72+
" 'Survived',\n",
73+
" 'Pclass',\n",
74+
" 'Name',\n",
75+
" 'Sex',\n",
76+
" 'Age',\n",
77+
" 'SibSp',\n",
78+
" 'Parch',\n",
79+
" 'Ticket',\n",
80+
" 'Fare',\n",
81+
" 'Cabin',\n",
82+
" 'Embarked']"
83+
]
84+
},
85+
"execution_count": 8,
86+
"metadata": {},
87+
"output_type": "execute_result"
88+
}
89+
],
90+
"source": [
91+
"df.columns"
92+
]
93+
},
94+
{
95+
"cell_type": "code",
96+
"execution_count": 9,
97+
"metadata": {},
98+
"outputs": [],
99+
"source": [
100+
"my_col = df.select(['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked'])"
101+
]
102+
},
103+
{
104+
"cell_type": "code",
105+
"execution_count": 10,
106+
"metadata": {},
107+
"outputs": [],
108+
"source": [
109+
"final_data = my_col.na.drop()"
110+
]
111+
},
112+
{
113+
"cell_type": "code",
114+
"execution_count": 11,
115+
"metadata": {},
116+
"outputs": [],
117+
"source": [
118+
"from pyspark.ml.feature import (VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder)\n",
119+
"\n",
120+
"gender_indexer = StringIndexer(inputCol = 'Sex', outputCol = 'SexIndex')\n",
121+
"gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol = 'SexVec')"
122+
]
123+
},
124+
{
125+
"cell_type": "code",
126+
"execution_count": 12,
127+
"metadata": {},
128+
"outputs": [],
129+
"source": [
130+
"embark_indexer = StringIndexer(inputCol = 'Embarked', outputCol = 'EmbarkIndex')\n",
131+
"embark_encoder = OneHotEncoder(inputCol = 'EmbarkIndex', outputCol = 'EmbarkVec')"
132+
]
133+
},
134+
{
135+
"cell_type": "code",
136+
"execution_count": 13,
137+
"metadata": {},
138+
"outputs": [],
139+
"source": [
140+
"assembler = VectorAssembler(inputCols = ['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkVec'], outputCol = 'features')"
141+
]
142+
},
143+
{
144+
"cell_type": "code",
145+
"execution_count": 14,
146+
"metadata": {},
147+
"outputs": [],
148+
"source": [
149+
"from pyspark.ml import Pipeline\n",
150+
"\n",
151+
"log_reg = LogisticRegression(featuresCol = 'features', labelCol = 'Survived')"
152+
]
153+
},
154+
{
155+
"cell_type": "code",
156+
"execution_count": 15,
157+
"metadata": {},
158+
"outputs": [],
159+
"source": [
160+
"pipeline = Pipeline(stages = [gender_indexer, embark_indexer, \n",
161+
" gender_encoder, embark_encoder,\n",
162+
" assembler, log_reg])"
163+
]
164+
},
165+
{
166+
"cell_type": "code",
167+
"execution_count": 16,
168+
"metadata": {},
169+
"outputs": [],
170+
"source": [
171+
"train, test = final_data.randomSplit([0.7, 0.3])"
172+
]
173+
},
174+
{
175+
"cell_type": "code",
176+
"execution_count": 17,
177+
"metadata": {},
178+
"outputs": [],
179+
"source": [
180+
"fit_model = pipeline.fit(train)"
181+
]
182+
},
183+
{
184+
"cell_type": "code",
185+
"execution_count": 18,
186+
"metadata": {},
187+
"outputs": [],
188+
"source": [
189+
"results = fit_model.transform(test)"
190+
]
191+
},
192+
{
193+
"cell_type": "code",
194+
"execution_count": 20,
195+
"metadata": {},
196+
"outputs": [
197+
{
198+
"name": "stdout",
199+
"output_type": "stream",
200+
"text": [
201+
"+----------+--------+\n",
202+
"|prediction|Survived|\n",
203+
"+----------+--------+\n",
204+
"| 1.0| 0|\n",
205+
"| 1.0| 0|\n",
206+
"| 0.0| 0|\n",
207+
"+----------+--------+\n",
208+
"only showing top 3 rows\n",
209+
"\n"
210+
]
211+
}
212+
],
213+
"source": [
214+
"results.select('prediction', 'Survived').show(3)"
215+
]
216+
},
217+
{
218+
"cell_type": "code",
219+
"execution_count": 21,
220+
"metadata": {},
221+
"outputs": [
222+
{
223+
"data": {
224+
"text/plain": [
225+
"0.7851091867469879"
226+
]
227+
},
228+
"execution_count": 21,
229+
"metadata": {},
230+
"output_type": "execute_result"
231+
}
232+
],
233+
"source": [
234+
"from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
235+
"\n",
236+
"eval = BinaryClassificationEvaluator(rawPredictionCol = 'prediction', labelCol = 'Survived')\n",
237+
"AUC = eval.evaluate(results)\n",
238+
"AUC"
239+
]
240+
}
241+
],
242+
"metadata": {
243+
"kernelspec": {
244+
"display_name": "conda_python3",
245+
"language": "python",
246+
"name": "conda_python3"
247+
},
248+
"language_info": {
249+
"codemirror_mode": {
250+
"name": "ipython",
251+
"version": 3
252+
},
253+
"file_extension": ".py",
254+
"mimetype": "text/x-python",
255+
"name": "python",
256+
"nbconvert_exporter": "python",
257+
"pygments_lexer": "ipython3",
258+
"version": "3.6.4"
259+
}
260+
},
261+
"nbformat": 4,
262+
"nbformat_minor": 2
263+
}

0 commit comments

Comments
 (0)