Skip to content

Commit 078147b

Browse files
Updates to Chapter 4 notebooks
1 parent 76aa4af commit 078147b

File tree

3 files changed

+739
-314
lines changed

3 files changed

+739
-314
lines changed

chapter_4/0401_tf-idf_Gensim.ipynb

Lines changed: 51 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"cell_type": "code",
55
"execution_count": 1,
66
"metadata": {
7-
"collapsed": true
7+
"tags": []
88
},
99
"outputs": [],
1010
"source": [
@@ -15,7 +15,7 @@
1515
"cell_type": "code",
1616
"execution_count": 2,
1717
"metadata": {
18-
"collapsed": true
18+
"tags": []
1919
},
2020
"outputs": [],
2121
"source": [
@@ -31,7 +31,7 @@
3131
"cell_type": "code",
3232
"execution_count": 3,
3333
"metadata": {
34-
"collapsed": true
34+
"tags": []
3535
},
3636
"outputs": [],
3737
"source": [
@@ -43,7 +43,7 @@
4343
"cell_type": "code",
4444
"execution_count": 4,
4545
"metadata": {
46-
"collapsed": true
46+
"tags": []
4747
},
4848
"outputs": [],
4949
"source": [
@@ -82,38 +82,38 @@
8282
"output_type": "stream",
8383
"text": [
8484
"Num words in dictionary: 32\n",
85-
"27 This\n",
86-
"20 That\n",
87-
"21 as\n",
88-
"8 ran\n",
85+
"0 .\n",
86+
"1 I\n",
87+
"2 love\n",
88+
"3 tacos\n",
89+
"4 She\n",
90+
"5 chicken\n",
91+
"6 ran\n",
8992
"7 the\n",
93+
"8 with\n",
94+
"9 The\n",
95+
"10 a\n",
96+
"11 choose\n",
97+
"12 chooses\n",
98+
"13 do\n",
9099
"14 me\n",
91-
"15 do\n",
92-
"12 The\n",
93-
"28 pizza\n",
100+
"15 n't\n",
101+
"16 nap\n",
102+
"17 take\n",
103+
"18 to\n",
104+
"19 That\n",
105+
"20 as\n",
106+
"21 cream\n",
107+
"22 ice\n",
94108
"23 is\n",
95-
"11 n't\n",
96-
"0 love\n",
97-
"29 affront\n",
98-
"17 to\n",
99-
"1 .\n",
100-
"25 ice\n",
101-
"18 chooses\n",
102-
"19 nice\n",
103-
"9 choose\n",
104-
"6 She\n",
105-
"24 cream\n",
106-
"22 man\n",
107-
"16 a\n",
108-
"2 tacos\n",
109-
"3 I\n",
110-
"4 chicken\n",
111-
"5 with\n",
112-
"10 nap\n",
109+
"24 man\n",
110+
"25 nice\n",
111+
"26 pie\n",
112+
"27 This\n",
113+
"28 affront\n",
114+
"29 an\n",
113115
"30 nature\n",
114-
"13 take\n",
115-
"31 an\n",
116-
"26 pie\n"
116+
"31 pizza\n"
117117
]
118118
}
119119
],
@@ -155,7 +155,7 @@
155155
"name": "stdout",
156156
"output_type": "stream",
157157
"text": [
158-
"8\n"
158+
"6\n"
159159
]
160160
}
161161
],
@@ -172,7 +172,7 @@
172172
"name": "stdout",
173173
"output_type": "stream",
174174
"text": [
175-
"[(0, 3), (2, 1), (3, 1)]\n"
175+
"[(1, 1), (2, 3), (3, 1)]\n"
176176
]
177177
}
178178
],
@@ -194,7 +194,7 @@
194194
"name": "stdout",
195195
"output_type": "stream",
196196
"text": [
197-
"[[(0, 1), (1, 1), (2, 1), (3, 1)], [(1, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(1, 2), (3, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(1, 1), (5, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(1, 1), (17, 1), (23, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)]]\n"
197+
"[[(0, 1), (1, 1), (2, 1), (3, 1)], [(0, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(0, 2), (1, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 2), (17, 1), (18, 1)], [(0, 1), (8, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(0, 1), (18, 1), (23, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)]]\n"
198198
]
199199
}
200200
],
@@ -236,7 +236,7 @@
236236
"text": [
237237
"['I', 'love', 'tacos', '.']\n",
238238
"[(0, 1), (1, 1), (2, 1), (3, 1)]\n",
239-
"[(0, 0.6559486886294514), (2, 0.6559486886294514), (3, 0.37344696513776354)]\n"
239+
"[(1, 0.37344696513776354), (2, 0.6559486886294514), (3, 0.6559486886294514)]\n"
240240
]
241241
}
242242
],
@@ -260,8 +260,8 @@
260260
"name": "stdout",
261261
"output_type": "stream",
262262
"text": [
263-
"[(0, 1), (1, 1), (3, 1), (28, 1)]\n",
264-
"[(0, 0.6559486886294514), (3, 0.37344696513776354), (28, 0.6559486886294514)]\n"
263+
"[(0, 1), (1, 1), (2, 1), (31, 1)]\n",
264+
"[(1, 0.37344696513776354), (2, 0.6559486886294514), (31, 0.6559486886294514)]\n"
265265
]
266266
}
267267
],
@@ -282,15 +282,19 @@
282282
"name": "stdout",
283283
"output_type": "stream",
284284
"text": [
285-
"Similarity index with 5 documents in 0 shards (stored under ~/Documents/nlp-python/similarity)\n"
285+
"Similarity index with 5 documents in 0 shards (stored under output/similarity)\n"
286286
]
287287
}
288288
],
289289
"source": [
290290
"# Create similarity measure object in tf-idf space\n",
291291
"# First arg is temp external storage\n",
292292
"# https://radimrehurek.com/gensim/similarities/docsim.html\n",
293-
"sims = gensim.similarities.Similarity('~/Documents/nlp-python/similarity', tf_idf[corpus],\n",
293+
"import os\n",
294+
"os.makedirs('output', exist_ok=True)\n",
295+
"\n",
296+
"output_obj = os.path.join('output', 'similarity')\n",
297+
"sims = gensim.similarities.Similarity(output_obj, tf_idf[corpus],\n",
294298
" num_features=len(dictionary))\n",
295299
"print(sims)"
296300
]
@@ -305,8 +309,8 @@
305309
"output_type": "stream",
306310
"text": [
307311
"['chicken', 'with', 'taco', 'love']\n",
308-
"[(0, 1), (4, 1), (5, 1)]\n",
309-
"[(0, 0.6559486886294514), (4, 0.6559486886294514), (5, 0.37344696513776354)]\n"
312+
"[(2, 1), (5, 1), (8, 1)]\n",
313+
"[(2, 0.6559486886294514), (5, 0.6559486886294514), (8, 0.37344696513776354)]\n"
310314
]
311315
}
312316
],
@@ -328,7 +332,8 @@
328332
{
329333
"data": {
330334
"text/plain": [
331-
"array([ 0.4302687 , 0.41768694, 0. , 0.07687882, 0. ], dtype=float32)"
335+
"array([0.4302687 , 0.41768694, 0. , 0.07687882, 0. ],\n",
336+
" dtype=float32)"
332337
]
333338
},
334339
"execution_count": 16,
@@ -339,20 +344,11 @@
339344
"source": [
340345
"sims[query_doc_tf_idf]"
341346
]
342-
},
343-
{
344-
"cell_type": "code",
345-
"execution_count": null,
346-
"metadata": {
347-
"collapsed": true
348-
},
349-
"outputs": [],
350-
"source": []
351347
}
352348
],
353349
"metadata": {
354350
"kernelspec": {
355-
"display_name": "Python 3",
351+
"display_name": "Python 3 (ipykernel)",
356352
"language": "python",
357353
"name": "python3"
358354
},
@@ -366,9 +362,9 @@
366362
"name": "python",
367363
"nbconvert_exporter": "python",
368364
"pygments_lexer": "ipython3",
369-
"version": "3.5.2"
365+
"version": "3.8.10"
370366
}
371367
},
372368
"nbformat": 4,
373-
"nbformat_minor": 2
369+
"nbformat_minor": 4
374370
}

0 commit comments

Comments
 (0)