|
4 | 4 | "cell_type": "code", |
5 | 5 | "execution_count": 1, |
6 | 6 | "metadata": { |
7 | | - "collapsed": true |
| 7 | + "tags": [] |
8 | 8 | }, |
9 | 9 | "outputs": [], |
10 | 10 | "source": [ |
|
15 | 15 | "cell_type": "code", |
16 | 16 | "execution_count": 2, |
17 | 17 | "metadata": { |
18 | | - "collapsed": true |
| 18 | + "tags": [] |
19 | 19 | }, |
20 | 20 | "outputs": [], |
21 | 21 | "source": [ |
|
31 | 31 | "cell_type": "code", |
32 | 32 | "execution_count": 3, |
33 | 33 | "metadata": { |
34 | | - "collapsed": true |
| 34 | + "tags": [] |
35 | 35 | }, |
36 | 36 | "outputs": [], |
37 | 37 | "source": [ |
|
43 | 43 | "cell_type": "code", |
44 | 44 | "execution_count": 4, |
45 | 45 | "metadata": { |
46 | | - "collapsed": true |
| 46 | + "tags": [] |
47 | 47 | }, |
48 | 48 | "outputs": [], |
49 | 49 | "source": [ |
|
82 | 82 | "output_type": "stream", |
83 | 83 | "text": [ |
84 | 84 | "Num words in dictionary: 32\n", |
85 | | - "27 This\n", |
86 | | - "20 That\n", |
87 | | - "21 as\n", |
88 | | - "8 ran\n", |
| 85 | + "0 .\n", |
| 86 | + "1 I\n", |
| 87 | + "2 love\n", |
| 88 | + "3 tacos\n", |
| 89 | + "4 She\n", |
| 90 | + "5 chicken\n", |
| 91 | + "6 ran\n", |
89 | 92 | "7 the\n", |
| 93 | + "8 with\n", |
| 94 | + "9 The\n", |
| 95 | + "10 a\n", |
| 96 | + "11 choose\n", |
| 97 | + "12 chooses\n", |
| 98 | + "13 do\n", |
90 | 99 | "14 me\n", |
91 | | - "15 do\n", |
92 | | - "12 The\n", |
93 | | - "28 pizza\n", |
| 100 | + "15 n't\n", |
| 101 | + "16 nap\n", |
| 102 | + "17 take\n", |
| 103 | + "18 to\n", |
| 104 | + "19 That\n", |
| 105 | + "20 as\n", |
| 106 | + "21 cream\n", |
| 107 | + "22 ice\n", |
94 | 108 | "23 is\n", |
95 | | - "11 n't\n", |
96 | | - "0 love\n", |
97 | | - "29 affront\n", |
98 | | - "17 to\n", |
99 | | - "1 .\n", |
100 | | - "25 ice\n", |
101 | | - "18 chooses\n", |
102 | | - "19 nice\n", |
103 | | - "9 choose\n", |
104 | | - "6 She\n", |
105 | | - "24 cream\n", |
106 | | - "22 man\n", |
107 | | - "16 a\n", |
108 | | - "2 tacos\n", |
109 | | - "3 I\n", |
110 | | - "4 chicken\n", |
111 | | - "5 with\n", |
112 | | - "10 nap\n", |
| 109 | + "24 man\n", |
| 110 | + "25 nice\n", |
| 111 | + "26 pie\n", |
| 112 | + "27 This\n", |
| 113 | + "28 affront\n", |
| 114 | + "29 an\n", |
113 | 115 | "30 nature\n", |
114 | | - "13 take\n", |
115 | | - "31 an\n", |
116 | | - "26 pie\n" |
| 116 | + "31 pizza\n" |
117 | 117 | ] |
118 | 118 | } |
119 | 119 | ], |
|
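The re-ordered id listing above reflects how newer gensim versions build a `Dictionary`: new tokens are assigned consecutive ids document by document, sorted alphabetically within each document, so the mapping is reproducible across runs (the old listing came from Python 3.5's unordered dict iteration). A minimal sketch of the same steps; only the first and last sentences are recoverable from the output above, the rest of the notebook's corpus is assumed:

```python
from gensim.corpora import Dictionary

# Illustrative two-document corpus; the notebook's other documents are not reproduced here.
docs = [["I", "love", "tacos", "."],
        ["This", "pizza", "is", "an", "affront", "to", "nature", "."]]

# New tokens get consecutive ids per document, sorted within each document.
dictionary = Dictionary(docs)
print(dictionary.token2id)        # {'.': 0, 'I': 1, 'love': 2, 'tacos': 3, ...}

# doc2bow converts a token list into sparse (token_id, count) pairs,
# matching the [(1, 1), (2, 3), (3, 1)] output further down.
print(dictionary.doc2bow(["I", "love", "love", "love", "tacos"]))
```
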
155 | 155 | "name": "stdout", |
156 | 156 | "output_type": "stream", |
157 | 157 | "text": [ |
158 | | - "8\n" |
| 158 | + "6\n" |
159 | 159 | ] |
160 | 160 | } |
161 | 161 | ], |
|
172 | 172 | "name": "stdout", |
173 | 173 | "output_type": "stream", |
174 | 174 | "text": [ |
175 | | - "[(0, 3), (2, 1), (3, 1)]\n" |
| 175 | + "[(1, 1), (2, 3), (3, 1)]\n" |
176 | 176 | ] |
177 | 177 | } |
178 | 178 | ], |
|
194 | 194 | "name": "stdout", |
195 | 195 | "output_type": "stream", |
196 | 196 | "text": [ |
197 | | - "[[(0, 1), (1, 1), (2, 1), (3, 1)], [(1, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(1, 2), (3, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(1, 1), (5, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(1, 1), (17, 1), (23, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)]]\n" |
| 197 | + "[[(0, 1), (1, 1), (2, 1), (3, 1)], [(0, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(0, 2), (1, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 2), (17, 1), (18, 1)], [(0, 1), (8, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)], [(0, 1), (18, 1), (23, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)]]\n" |
198 | 198 | ] |
199 | 199 | } |
200 | 200 | ], |
|
236 | 236 | "text": [ |
237 | 237 | "['I', 'love', 'tacos', '.']\n", |
238 | 238 | "[(0, 1), (1, 1), (2, 1), (3, 1)]\n", |
239 | | - "[(0, 0.6559486886294514), (2, 0.6559486886294514), (3, 0.37344696513776354)]\n" |
| 239 | + "[(1, 0.37344696513776354), (2, 0.6559486886294514), (3, 0.6559486886294514)]\n" |
240 | 240 | ] |
241 | 241 | } |
242 | 242 | ], |
|
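The tf-idf weights above follow gensim's `TfidfModel` defaults: raw term count times idf = log2(N/df), with the resulting document vector L2-normalized. Tokens whose idf is 0 drop out entirely, which is why id 0 ('.', present in all five documents) is missing from the tf-idf vector. A worked check for "I love tacos ." under those assumptions, with document frequencies read off the corpus printed earlier:

```python
import math

N = 5                                            # documents in the corpus
df = {"I": 2, "love": 1, "tacos": 1, ".": 5}     # document frequencies from the corpus above

weights = {w: math.log2(N / df[w]) for w in df}  # tf is 1 for every term in this sentence
weights = {w: v for w, v in weights.items() if v > 0}   # '.' has idf 0 and is dropped

norm = math.sqrt(sum(v * v for v in weights.values()))  # L2 normalization
print({w: round(v / norm, 4) for w, v in weights.items()})
# {'I': 0.3734, 'love': 0.6559, 'tacos': 0.6559}
```
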
260 | 260 | "name": "stdout", |
261 | 261 | "output_type": "stream", |
262 | 262 | "text": [ |
263 | | - "[(0, 1), (1, 1), (3, 1), (28, 1)]\n", |
264 | | - "[(0, 0.6559486886294514), (3, 0.37344696513776354), (28, 0.6559486886294514)]\n" |
| 263 | + "[(0, 1), (1, 1), (2, 1), (31, 1)]\n", |
| 264 | + "[(1, 0.37344696513776354), (2, 0.6559486886294514), (31, 0.6559486886294514)]\n" |
265 | 265 | ] |
266 | 266 | } |
267 | 267 | ], |
|
282 | 282 | "name": "stdout", |
283 | 283 | "output_type": "stream", |
284 | 284 | "text": [ |
285 | | - "Similarity index with 5 documents in 0 shards (stored under ~/Documents/nlp-python/similarity)\n" |
| 285 | + "Similarity index with 5 documents in 0 shards (stored under output/similarity)\n" |
286 | 286 | ] |
287 | 287 | } |
288 | 288 | ], |
289 | 289 | "source": [ |
290 | 290 | "# Create similarity measure object in tf-idf space\n", |
291 | 291 | "# First arg is temp external storage\n", |
292 | 292 | "# https://radimrehurek.com/gensim/similarities/docsim.html\n", |
293 | | - "sims = gensim.similarities.Similarity('~/Documents/nlp-python/similarity', tf_idf[corpus],\n", |
| 293 | + "import os\n", |
| 294 | + "os.makedirs('output', exist_ok=True)\n", |
| 295 | + "\n", |
| 296 | + "output_obj = os.path.join('output', 'similarity')\n", |
| 297 | + "sims = gensim.similarities.Similarity(output_obj, tf_idf[corpus],\n", |
294 | 298 | " num_features=len(dictionary))\n", |
295 | 299 | "print(sims)" |
296 | 300 | ] |
|
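The first argument to `gensim.similarities.Similarity` is a path prefix rather than a single temp file: as the index grows it is sharded into files named `<prefix>.0`, `<prefix>.1`, ..., which is why the updated code creates the `output/` directory up front. A hedged usage sketch, assuming the `sims`, `dictionary`, and `tf_idf` objects defined above:

```python
import os

# Persist the index object; its shard files stay under the 'output/similarity' prefix.
sims.save(os.path.join('output', 'similarity.index'))
sims = gensim.similarities.Similarity.load(os.path.join('output', 'similarity.index'))

# Ask only for the top-3 matches as (document index, cosine similarity) pairs
# instead of a dense score array over the whole corpus.
sims.num_best = 3
query_bow = dictionary.doc2bow("I love tacos .".split())
print(sims[tf_idf[query_bow]])    # list of (document index, score), best first
```
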
305 | 309 | "output_type": "stream", |
306 | 310 | "text": [ |
307 | 311 | "['chicken', 'with', 'taco', 'love']\n", |
308 | | - "[(0, 1), (4, 1), (5, 1)]\n", |
309 | | - "[(0, 0.6559486886294514), (4, 0.6559486886294514), (5, 0.37344696513776354)]\n" |
| 312 | + "[(2, 1), (5, 1), (8, 1)]\n", |
| 313 | + "[(2, 0.6559486886294514), (5, 0.6559486886294514), (8, 0.37344696513776354)]\n" |
310 | 314 | ] |
311 | 315 | } |
312 | 316 | ], |
|
328 | 332 | { |
329 | 333 | "data": { |
330 | 334 | "text/plain": [ |
331 | | - "array([ 0.4302687 , 0.41768694, 0. , 0.07687882, 0. ], dtype=float32)" |
| 335 | + "array([0.4302687 , 0.41768694, 0. , 0.07687882, 0. ],\n", |
| 336 | + " dtype=float32)" |
332 | 337 | ] |
333 | 338 | }, |
334 | 339 | "execution_count": 16, |
|
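The array in the last output is the cosine similarity between the query's tf-idf vector and each of the five indexed documents, in corpus order; zeros mean the query shares no weighted term with that document. Because gensim's tf-idf vectors are already L2-normalized, the score reduces to the dot product of overlapping term weights, which can be checked by hand from the sparse vectors printed above:

```python
import numpy as np

def dense(pairs, size=32):
    """Expand a sparse gensim (token_id, weight) list into a dense vector."""
    v = np.zeros(size)
    for i, w in pairs:
        v[i] = w
    return v

# tf-idf vectors printed earlier: the query and document 0 ("I love tacos .")
query = dense([(2, 0.6559486886294514), (5, 0.6559486886294514), (8, 0.37344696513776354)])
doc0  = dense([(1, 0.37344696513776354), (2, 0.6559486886294514), (3, 0.6559486886294514)])

cos = query @ doc0 / (np.linalg.norm(query) * np.linalg.norm(doc0))
print(round(cos, 7))   # 0.4302687 -- the first entry of sims[query_doc_tf_idf]
```
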
339 | 344 | "source": [ |
340 | 345 | "sims[query_doc_tf_idf]" |
341 | 346 | ] |
342 | | - }, |
343 | | - { |
344 | | - "cell_type": "code", |
345 | | - "execution_count": null, |
346 | | - "metadata": { |
347 | | - "collapsed": true |
348 | | - }, |
349 | | - "outputs": [], |
350 | | - "source": [] |
351 | 347 | } |
352 | 348 | ], |
353 | 349 | "metadata": { |
354 | 350 | "kernelspec": { |
355 | | - "display_name": "Python 3", |
| 351 | + "display_name": "Python 3 (ipykernel)", |
356 | 352 | "language": "python", |
357 | 353 | "name": "python3" |
358 | 354 | }, |
|
366 | 362 | "name": "python", |
367 | 363 | "nbconvert_exporter": "python", |
368 | 364 | "pygments_lexer": "ipython3", |
369 | | - "version": "3.5.2" |
| 365 | + "version": "3.8.10" |
370 | 366 | } |
371 | 367 | }, |
372 | 368 | "nbformat": 4, |
373 | | - "nbformat_minor": 2 |
| 369 | + "nbformat_minor": 4 |
374 | 370 | } |