
Commit b68acb4

Notebook fixed and cleaned (#1726)
* Notebook fixed and cleaned
* Comment reformatted
1 parent 96f5109 commit b68acb4

File tree

1 file changed (+74 −118 lines)

introduction_to_amazon_algorithms/blazingtext_word2vec_text8/blazingtext_word2vec_text8.ipynb

Lines changed: 74 additions & 118 deletions
@@ -34,8 +34,7 @@
     "## Setup\n",
     "\n",
     "Let's start by specifying:\n",
-    "\n",
-    "- The S3 bucket and prefix that you want to use for training and model data. This should be within the same region as the Notebook Instance, training, and hosting. If you don't specify a bucket, SageMaker SDK will create a default bucket following a pre-defined naming convention in the same region. \n",
+    "- The S3 buckets and prefixes that you want to use for saving model data and for reading the training data. These should be within the same region as the Notebook Instance, training, and hosting. If you don't specify a bucket, the SageMaker SDK will create a default bucket following a pre-defined naming convention in the same region.\n",
     "- The IAM role ARN used to give SageMaker access to your data. It can be fetched using the **get_execution_role** method from the sagemaker Python SDK."
    ]
   },
@@ -55,11 +54,14 @@
     "sess = sagemaker.Session()\n",
     "\n",
     "role = get_execution_role()\n",
-    "print(role) # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf\n",
+    "print(role)  # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf\n",
+    "\n",
+    "output_bucket = sess.default_bucket()  # Replace with your own bucket name if needed\n",
+    "print(output_bucket)\n",
+    "output_prefix = \"sagemaker/DEMO-blazingtext-text8\"  # Replace with the prefix under which you want to store the data if needed\n",
     "\n",
-    "bucket = sess.default_bucket() # Replace with your own bucket name if needed\n",
-    "print(bucket)\n",
-    "prefix = 'sagemaker/DEMO-blazingtext-text8' #Replace with the prefix under which you want to store the data if needed"
+    "data_bucket = \"penny-cache-alpha-us-west-2\"  # Replace with the bucket where your data is located\n",
+    "data_prefix = \"1p-notebooks/data/text8\""
    ]
   },
   {
@@ -68,9 +70,7 @@
    "source": [
     "### Data Ingestion\n",
     "\n",
-    "Next, we download a dataset from the web on which we want to train the word vectors. BlazingText expects a single preprocessed text file with space separated tokens and each line of the file should contain a single sentence.\n",
-    "\n",
-    "In this example, let us train the vectors on [text8](http://mattmahoney.net/dc/textdata.html) dataset (100 MB), which is a small (already preprocessed) version of Wikipedia dump. "
+    "BlazingText expects a single preprocessed text file with space-separated tokens, where each line of the file contains a single sentence. In this example, let us train the vectors on the [text8](http://mattmahoney.net/dc/textdata.html) dataset (100 MB), which is a small (already preprocessed) version of a Wikipedia dump. The data has already been downloaded from [here](http://mattmahoney.net/dc/text8.zip), uncompressed, and stored in an S3 bucket."
    ]
   },
   {
@@ -79,41 +79,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!wget http://mattmahoney.net/dc/text8.zip -O text8.gz"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "# Uncompressing\n",
-    "!gzip -d text8.gz -f"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "After the data downloading and uncompressing is complete, we need to upload it to S3 so that it can be consumed by SageMaker to execute training jobs. We'll use Python SDK to upload these two files to the bucket and prefix location that we have set above."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "train_channel = prefix + '/train'\n",
-    "\n",
-    "sess.upload_data(path='text8', bucket=bucket, key_prefix=train_channel)\n",
+    "train_channel = f\"{data_prefix}/train\"\n",
     "\n",
-    "s3_train_data = 's3://{}/{}'.format(bucket, train_channel)"
+    "s3_train_data = f\"s3://{data_bucket}/{train_channel}\""
    ]
   },
   {
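Note: since this commit switches from downloading text8 at runtime to reading it from a pre-staged bucket, it is worth confirming the object actually exists before launching a training job. A minimal sketch with boto3, reusing the `data_bucket` and `train_channel` names defined above (the check itself is not part of the notebook):

```python
import boto3

# Confirm the text8 corpus is present under the training prefix
# before kicking off a training job against it.
s3_client = boto3.client("s3")
resp = s3_client.list_objects_v2(Bucket=data_bucket, Prefix=train_channel, MaxKeys=5)

assert resp.get("KeyCount", 0) > 0, f"no training data found at {s3_train_data}"
for obj in resp.get("Contents", []):
    print(obj["Key"], obj["Size"])
```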
@@ -126,12 +94,10 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)"
+    "s3_output_location = f\"s3://{output_bucket}/{output_prefix}/output\""
    ]
   },
   {
@@ -145,9 +111,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "region_name = boto3.Session().region_name"
@@ -160,7 +124,7 @@
    "outputs": [],
    "source": [
     "container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, \"blazingtext\", \"latest\")\n",
-    "print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))"
+    "print(f\"Using SageMaker BlazingText container: {container} ({region_name})\")"
    ]
   },
   {
@@ -211,20 +175,20 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "bt_model = sagemaker.estimator.Estimator(container,\n",
-    "                                         role, \n",
-    "                                         train_instance_count=2, \n",
-    "                                         train_instance_type='ml.c4.2xlarge',\n",
-    "                                         train_volume_size = 5,\n",
-    "                                         train_max_run = 360000,\n",
-    "                                         input_mode= 'File',\n",
-    "                                         output_path=s3_output_location,\n",
-    "                                         sagemaker_session=sess)"
+    "bt_model = sagemaker.estimator.Estimator(\n",
+    "    container,\n",
+    "    role,\n",
+    "    train_instance_count=2,\n",
+    "    train_instance_type=\"ml.c4.2xlarge\",\n",
+    "    train_volume_size=5,\n",
+    "    train_max_run=360000,\n",
+    "    input_mode=\"File\",\n",
+    "    output_path=s3_output_location,\n",
+    "    sagemaker_session=sess,\n",
+    ")"
    ]
   },
   {
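The cell that actually starts training is unchanged by this commit and so does not appear in the diff; with the v1 SDK estimator above, it is presumably just a `fit` call on the channels defined a few cells further down (a sketch, not the notebook's literal cell):

```python
# Launch the training job on the two ml.c4.2xlarge instances configured
# above; `data_channels` is built in a later cell of this notebook.
bt_model.fit(inputs=data_channels, logs=True)
```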
@@ -237,22 +201,22 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "bt_model.set_hyperparameters(mode=\"batch_skipgram\",\n",
-    "                             epochs=5,\n",
-    "                             min_count=5,\n",
-    "                             sampling_threshold=0.0001,\n",
-    "                             learning_rate=0.05,\n",
-    "                             window_size=5,\n",
-    "                             vector_dim=100,\n",
-    "                             negative_samples=5,\n",
-    "                             batch_size=11, # = (2*window_size + 1) (Preferred. Used only if mode is batch_skipgram)\n",
-    "                             evaluation=True,# Perform similarity evaluation on WS-353 dataset at the end of training\n",
-    "                             subwords=False) # Subword embedding learning is not supported by batch_skipgram"
+    "bt_model.set_hyperparameters(\n",
+    "    mode=\"batch_skipgram\",\n",
+    "    epochs=5,\n",
+    "    min_count=5,\n",
+    "    sampling_threshold=0.0001,\n",
+    "    learning_rate=0.05,\n",
+    "    window_size=5,\n",
+    "    vector_dim=100,\n",
+    "    negative_samples=5,\n",
+    "    batch_size=11,  # = (2*window_size + 1) (Preferred. Used only if mode is batch_skipgram)\n",
+    "    evaluation=True,  # Perform similarity evaluation on WS-353 dataset at the end of training\n",
+    "    subwords=False,\n",
+    ")  # Subword embedding learning is not supported by batch_skipgram"
    ]
   },
   {
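The `batch_size` comment encodes a simple relationship: for `batch_skipgram`, the preferred batch size is `2 * window_size + 1`, so with `window_size=5` that is 2 * 5 + 1 = 11 (one center word plus the full context window on either side), which matches the value set above.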
@@ -265,14 +229,13 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "train_data = sagemaker.session.s3_input(s3_train_data, distribution='FullyReplicated', \n",
-    "                                        content_type='text/plain', s3_data_type='S3Prefix')\n",
-    "data_channels = {'train': train_data}"
+    "train_data = sagemaker.session.s3_input(\n",
+    "    s3_train_data, distribution=\"FullyReplicated\", content_type=\"text/plain\", s3_data_type=\"S3Prefix\"\n",
+    ")\n",
+    "data_channels = {\"train\": train_data}"
    ]
   },
   {
@@ -307,7 +270,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "bt_endpoint = bt_model.deploy(initial_instance_count = 1,instance_type = 'ml.m4.xlarge')"
+    "bt_endpoint = bt_model.deploy(initial_instance_count=1, instance_type=\"ml.m4.xlarge\")"
    ]
   },
   {
@@ -328,16 +291,16 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "words = [\"awesome\", \"blazing\"]\n",
     "\n",
-    "payload = {\"instances\" : words}\n",
+    "payload = {\"instances\": words}\n",
     "\n",
-    "response = bt_endpoint.predict(json.dumps(payload))\n",
+    "response = bt_endpoint.predict(\n",
+    "    json.dumps(payload), initial_args={\"ContentType\": \"application/json\", \"Accept\": \"application/json\"}\n",
+    ")\n",
     "\n",
     "vecs = json.loads(response)\n",
     "print(vecs)"
@@ -367,15 +330,13 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "s3 = boto3.resource('s3')\n",
+    "s3 = boto3.resource(\"s3\")\n",
     "\n",
-    "key = bt_model.model_data[bt_model.model_data.find(\"/\", 5)+1:]\n",
-    "s3.Bucket(bucket).download_file(key, 'model.tar.gz')"
+    "key = bt_model.model_data[bt_model.model_data.find(\"/\", 5) + 1 :]\n",
+    "s3.Bucket(output_bucket).download_file(key, \"model.tar.gz\")"
    ]
   },
   {
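The downloaded `model.tar.gz` is unpacked by an unchanged cell before `eval.json` and `vectors.txt` are read below; outside the notebook, the equivalent extraction step would be something like:

```python
import tarfile

# Unpack the trained model artifact; the later cells assume it yields
# vectors.txt (the embeddings) and eval.json (the WS-353 evaluation scores).
with tarfile.open("model.tar.gz", "r:gz") as tar:
    tar.extractall()
```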
@@ -410,9 +371,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "!cat eval.json"
@@ -428,9 +387,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import numpy as np\n",
@@ -441,7 +398,7 @@
     "\n",
     "first_line = True\n",
     "index_to_word = []\n",
-    "with open(\"vectors.txt\",\"r\") as f:\n",
+    "with open(\"vectors.txt\", \"r\") as f:\n",
     "    for line_num, line in enumerate(f):\n",
     "        if first_line:\n",
     "            dim = int(line.strip().split()[1])\n",
@@ -450,7 +407,7 @@
     "            continue\n",
     "        line = line.strip()\n",
     "        word = line.split()[0]\n",
-    "        vec = word_vecs[line_num-1]\n",
+    "        vec = word_vecs[line_num - 1]\n",
     "        for index, vec_val in enumerate(line.split()[1:]):\n",
     "            vec[index] = float(vec_val)\n",
     "        index_to_word.append(word)\n",
@@ -462,14 +419,12 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.manifold import TSNE\n",
     "\n",
-    "tsne = TSNE(perplexity=40, n_components=2, init='pca', n_iter=10000)\n",
+    "tsne = TSNE(perplexity=40, n_components=2, init=\"pca\", n_iter=10000)\n",
     "two_d_embeddings = tsne.fit_transform(word_vecs[:num_points])\n",
     "labels = index_to_word[:num_points]"
    ]
@@ -484,14 +439,16 @@
     "%matplotlib inline\n",
     "\n",
     "def plot(embeddings, labels):\n",
-    "    pylab.figure(figsize=(20,20))\n",
+    "    pylab.figure(figsize=(20, 20))\n",
     "    for i, label in enumerate(labels):\n",
-    "        x, y = embeddings[i,:]\n",
+    "        x, y = embeddings[i, :]\n",
     "        pylab.scatter(x, y)\n",
-    "        pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',\n",
-    "                       ha='right', va='bottom')\n",
+    "        pylab.annotate(\n",
+    "            label, xy=(x, y), xytext=(5, 2), textcoords=\"offset points\", ha=\"right\", va=\"bottom\"\n",
+    "        )\n",
     "    pylab.show()\n",
     "\n",
+    "\n",
     "plot(two_d_embeddings, labels)"
    ]
   },
@@ -520,20 +477,19 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "sess.delete_endpoint(bt_endpoint.endpoint)"
    ]
   }
  ],
  "metadata": {
+  "instance_type": "ml.t3.medium",
   "kernelspec": {
-   "display_name": "conda_python3",
+   "display_name": "Python 3 (Data Science)",
    "language": "python",
-   "name": "conda_python3"
+   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0"
   },
   "language_info": {
    "codemirror_mode": {
@@ -545,10 +501,10 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.2"
+   "version": "3.7.6"
   },
   "notice": "Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
