|
7 | 7 | "collapsed": true,
|
8 | 8 | "jupyter": {
|
9 | 9 | "outputs_hidden": true
|
10 |
| - }, |
11 |
| - "ExecuteTime": { |
12 |
| - "end_time": "2024-07-31T12:57:37.296030Z", |
13 |
| - "start_time": "2024-07-31T12:57:37.292368Z" |
14 | 10 | }
|
15 | 11 | },
|
16 | 12 | "source": "# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt",
|
17 | 13 | "outputs": [],
|
18 |
| - "execution_count": 1 |
| 14 | + "execution_count": null |
19 | 15 | },
|
20 | 16 | {
|
21 | 17 | "cell_type": "code",
|
22 | 18 | "id": "3b1e507015ba6b81",
|
23 |
| - "metadata": { |
24 |
| - "ExecuteTime": { |
25 |
| - "end_time": "2024-07-31T12:57:37.317651Z", |
26 |
| - "start_time": "2024-07-31T12:57:37.313808Z" |
27 |
| - } |
28 |
| - }, |
| 19 | + "metadata": {}, |
29 | 20 | "source": [
|
30 | 21 | "with open('input.txt', 'r', encoding='utf-8') as f:\n",
|
31 | 22 | " text = f.read()"
|
32 | 23 | ],
|
33 | 24 | "outputs": [],
|
34 |
| - "execution_count": 2 |
| 25 | + "execution_count": null |
35 | 26 | },
|
36 | 27 | {
|
37 | 28 | "cell_type": "code",
|
38 | 29 | "id": "ac8e51ae5bbfcae7",
|
39 |
| - "metadata": { |
40 |
| - "ExecuteTime": { |
41 |
| - "end_time": "2024-07-31T12:57:40.488939Z", |
42 |
| - "start_time": "2024-07-31T12:57:37.319486Z" |
43 |
| - } |
44 |
| - }, |
| 30 | + "metadata": {}, |
45 | 31 | "source": [
|
46 | 32 | "from transformers import AutoTokenizer\n",
|
47 | 33 | "\n",
|
48 | 34 | "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
|
49 | 35 | "\n",
|
50 | 36 | "tokens = tokenizer.encode(text, add_special_tokens=False)"
|
51 | 37 | ],
|
52 |
| - "outputs": [ |
53 |
| - { |
54 |
| - "name": "stderr", |
55 |
| - "output_type": "stream", |
56 |
| - "text": [ |
57 |
| - "Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors\n" |
58 |
| - ] |
59 |
| - } |
60 |
| - ], |
61 |
| - "execution_count": 3 |
| 38 | + "outputs": [], |
| 39 | + "execution_count": null |
62 | 40 | },
|
63 | 41 | {
|
64 | 42 | "cell_type": "code",
|
65 | 43 | "id": "aeefcdf813e427e",
|
66 |
| - "metadata": { |
67 |
| - "ExecuteTime": { |
68 |
| - "end_time": "2024-07-31T12:57:40.495510Z", |
69 |
| - "start_time": "2024-07-31T12:57:40.490341Z" |
70 |
| - } |
71 |
| - }, |
| 44 | + "metadata": {}, |
72 | 45 | "source": [
|
73 | 46 | "context_length = 512\n",
|
74 | 47 | "batch_size = 2"
|
75 | 48 | ],
|
76 | 49 | "outputs": [],
|
77 |
| - "execution_count": 4 |
| 50 | + "execution_count": null |
78 | 51 | },
|
79 | 52 | {
|
80 | 53 | "cell_type": "code",
|
81 | 54 | "id": "a384b42274f008a2",
|
82 |
| - "metadata": { |
83 |
| - "ExecuteTime": { |
84 |
| - "end_time": "2024-07-31T12:57:40.522050Z", |
85 |
| - "start_time": "2024-07-31T12:57:40.496842Z" |
86 |
| - } |
87 |
| - }, |
| 55 | + "metadata": {}, |
88 | 56 | "source": [
|
89 | 57 | "num_batches = len(tokens) // (batch_size * context_length)\n",
|
90 | 58 | "tokens = tokens[:num_batches * batch_size * context_length]"
|
91 | 59 | ],
|
92 | 60 | "outputs": [],
|
93 |
| - "execution_count": 5 |
| 61 | + "execution_count": null |
94 | 62 | },
|
95 | 63 | {
|
96 | 64 | "cell_type": "code",
|
97 | 65 | "id": "5c4cc78ac1a02c1d",
|
98 |
| - "metadata": { |
99 |
| - "ExecuteTime": { |
100 |
| - "end_time": "2024-07-31T12:57:40.592272Z", |
101 |
| - "start_time": "2024-07-31T12:57:40.524063Z" |
102 |
| - } |
103 |
| - }, |
| 66 | + "metadata": {}, |
104 | 67 | "source": [
|
105 | 68 | "import torch\n",
|
106 | 69 | "\n",
|
107 | 70 | "input_ids = torch.tensor(tokens).view(-1, context_length)"
|
108 | 71 | ],
|
109 | 72 | "outputs": [],
|
110 |
| - "execution_count": 6 |
| 73 | + "execution_count": null |
111 | 74 | },
|
112 | 75 | {
|
113 | 76 | "cell_type": "code",
|
114 | 77 | "id": "7037fd75e2161382",
|
115 |
| - "metadata": { |
116 |
| - "ExecuteTime": { |
117 |
| - "end_time": "2024-07-31T12:57:40.601199Z", |
118 |
| - "start_time": "2024-07-31T12:57:40.593250Z" |
119 |
| - } |
120 |
| - }, |
| 78 | + "metadata": {}, |
121 | 79 | "source": [
|
122 | 80 | "from torch.utils.data import DataLoader, TensorDataset\n",
|
123 | 81 | "from torch.optim import Adam\n",
|
|
137 | 95 | "test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)"
|
138 | 96 | ],
|
139 | 97 | "outputs": [],
|
140 |
| - "execution_count": 7 |
| 98 | + "execution_count": null |
141 | 99 | },
|
142 | 100 | {
|
143 | 101 | "cell_type": "code",
|
144 | 102 | "id": "a98b7baa064b8494",
|
145 |
| - "metadata": { |
146 |
| - "ExecuteTime": { |
147 |
| - "end_time": "2024-07-31T12:57:41.577878Z", |
148 |
| - "start_time": "2024-07-31T12:57:40.602187Z" |
149 |
| - } |
150 |
| - }, |
| 103 | + "metadata": {}, |
151 | 104 | "source": [
|
152 | 105 | "from labml_nn.transformers.LoRA.GPT2 import GPTModel\n",
|
153 | 106 | "\n",
|
|
157 | 110 | "_ = model.load_state_dict(state_dict, strict=False)"
|
158 | 111 | ],
|
159 | 112 | "outputs": [],
|
160 |
| - "execution_count": 8 |
| 113 | + "execution_count": null |
161 | 114 | },
|
162 | 115 | {
|
163 |
| - "metadata": { |
164 |
| - "ExecuteTime": { |
165 |
| - "end_time": "2024-07-31T12:57:43.098187Z", |
166 |
| - "start_time": "2024-07-31T12:57:41.578713Z" |
167 |
| - } |
168 |
| - }, |
| 116 | + "metadata": {}, |
169 | 117 | "cell_type": "code",
|
170 | 118 | "source": [
|
171 | 119 | "device = \"cuda\"\n",
|
172 | 120 | "model = model.to(device=\"cuda\")"
|
173 | 121 | ],
|
174 | 122 | "id": "2e0fa8b3082df716",
|
175 | 123 | "outputs": [],
|
176 |
| - "execution_count": 9 |
| 124 | + "execution_count": null |
177 | 125 | },
|
178 | 126 | {
|
179 | 127 | "cell_type": "code",
|
180 | 128 | "id": "e2f5076894770740",
|
181 |
| - "metadata": { |
182 |
| - "ExecuteTime": { |
183 |
| - "end_time": "2024-07-31T12:57:57.044755Z", |
184 |
| - "start_time": "2024-07-31T12:57:43.099050Z" |
185 |
| - } |
186 |
| - }, |
| 129 | + "metadata": {}, |
187 | 130 | "source": [
|
188 | 131 | "from labml import tracker, experiment\n",
|
189 | 132 | "\n",
|
|
236 | 179 | "\n",
|
237 | 180 | "print(\"Training complete.\")"
|
238 | 181 | ],
|
239 |
| - "outputs": [ |
240 |
| - { |
241 |
| - "data": { |
242 |
| - "text/plain": [ |
243 |
| - "<IPython.core.display.HTML object>" |
244 |
| - ], |
245 |
| - "text/html": [ |
246 |
| - "<pre style=\"overflow-x: scroll;\">\n", |
247 |
| - "<strong><span style=\"text-decoration: underline\">LoRA.GPT2</span></strong>: <span style=\"color: #208FFB\">7a14822c4f3c11efad8354ef33f17c7c</span>\n", |
248 |
| - "\t[dirty]: <strong><span style=\"color: #DDB62B\">\"training loop\"</span></strong>\n", |
249 |
| - "<span style=\"color: #208FFB\">Monitor experiment at </span><a href='http://localhost:5005/run/7a14822c4f3c11efad8354ef33f17c7c' target='blank'>http://localhost:5005/run/7a14822c4f3c11efad8354ef33f17c7c</a>\n", |
250 |
| - "<strong><span style=\"color: #DDB62B\">Still updating labml server, please wait for it to complete...</span></strong></pre>" |
251 |
| - ] |
252 |
| - }, |
253 |
| - "metadata": {}, |
254 |
| - "output_type": "display_data" |
255 |
| - }, |
256 |
| - { |
257 |
| - "ename": "KeyboardInterrupt", |
258 |
| - "evalue": "", |
259 |
| - "output_type": "error", |
260 |
| - "traceback": [ |
261 |
| - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", |
262 |
| - "\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)", |
263 |
| - "Cell \u001B[0;32mIn[10], line 25\u001B[0m\n\u001B[1;32m 22\u001B[0m loss \u001B[38;5;241m=\u001B[39m criterion(shift_logits\u001B[38;5;241m.\u001B[39mreshape(\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m, shift_logits\u001B[38;5;241m.\u001B[39msize(\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m)), shift_labels\u001B[38;5;241m.\u001B[39mreshape(\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m))\n\u001B[1;32m 24\u001B[0m optimizer\u001B[38;5;241m.\u001B[39mzero_grad()\n\u001B[0;32m---> 25\u001B[0m loss\u001B[38;5;241m.\u001B[39mbackward()\n\u001B[1;32m 26\u001B[0m optimizer\u001B[38;5;241m.\u001B[39mstep()\n\u001B[1;32m 28\u001B[0m tracker\u001B[38;5;241m.\u001B[39msave(step, {\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mloss\u001B[39m\u001B[38;5;124m'\u001B[39m: loss})\n", |
264 |
| - "File \u001B[0;32m~/miniconda3/lib/python3.12/site-packages/torch/_tensor.py:521\u001B[0m, in \u001B[0;36mTensor.backward\u001B[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001B[0m\n\u001B[1;32m 511\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m has_torch_function_unary(\u001B[38;5;28mself\u001B[39m):\n\u001B[1;32m 512\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m handle_torch_function(\n\u001B[1;32m 513\u001B[0m Tensor\u001B[38;5;241m.\u001B[39mbackward,\n\u001B[1;32m 514\u001B[0m (\u001B[38;5;28mself\u001B[39m,),\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 519\u001B[0m inputs\u001B[38;5;241m=\u001B[39minputs,\n\u001B[1;32m 520\u001B[0m )\n\u001B[0;32m--> 521\u001B[0m torch\u001B[38;5;241m.\u001B[39mautograd\u001B[38;5;241m.\u001B[39mbackward(\n\u001B[1;32m 522\u001B[0m \u001B[38;5;28mself\u001B[39m, gradient, retain_graph, create_graph, inputs\u001B[38;5;241m=\u001B[39minputs\n\u001B[1;32m 523\u001B[0m )\n", |
265 |
| - "File \u001B[0;32m~/miniconda3/lib/python3.12/site-packages/torch/autograd/__init__.py:289\u001B[0m, in \u001B[0;36mbackward\u001B[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001B[0m\n\u001B[1;32m 284\u001B[0m retain_graph \u001B[38;5;241m=\u001B[39m create_graph\n\u001B[1;32m 286\u001B[0m \u001B[38;5;66;03m# The reason we repeat the same comment below is that\u001B[39;00m\n\u001B[1;32m 287\u001B[0m \u001B[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001B[39;00m\n\u001B[1;32m 288\u001B[0m \u001B[38;5;66;03m# calls in the traceback and some print out the last line\u001B[39;00m\n\u001B[0;32m--> 289\u001B[0m _engine_run_backward(\n\u001B[1;32m 290\u001B[0m tensors,\n\u001B[1;32m 291\u001B[0m grad_tensors_,\n\u001B[1;32m 292\u001B[0m retain_graph,\n\u001B[1;32m 293\u001B[0m create_graph,\n\u001B[1;32m 294\u001B[0m inputs,\n\u001B[1;32m 295\u001B[0m allow_unreachable\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m,\n\u001B[1;32m 296\u001B[0m accumulate_grad\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m,\n\u001B[1;32m 297\u001B[0m )\n", |
266 |
| - "File \u001B[0;32m~/miniconda3/lib/python3.12/site-packages/torch/autograd/graph.py:768\u001B[0m, in \u001B[0;36m_engine_run_backward\u001B[0;34m(t_outputs, *args, **kwargs)\u001B[0m\n\u001B[1;32m 766\u001B[0m unregister_hooks \u001B[38;5;241m=\u001B[39m _register_logging_hooks_on_whole_graph(t_outputs)\n\u001B[1;32m 767\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 768\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m Variable\u001B[38;5;241m.\u001B[39m_execution_engine\u001B[38;5;241m.\u001B[39mrun_backward( \u001B[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001B[39;00m\n\u001B[1;32m 769\u001B[0m t_outputs, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs\n\u001B[1;32m 770\u001B[0m ) \u001B[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001B[39;00m\n\u001B[1;32m 771\u001B[0m \u001B[38;5;28;01mfinally\u001B[39;00m:\n\u001B[1;32m 772\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m attach_logging_hooks:\n", |
267 |
| - "\u001B[0;31mKeyboardInterrupt\u001B[0m: " |
268 |
| - ] |
269 |
| - } |
270 |
| - ], |
271 |
| - "execution_count": 10 |
| 182 | + "outputs": [], |
| 183 | + "execution_count": null |
272 | 184 | },
|
273 | 185 | {
|
274 | 186 | "cell_type": "code",
|
275 | 187 | "id": "da2d4023002648dc",
|
276 |
| - "metadata": { |
277 |
| - "ExecuteTime": { |
278 |
| - "end_time": "2024-07-31T12:57:57.046254Z", |
279 |
| - "start_time": "2024-07-31T12:57:57.045954Z" |
280 |
| - } |
281 |
| - }, |
| 188 | + "metadata": {}, |
282 | 189 | "source": [],
|
283 | 190 | "outputs": [],
|
284 | 191 | "execution_count": null
|
|
0 commit comments