Skip to content

Commit 5a08a64

Browse files
committed
mainly recover done
1 parent 0a78aaf commit 5a08a64

File tree

2 files changed

+214
-0
lines changed

2 files changed

+214
-0
lines changed
28.3 KB
Binary file not shown.

named-entity-recognition/test.ipynb

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"source": [
6+
"## Basics: run the token-classification model on one record"
7+
],
8+
"metadata": {
9+
"collapsed": false
10+
},
11+
"id": "25f8626cab6817c8"
12+
},
13+
{
14+
"cell_type": "code",
15+
"outputs": [],
16+
"source": [
17+
"import transformers\n",
18+
"from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig\n",
19+
"import torch\n",
20+
"import pandas as pd"
21+
],
22+
"metadata": {
23+
"collapsed": false,
24+
"ExecuteTime": {
25+
"end_time": "2024-03-08T09:36:47.796581Z",
26+
"start_time": "2024-03-08T09:36:46.945726Z"
27+
}
28+
},
29+
"id": "7f0e5ae5c024c461",
30+
"execution_count": 1
31+
},
32+
{
33+
"cell_type": "markdown",
34+
"source": [
35+
"### Get data: load the `Free_text` column from `sorted_data.xlsx`"
36+
],
37+
"metadata": {
38+
"collapsed": false
39+
},
40+
"id": "7933ae0171e249e"
41+
},
42+
{
43+
"cell_type": "code",
44+
"outputs": [],
45+
"source": [
46+
"data = pd.read_excel('sorted_data.xlsx')\n",
47+
"# print(data.head())\n",
48+
"# print(data.columns.tolist())\n",
49+
"data = data['Free_text'].tolist()\n"
50+
],
51+
"metadata": {
52+
"collapsed": false,
53+
"ExecuteTime": {
54+
"end_time": "2024-03-08T09:37:09.367308Z",
55+
"start_time": "2024-03-08T09:37:09.354349Z"
56+
}
57+
},
58+
"id": "e3e5bb19464c8b66",
59+
"execution_count": 6
60+
},
61+
{
62+
"cell_type": "markdown",
63+
"source": [
64+
"### Get model and tokenizer: load both from `./output/LOCAL`"
65+
],
66+
"metadata": {
67+
"collapsed": false
68+
},
69+
"id": "ae852fc09bb6848f"
70+
},
71+
{
72+
"cell_type": "code",
73+
"outputs": [],
74+
"source": [
75+
"model = AutoModelForTokenClassification.from_pretrained('./output/LOCAL')\n",
76+
"tokenizer = AutoTokenizer.from_pretrained('./output/LOCAL')"
77+
],
78+
"metadata": {
79+
"collapsed": false,
80+
"ExecuteTime": {
81+
"end_time": "2024-03-08T09:36:51.859405Z",
82+
"start_time": "2024-03-08T09:36:51.249490Z"
83+
}
84+
},
85+
"id": "25e8ae479df83166",
86+
"execution_count": 3
87+
},
88+
{
89+
"cell_type": "code",
90+
"outputs": [
91+
{
92+
"data": {
93+
"text/plain": "{'input_ids': tensor([[ 101, 1185, 1227, 1155, 1200, 19310, 122, 119, 13093, 8050,\n 26601, 8766, 16430, 7903, 1114, 185, 1513, 12602, 1105, 3575,\n 1899, 1116, 113, 189, 3491, 2539, 110, 117, 1899, 185,\n 23415, 7301, 4527, 117, 2393, 1377, 178, 1324, 1665, 1344,\n 1133, 3489, 4366, 117, 174, 1403, 2087, 1197, 4252, 1320,\n 1406, 27914, 113, 19137, 168, 3135, 1571, 20581, 1604, 119,\n 124, 114, 185, 119, 170, 1559, 1545, 1495, 168, 194,\n 1559, 22433, 4935, 2087, 4426, 4490, 114, 114, 1106, 185,\n 1513, 12602, 117, 20844, 5815, 1105, 2394, 2050, 14196, 15029,\n 118, 170, 8057, 6105, 13292, 1121, 128, 120, 1429, 120,\n 1406, 113, 13753, 3549, 1106, 1476, 1306, 1403, 1113, 1765,\n 120, 1429, 120, 1406, 114, 118, 172, 1204, 2728, 16337,\n 1571, 1775, 18202, 24400, 1336, 17881, 1477, 135, 174, 3329,\n 130, 120, 130, 120, 1659, 118, 5952, 1200, 18202, 16480,\n 1559, 1659, 120, 130, 120, 1659, 135, 174, 3329, 130,\n 120, 122, 120, 1695, 118, 2393, 4060, 1777, 1610, 4043,\n 1643, 16236, 1394, 1121, 1406, 120, 122, 120, 1695, 1106,\n 1743, 120, 125, 120, 1695, 118, 192, 1830, 3740, 179,\n 10038, 17881, 1495, 118, 176, 5521, 6617, 1777, 16405, 1610,\n 4043, 1643, 16236, 1394, 1121, 127, 120, 128, 120, 1695,\n 1106, 1429, 120, 122, 120, 1572, 123, 119, 184, 8997,\n 1811, 172, 6834, 1204, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}"
94+
},
95+
"execution_count": 7,
96+
"metadata": {},
97+
"output_type": "execute_result"
98+
}
99+
],
100+
"source": [
101+
"sentence = data[0]\n",
102+
"tokenized_data = tokenizer(sentence, return_tensors='pt')\n",
103+
"tokenized_data"
104+
],
105+
"metadata": {
106+
"collapsed": false,
107+
"ExecuteTime": {
108+
"end_time": "2024-03-08T09:39:13.527003Z",
109+
"start_time": "2024-03-08T09:39:13.520415Z"
110+
}
111+
},
112+
"id": "b5c152a3428a6c26",
113+
"execution_count": 7
114+
},
115+
{
116+
"cell_type": "code",
117+
"outputs": [
118+
{
119+
"data": {
120+
"text/plain": "TokenClassifierOutput(loss=None, logits=tensor([[[ 0.3010, 0.2078, -1.3391, ..., 0.1134, -0.0254, -1.2513],\n [ 4.5767, -1.4675, -2.1649, ..., -0.7354, 0.6201, -1.0496],\n [ 3.5306, -1.3734, -1.3194, ..., -0.3207, 0.9904, -0.8301],\n ...,\n [ 1.6698, -0.2334, -1.4185, ..., -0.9062, 1.3581, -0.1205],\n [ 1.4431, -0.0586, -1.2554, ..., -0.6175, 0.3569, -1.0394],\n [-2.6067, -0.9548, -0.9691, ..., 1.9412, 0.1351, 1.0405]]],\n grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)"
121+
},
122+
"execution_count": 8,
123+
"metadata": {},
124+
"output_type": "execute_result"
125+
}
126+
],
127+
"source": [
128+
"output = model(**tokenized_data)\n",
129+
"output"
130+
],
131+
"metadata": {
132+
"collapsed": false,
133+
"ExecuteTime": {
134+
"end_time": "2024-03-08T09:39:15.380730Z",
135+
"start_time": "2024-03-08T09:39:15.210929Z"
136+
}
137+
},
138+
"id": "af6ce4f241f6ca13",
139+
"execution_count": 8
140+
},
141+
{
142+
"cell_type": "code",
143+
"outputs": [],
144+
"source": [
145+
"predictions = torch.argmax(output.logits, dim=2)\n",
146+
"predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]"
147+
],
148+
"metadata": {
149+
"collapsed": false,
150+
"ExecuteTime": {
151+
"end_time": "2024-03-08T09:42:34.536653Z",
152+
"start_time": "2024-03-08T09:42:34.523534Z"
153+
}
154+
},
155+
"id": "9d1ede37d979192c",
156+
"execution_count": 9
157+
},
158+
{
159+
"cell_type": "code",
160+
"outputs": [
161+
{
162+
"data": {
163+
"text/plain": "['B-G-bio',\n 'O',\n 'O',\n 'O',\n 'O',\n 'O',\n 'B-P-bio',\n 'B-P-bio',\n 'B-D-bio',\n 'I-D-bio',\n 'I-D-bio',\n 'I-D-bio',\n 'I-D-bio',\n 'I-D-bio',\n 'B-S-bio',\n 'B-S-bio',\n 'B-D-bio',\n 'B-S-bio',\n 'B-S-bio',\n 'B-S-bio',\n 'B-S-bio',\n 'B-S-bio',\n 'B-S-bio',\n 'B-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'B-S-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'B-S-bio',\n 'B-S-bio',\n 'B-D-bio',\n 'O',\n 'B-S-bio',\n 'O',\n 'O',\n 'B-S-bio',\n 'B-P-bio',\n 'B-P-bio',\n 'B-P-bio',\n 'O',\n 'B-P-bio',\n 'B-P-bio',\n 'B-P-bio',\n 'B-P-bio',\n 'B-P-bio',\n 'B-D-bio',\n 'O',\n 'O',\n 'O',\n 'O',\n 'O',\n 'B-S-bio',\n 'O',\n 'O',\n 'B-S-bio',\n 'O',\n 'O',\n 'O',\n 'B-G-bio',\n 'O',\n 'O',\n 'O',\n 'O',\n 'O',\n 'O',\n 'B-G-bio',\n 'B-P-bio',\n 'B-P-bio',\n 'I-G-bio',\n 'O',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'O',\n 'O',\n 'B-P-bio',\n 'O',\n 'O',\n 'O',\n 'O',\n 'O',\n 'O',\n 'B-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'I-G-bio',\n 'O',\n 'O',\n 'O',\n 'O',\n 'O',\n 'B-P-bio',\n 'O',\n 'O',\n 'O',\n 'O',\n 'O',\n 'O',\n 'O',\n 'B-P-bio',\n 'B-P-bio',\n 'B-P-bio',\n 'B-P-bio',\n 'I-P-bio',\n 'I-P-bio',\n 'I-P-bio',\n 'I-P-bio',\n 'B-P-bio',\n 'B-D-bio',\n 'O',\n 'O',\n 'O',\n 'O',\n 'O',\n 'B-S-bio',\n 'O',\n 'O',\n 'O',\n 'O',\n 'O',\n 'B-S-bio',\n 'B-P-bio',\n 'B-P-bio',\n 
'B-P-bio',\n 'O',\n 'O',\n 'O',\n 'O',\n 'B-G-bio',\n 'B-P-bio',\n 'I-P-bio',\n 'I-P-bio',\n 'I-P-bio',\n 'I-P-bio',\n 'I-P-bio',\n 'I-P-bio',\n 'I-P-bio',\n 'I-P-bio',\n 'B-P-bio',\n 'B-D-bio',\n 'O',\n 'O',\n 'O',\n 'O',\n 'O',\n 'B-S-bio',\n 'O',\n 'O',\n 'O',\n 'O',\n 'O',\n 'B-G-bio',\n 'B-P-bio',\n 'O',\n 'O',\n 'B-A-bio',\n 'O',\n 'O',\n 'O',\n 'I-G-bio']"
164+
},
165+
"execution_count": 10,
166+
"metadata": {},
167+
"output_type": "execute_result"
168+
}
169+
],
170+
"source": [
171+
"predicted_token_class"
172+
],
173+
"metadata": {
174+
"collapsed": false,
175+
"ExecuteTime": {
176+
"end_time": "2024-03-08T09:42:39.935793Z",
177+
"start_time": "2024-03-08T09:42:39.931404Z"
178+
}
179+
},
180+
"id": "8395e45b064ca4c",
181+
"execution_count": 10
182+
},
183+
{
184+
"cell_type": "code",
185+
"outputs": [],
186+
"source": [],
187+
"metadata": {
188+
"collapsed": false
189+
},
190+
"id": "dc3216a709d68fe8"
191+
}
192+
],
193+
"metadata": {
194+
"kernelspec": {
195+
"name": "openai",
196+
"language": "python",
197+
"display_name": "openai"
198+
},
199+
"language_info": {
200+
"codemirror_mode": {
201+
"name": "ipython",
202+
"version": 2
203+
},
204+
"file_extension": ".py",
205+
"mimetype": "text/x-python",
206+
"name": "python",
207+
"nbconvert_exporter": "python",
208+
"pygments_lexer": "ipython2",
209+
"version": "2.7.6"
210+
}
211+
},
212+
"nbformat": 4,
213+
"nbformat_minor": 5
214+
}

0 commit comments

Comments
 (0)