Commit dd19e79
Author: l2k2
Added embedding
1 parent 2f5b222

File tree: 1 file changed, +75 -0 lines


keras-imdb/imdb-embedding.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
# Need to download the GloVe vectors first:
# wget http://nlp.stanford.edu/data/glove.6B.zip
# unzip glove.6B.zip

from keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
import numpy as np
import wandb
from wandb.keras import WandbCallback
import imdb  # local helper module that provides load_imdb()

wandb.init()
config = wandb.config

# set parameters:
config.vocab_size = 1000
config.maxlen = 300
config.batch_size = 32
config.embedding_dims = 100  # must match the GloVe file used below (glove.6B.100d.txt)
config.hidden_dims = 100
config.epochs = 10

(X_train, y_train), (X_test, y_test) = imdb.load_imdb()

# Convert raw review text to sequences of word indices. texts_to_sequences
# (rather than texts_to_matrix) is needed here: the Embedding layer expects
# integer index sequences, not a bag-of-words matrix.
tokenizer = text.Tokenizer(num_words=config.vocab_size)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad or truncate every review to exactly maxlen tokens.
X_train = sequence.pad_sequences(X_train, maxlen=config.maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=config.maxlen)
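# At this point X_train and X_test are integer arrays of shape
# (num_reviews, config.maxlen); each row is one padded review.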
# Build a word -> vector lookup from the pretrained 100d GloVe file.
embeddings_index = dict()
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Copy GloVe vectors into a (vocab_size, embedding_dims) matrix indexed by
# the tokenizer's word indices; words without a GloVe vector stay all zeros.
embedding_matrix = np.zeros((config.vocab_size, config.embedding_dims))
for word, index in tokenizer.word_index.items():
    if index > config.vocab_size - 1:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
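# Sanity check (optional): embedding_matrix.any(axis=1).mean() gives the
# fraction of vocabulary words that received a pretrained GloVe vector;
# rows left all-zero are out-of-vocabulary words.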
## create model
model = Sequential()
model.add(Embedding(config.vocab_size, config.embedding_dims,
                    input_length=config.maxlen,
                    weights=[embedding_matrix],
                    trainable=False))  # freeze the pretrained GloVe weights
model.add(LSTM(config.hidden_dims, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          batch_size=config.batch_size,
          epochs=config.epochs,
          validation_data=(X_test, y_test),
          callbacks=[WandbCallback()])
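Note: imdb.load_imdb() above comes from a local helper module that is not part
of this diff. A minimal sketch of what such a helper might look like, assuming
the raw reviews sit in the Stanford ACL IMDB directory layout
(aclImdb/{train,test}/{pos,neg}/*.txt); the path and layout here are
assumptions, not the repository's actual code:

import os
import numpy as np

def load_imdb(path='aclImdb'):
    """Return ((X_train, y_train), (X_test, y_test)) as lists of raw review
    strings plus 0/1 label arrays, mirroring what imdb-embedding.py expects.
    Directory layout is assumed to match the Stanford ACL IMDB download."""
    def read_split(split):
        texts, labels = [], []
        for label, sub in [(1, 'pos'), (0, 'neg')]:
            folder = os.path.join(path, split, sub)
            for name in sorted(os.listdir(folder)):
                with open(os.path.join(folder, name), encoding='utf-8') as f:
                    texts.append(f.read())
                labels.append(label)
        return texts, np.asarray(labels)
    return read_split('train'), read_split('test')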

0 commit comments
