# GloVe vectors need to be downloaded from http://nlp.stanford.edu/data/glove.6B.zip first:
# wget http://nlp.stanford.edu/data/glove.6B.zip
# unzip glove.6B.zip

from keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
import numpy as np

import wandb
from wandb.keras import WandbCallback

import imdb  # this script's own imdb helper module (not keras.datasets.imdb), which returns the raw review texts

wandb.init()
config = wandb.config

# hyperparameters (logged to the wandb run)
config.vocab_size = 1000
config.maxlen = 300
config.batch_size = 32
config.embedding_dims = 100  # must match the GloVe file used below (glove.6B.100d.txt)
config.hidden_dims = 100
config.epochs = 10

(X_train, y_train), (X_test, y_test) = imdb.load_imdb()

# convert each review to a sequence of word indices, keeping only the
# vocab_size most frequent words
tokenizer = text.Tokenizer(num_words=config.vocab_size)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# pad/truncate every review to a fixed length for the Embedding layer
X_train = sequence.pad_sequences(X_train, maxlen=config.maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=config.maxlen)
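
# Optional sanity check (not in the original script): both padded arrays
# should now have shape (num_reviews, config.maxlen).
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)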

# build a word -> vector lookup from the pretrained 100-dimensional GloVe file
embeddings_index = dict()
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
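
# Optional sanity check (not in the original script): report how many of the
# tokenizer's top words actually have a pretrained vector; low coverage would
# leave most rows of the embedding matrix at zero.
covered = sum(1 for word, index in tokenizer.word_index.items()
              if index < config.vocab_size and word in embeddings_index)
print("GloVe coverage: %d / %d words" % (covered, config.vocab_size - 1))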

# copy pretrained vectors into a matrix indexed by the tokenizer's word
# indices; row 0 is reserved for padding, and words missing from GloVe keep
# an all-zeros row
embedding_matrix = np.zeros((config.vocab_size, config.embedding_dims))
for word, index in tokenizer.word_index.items():
    if index > config.vocab_size - 1:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

## create model: frozen pretrained embeddings -> LSTM -> sigmoid classifier
model = Sequential()
model.add(Embedding(config.vocab_size,
                    config.embedding_dims,
                    input_length=config.maxlen,
                    weights=[embedding_matrix],
                    trainable=False))
model.add(LSTM(config.hidden_dims, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
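
# Optional (not in the original script): print the layer stack and parameter
# counts; the Embedding parameters should show up as non-trainable.
model.summary()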

# train; WandbCallback streams training and validation metrics to the wandb run
model.fit(X_train, y_train,
          batch_size=config.batch_size,
          epochs=config.epochs,
          validation_data=(X_test, y_test),
          callbacks=[WandbCallback()])
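
# Optional follow-up (assumption, not part of the original script): report
# final test-set metrics and save the trained model locally.
loss, acc = model.evaluate(X_test, y_test, batch_size=config.batch_size)
print("test loss: %.4f, test accuracy: %.4f" % (loss, acc))
model.save("imdb-glove-lstm.h5")  # hypothetical filename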