amazon project

l2k2 · l2k2 · commit 8e0fb9d2aef5 · 2018-09-13T04:56:11.000Z
diff --git a/amazon-reviews/amazon-bow.py b/amazon-reviews/amazon-bow.py
@@ -0,0 +1,44 @@
+"""Bag-of-words sentiment baseline for the Amazon review data.
+
+Encodes each review as a fixed-size bag-of-words vector and fits a single
+dense output unit on it, logging the run to wandb.
+"""
+import amazon
+import numpy as np
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation
+from keras.layers import Embedding, LSTM
+from keras.layers import Conv1D, Flatten
+from keras.preprocessing import text
+import wandb
+from wandb.keras import WandbCallback
+
+wandb.init()
+config = wandb.config
+config.vocab_size = 1000
+
+(train_summary, train_review_text, train_labels), (test_summary, test_review_text, test_labels) = amazon.load_amazon()
+
+# load_amazon() returns plain Python lists; hand Keras NumPy arrays so the
+# labels are read as one target vector rather than a list of separate targets.
+train_labels = np.array(train_labels)
+test_labels = np.array(test_labels)
+
+# Fit the vocabulary on the training reviews only, then encode both splits as
+# (num_reviews, vocab_size) matrices.
+tokenizer = text.Tokenizer(num_words=config.vocab_size)
+tokenizer.fit_on_texts(train_review_text)
+X_train = tokenizer.texts_to_matrix(train_review_text)
+X_test = tokenizer.texts_to_matrix(test_review_text)
+
+# Build the model
+model = Sequential()
+# A single output unit needs a sigmoid: softmax over one unit is always 1.0,
+# so the model could never predict the negative class.
+model.add(Dense(1, activation='sigmoid', input_shape=(config.vocab_size,)))
+
+model.compile(loss='binary_crossentropy',
+              optimizer='adam',
+              metrics=['accuracy'])
+model.fit(X_train, train_labels, epochs=10, validation_data=(X_test, test_labels),
+    callbacks=[WandbCallback()])
diff --git a/amazon-reviews/amazon.py b/amazon-reviews/amazon.py
@@ -0,0 +1,65 @@
+import os
+import json
+
+
+def load_amazon():
+    """Load the Video Games reviews and split them into train and test sets.
+
+    Reads ``reviews_Video_Games_5.json`` (one JSON object per line) from the
+    current working directory. Ratings of 4 or 5 are labelled 1, ratings of
+    1 or 2 are labelled 0, and 3-star reviews are skipped. Every line whose
+    index in the file is divisible by 10 goes to the test split; the rest go
+    to the training split.
+
+    Returns:
+        ``(train_summary, train_review_text, train_labels),
+        (test_summary, test_review_text, test_labels)`` -- two triples of
+        parallel lists.
+
+    Raises:
+        ValueError: if a record's ``overall`` rating is not 1, 2, 3, 4 or 5.
+    """
+    filename = 'reviews_Video_Games_5.json'
+    train_summary = []
+    train_review_text = []
+    train_labels = []
+
+    test_summary = []
+    test_review_text = []
+    test_labels = []
+
+    with open(filename, 'r') as f:
+        for (i, line) in enumerate(f):
+            data = json.loads(line)
+            overall = data['overall']
+
+            if overall == 3:
+                # Neutral review: skip it. A bare `next` is a no-op here and
+                # would let the record through with the previous label.
+                continue
+            elif overall in (4, 5):
+                label = 1
+            elif overall in (1, 2):
+                label = 0
+            else:
+                raise ValueError("Unexpected value " + str(overall))
+
+            summary = data['summary']
+            review_text = data['reviewText']
+
+            # i is the raw line index, so skipped lines still count here.
+            if (i % 10 == 0):
+                test_summary.append(summary)
+                test_review_text.append(review_text)
+                test_labels.append(label)
+            else:
+                train_summary.append(summary)
+                train_review_text.append(review_text)
+                train_labels.append(label)
+
+    return (train_summary, train_review_text, train_labels), (test_summary, test_review_text, test_labels)
+
+
+if __name__ == '__main__':
+    # Smoke test when run as a script; importing the module must not load
+    # the dataset as a side effect.
+    load_amazon()
diff --git a/amazon-reviews/download-amazon.sh b/amazon-reviews/download-amazon.sh
@@ -0,0 +1,2 @@
+wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Video_Games_5.json.gz  # fetch the compressed review dump
+gunzip reviews_Video_Games_5.json.gz  # plain gzip file: no tar-style "xvfz" flags, which gunzip would treat as a missing file
diff --git a/amazon-reviews/wandb/settings b/amazon-reviews/wandb/settings
@@ -0,0 +1,4 @@
+[default]
+entity: qualcomm
+project: amazon-sep13
+base_url: https://api.wandb.ai

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Video_Games_5.json.gz`
	`2`	`+gunzip reviews_Video_Games_5.json.gz`