|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +# Author: Angela Chapman |
| 4 | +# Date: 8/6/2014 |
| 5 | +# |
| 6 | +# This file contains code to accompany the Kaggle tutorial |
| 7 | +# "Deep learning goes to the movies". The code in this file |
| 8 | +# is for Part 1 of the tutorial on Natural Language Processing. |
| 9 | +# |
| 10 | +# *************************************** # |
| 11 | + |
| 12 | +import os |
| 13 | +from sklearn.feature_extraction.text import CountVectorizer |
| 14 | +from sklearn.ensemble import RandomForestClassifier |
| 15 | +from KaggleWord2VecUtility import KaggleWord2VecUtility |
| 16 | +from sklearn.feature_extraction.text import TfidfVectorizer |
| 17 | +from sklearn import cross_validation |
| 18 | +from sklearn.datasets import make_multilabel_classification |
| 19 | +import pandas as pd |
| 20 | +import numpy as np |
| 21 | +import json |
| 22 | +import sys |
| 23 | +import time |
| 24 | + |
| 25 | +from collections import defaultdict |
| 26 | +from sklearn.cluster import KMeans |
| 27 | +from numpy.random import RandomState |
| 28 | +from sklearn.ensemble import RandomForestClassifier |
| 29 | +from sklearn.linear_model import LogisticRegression |
| 30 | +from sklearn.linear_model import SGDClassifier |
| 31 | +rng = RandomState(42) |
| 32 | + |
| 33 | +from sklearn.decomposition import PCA |
| 34 | +from sklearn.decomposition import SparsePCA |
| 35 | +from sklearn.multiclass import OneVsRestClassifier |
| 36 | +from sklearn.svm import SVC |
| 37 | +from sklearn.svm import LinearSVC |
| 38 | +from sklearn.naive_bayes import GaussianNB |
| 39 | +from sklearn.naive_bayes import MultinomialNB |
| 40 | + |
| 41 | +from sklearn.metrics import f1_score |
| 42 | + |
def load_json(path):
    """Return the object parsed from the JSON file at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon
    as parsing finishes rather than being left open.
    """
    with open(path) as handle:
        return json.load(handle)


def collect_tags(tag_string, tag_dic):
    """Map the whitespace-separated tags in *tag_string* through *tag_dic*.

    Tags that are not keys of *tag_dic* are dropped. The order (and any
    repetition) of the remaining tags is preserved.
    """
    return [tag_dic[tag] for tag in tag_string.split() if tag in tag_dic]


def build_dataset(records, tag_dic):
    """Convert a loaded record mapping into ``(documents, labels)``.

    For every value of *records*, fields 0 and 1 are concatenated, cleaned
    with ``KaggleWord2VecUtility.review_to_wordlist(..., True)`` and joined
    back into one space-separated string; field 3 is split on whitespace and
    mapped through *tag_dic* (see ``collect_tags``).

    Returns two parallel lists: the cleaned documents and, for each one, its
    list of mapped tags (possibly empty).
    """
    documents = []
    labels = []
    for record in records.values():
        # NOTE(review): the two text fields are joined without a separator,
        # exactly as before -- confirm that fusing the last word of field 0
        # with the first word of field 1 is intended.
        words = KaggleWord2VecUtility.review_to_wordlist(
            record[0] + record[1], True)
        documents.append(" ".join(words))
        labels.append(collect_tags(record[3], tag_dic))
    return documents, labels


def labels_from_scores(scores, class_set):
    """Turn decision-function scores into one label list per sample.

    For 2-D *scores* (one row per sample, one column per entry of
    *class_set*) every class whose score is positive is selected. When no
    score in a row is positive, the single highest-scoring class is used so
    that each sample receives at least one label.

    For 1-D *scores* (one value per sample) a positive score selects the
    last entry of *class_set* and any other score selects the first one.
    NOTE(review): this assumes the usual binary convention in which a
    positive score votes for ``classes_[1]`` -- confirm for the classifier
    in use.
    """
    scores = np.asarray(scores)
    if scores.ndim == 1:
        return [[class_set[-1] if score > 0 else class_set[0]]
                for score in scores]

    predictions = []
    for row in scores:
        # Look at every column, not a fixed number of them, so the result
        # is correct for any number of classes.
        chosen = [class_set[col] for col in np.flatnonzero(row > 0)]
        if not chosen:
            chosen.append(class_set[np.argmax(row)])
        predictions.append(chosen)
    return predictions


def main(argv):
    """Train a one-vs-rest LinearSVC on bag-of-words counts and report F1.

    *argv* is the full command line: ``argv[1]`` is the training JSON file,
    ``argv[2]`` the test JSON file and ``argv[3]`` the JSON tag dictionary.
    The predicted label list of every test sample is printed, followed by
    the F1 score of the predictions against the test labels.
    """
    if len(argv) < 4:
        sys.exit("usage: %s TRAIN_JSON TEST_JSON TAG_DICT_JSON" % argv[0])

    train = load_json(argv[1])
    test = load_json(argv[2])
    tag_dic = load_json(argv[3])

    traindata, Y1 = build_dataset(train, tag_dic)
    testdata, Y2 = build_dataset(test, tag_dic)

    # Bag-of-words term counts; min_df=0.001 drops terms that occur in fewer
    # than 0.1% of the documents. The vocabulary is fitted on the training
    # and test documents together, as in the original script.
    vectorizer = CountVectorizer(min_df=0.001)
    lentrain = len(traindata)
    X = vectorizer.fit_transform(traindata + testdata)

    # Dense arrays are kept so that estimators which cannot take sparse
    # input (e.g. GaussianNB) can be swapped in for experiments.
    X1 = X[:lentrain].toarray()
    X2 = X[lentrain:].toarray()

    # Other base estimators that were tried here: GaussianNB, SGDClassifier,
    # RandomForestClassifier(n_estimators=100), MultinomialNB.
    clf = LinearSVC(random_state=0)

    # NOTE(review): the labels are passed as one list of tags per sample;
    # confirm that the installed scikit-learn still accepts this format.
    classif = OneVsRestClassifier(clf).fit(X1, Y1)
    Y3 = labels_from_scores(classif.decision_function(X2), classif.classes_)

    for predicted in Y3:
        print(predicted)
    # f1_score expects (y_true, y_pred): ground truth first, predictions second.
    print(f1_score(Y2, Y3))


if __name__ == '__main__':
    main(sys.argv)
0 commit comments