Skip to content

Commit 36abee1

Browse files
committed
Merge branch 'master' of github.com:weichenzhao/CS544_Project into HEAD
2 parents 77991cc + a9887f6 commit 36abee1

File tree

1 file changed

+134
-0
lines changed

1 file changed

+134
-0
lines changed

proba.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
#!/usr/bin/env python
2+
3+
# Author: Angela Chapman
4+
# Date: 8/6/2014
5+
#
6+
# This file contains code to accompany the Kaggle tutorial
7+
# "Deep learning goes to the movies". The code in this file
8+
# is for Part 1 of the tutorial on Natural Language Processing.
9+
#
10+
# *************************************** #
11+
12+
import os
13+
from sklearn.feature_extraction.text import CountVectorizer
14+
from sklearn.ensemble import RandomForestClassifier
15+
from KaggleWord2VecUtility import KaggleWord2VecUtility
16+
from sklearn.feature_extraction.text import TfidfVectorizer
17+
from sklearn import cross_validation
18+
from sklearn.datasets import make_multilabel_classification
19+
import pandas as pd
20+
import numpy as np
21+
import json
22+
import sys
23+
import time
24+
25+
from collections import defaultdict
26+
from sklearn.cluster import KMeans
27+
from numpy.random import RandomState
28+
from sklearn.ensemble import RandomForestClassifier
29+
from sklearn.linear_model import LogisticRegression
30+
from sklearn.linear_model import SGDClassifier
31+
# Module-level NumPy RandomState seeded with 42 for reproducibility.
# NOTE(review): nothing in the visible script references `rng` -- confirm
# whether it is still needed.
rng = RandomState(42)
32+
33+
from sklearn.decomposition import PCA
34+
from sklearn.decomposition import SparsePCA
35+
from sklearn.multiclass import OneVsRestClassifier
36+
from sklearn.svm import SVC
37+
from sklearn.svm import LinearSVC
38+
from sklearn.naive_bayes import GaussianNB
39+
from sklearn.naive_bayes import MultinomialNB
40+
41+
from sklearn.metrics import f1_score
42+
43+
def tags_to_label_ids(tag_string, tag_dic):
    """Map a whitespace-separated tag string to label ids.

    Each token of ``tag_string`` that is a key of ``tag_dic`` contributes
    ``tag_dic[token]`` to the result; unknown tokens are skipped.  Order and
    duplicates are kept exactly as they appear in ``tag_string``.
    """
    return [tag_dic[tag] for tag in tag_string.split() if tag in tag_dic]


def load_corpus(path, tag_dic):
    """Load a JSON corpus and return ``(texts, labels)``.

    The file must hold a JSON object whose values are indexable records.
    Fields 0 and 1 are concatenated and cleaned into the document text and
    field 3 is a whitespace-separated tag string.
    NOTE(review): fields 0/1 look like title and body -- confirm against the
    code that produces these files.

    ``texts`` is a list of cleaned, space-joined documents and ``labels`` the
    matching list of label-id lists (see ``tags_to_label_ids``).
    """
    # Context manager so the handle is closed even if parsing fails.
    with open(path) as handle:
        records = json.load(handle)

    texts = []
    labels = []
    for key in records:
        record = records[key]
        # Second argument is passed as True, as in the original script
        # (presumably the stop-word removal switch of the Kaggle helper --
        # confirm against KaggleWord2VecUtility).
        words = KaggleWord2VecUtility.review_to_wordlist(
            record[0] + record[1], True)
        texts.append(" ".join(words))
        labels.append(tags_to_label_ids(record[3], tag_dic))
    return texts, labels


def labels_from_scores(scores, class_set):
    """Convert decision scores into one list of predicted labels per sample.

    ``scores`` is either a 2-D array with one row per sample and one column
    per entry of ``class_set``, or a 1-D array, which is treated as a single
    column.  A label is predicted when its score is strictly positive.  When
    no score in a row is positive, the label with the highest score is
    predicted instead, so every sample receives at least one label.
    """
    scores = np.asarray(scores)
    if scores.ndim == 1:
        # Same rule as the 2-D case, applied to a single label column.
        scores = scores.reshape(-1, 1)

    predictions = []
    for row in scores:
        # Look at every column instead of a hard-coded count of classes.
        chosen = [class_set[k] for k, value in enumerate(row) if value > 0]
        if not chosen:
            chosen = [class_set[int(np.argmax(row))]]
        predictions.append(chosen)
    return predictions


if __name__ == '__main__':
    # Local import: only the script entry point needs it, and the file-level
    # import block does not provide it.
    from sklearn.preprocessing import MultiLabelBinarizer

    input1 = sys.argv[1]  # train data
    input2 = sys.argv[2]  # test data
    tagdic = sys.argv[3]  # dictionary for tag 2000

    with open(tagdic) as handle:
        tag_dic = json.load(handle)

    # Cleaned documents and their label-id lists for both splits.
    traindata, Y1 = load_corpus(input1, tag_dic)
    testdata, Y2 = load_corpus(input2, tag_dic)

    # ****** Create a bag of words from the training set
    #
    # CountVectorizer produces raw term counts (bag of words, not tf-idf).
    # min_df=0.001 drops terms that occur in fewer than 0.1% of documents.
    vectorizer = CountVectorizer(min_df=0.001)

    # The vocabulary is fitted on train and test text together, then the
    # matrix is split back at the train/test boundary.
    X_all = traindata + testdata
    lentrain = len(traindata)
    X = vectorizer.fit_transform(X_all)

    X1 = X[:lentrain].toarray()
    X2 = X[lentrain:].toarray()

    # Alternatives tried previously: GaussianNB, SGDClassifier,
    # RandomForestClassifier(n_estimators=100), MultinomialNB.
    clf = LinearSVC(random_state=0)

    # Encode the label lists as a binary indicator matrix; raw lists of lists
    # are a legacy multilabel format that newer scikit-learn releases reject.
    mlb = MultiLabelBinarizer()
    classif = OneVsRestClassifier(clf).fit(X1, mlb.fit_transform(Y1))
    # Column k of the decision scores corresponds to class_set[k].
    class_set = mlb.classes_.tolist()
    scores = classif.decision_function(X2)
    Y3 = labels_from_scores(scores, class_set)

    for predicted in Y3:
        print(predicted)

    # Score over the union of true and predicted labels.  Ground truth goes
    # first: f1_score expects (y_true, y_pred), and support-based weighting
    # is taken from y_true.
    score_mlb = MultiLabelBinarizer().fit(Y2 + Y3)
    print(f1_score(score_mlb.transform(Y2), score_mlb.transform(Y3),
                   average='weighted'))

0 commit comments

Comments
 (0)