@@ -50,7 +50,7 @@ public RedditClassifier() {
5050 domainEncoder .setProbes (1 );
5151 }
5252
53- public RedditClassifier (int poolSize , int noOfFeatures ) {
53+ public RedditClassifier (final int poolSize , final int noOfFeatures ) {
5454 this .noOfFeatures = noOfFeatures ;
5555 classifier = new AdaptiveLogisticRegression (2 , noOfFeatures , new L2 ());
5656 classifier .setPoolSize (poolSize );
@@ -60,7 +60,7 @@ public RedditClassifier(int poolSize, int noOfFeatures) {
6060 domainEncoder .setProbes (1 );
6161 }
6262
63- public void trainClassifier (String fileName ) throws IOException {
63+ public void trainClassifier (final String fileName ) throws IOException {
6464 final List <NamedVector > vectors = extractVectors (readDataFile (fileName ));
6565 final int size = vectors .size ();
6666 final int noOfTraining = (int ) (size * 0.8 );
@@ -77,7 +77,7 @@ public void trainClassifier(String fileName) throws IOException {
7777 evaluateClassifier (testData );
7878 }
7979
80- public Vector convertPost (String title , String domain , int hour ) {
80+ public Vector convertPost (final String title , final String domain , final int hour ) {
8181 final Vector vector = new RandomAccessSparseVector (noOfFeatures );
8282 final List <String > words = Splitter .onPattern ("\\ W" ).omitEmptyStrings ().splitToList (title );
8383 vector .set (0 , hour );
@@ -89,7 +89,7 @@ public Vector convertPost(String title, String domain, int hour) {
8989 return vector ;
9090 }
9191
92- public int classify (Vector features ) {
92+ public int classify (final Vector features ) {
9393 if (learner == null ) {
9494 learner = classifier .getBest ().getPayload ().getLearner ();
9595 }
@@ -102,7 +102,7 @@ public double getAccuracy() {
102102
103103 // ==== Private methods
104104
105- private void evaluateClassifier (List <NamedVector > vectors ) throws IOException {
105+ private void evaluateClassifier (final List <NamedVector > vectors ) throws IOException {
106106 int category , result ;
107107 int correct = 0 ;
108108 int wrong = 0 ;
@@ -125,7 +125,7 @@ private void evaluateClassifier(List<NamedVector> vectors) throws IOException {
125125 this .accuracy = correct / (wrong + correct + 0.0 );
126126 }
127127
128- private List <String > readDataFile (String fileName ) throws IOException {
128+ private List <String > readDataFile (final String fileName ) throws IOException {
129129 List <String > lines = Files .readLines (new File (fileName ), Charset .forName ("utf-8" ));
130130 if ((lines == null ) || (lines .size () == 0 )) {
131131 new RedditDataCollector ().collectData ();
@@ -135,33 +135,44 @@ private List<String> readDataFile(String fileName) throws IOException {
135135 return lines ;
136136 }
137137
138- private List <NamedVector > extractVectors (List <String > lines ) {
138+ private List <NamedVector > extractVectors (final List <String > lines ) {
139139 final List <NamedVector > vectors = new ArrayList <NamedVector >(lines .size ());
140140 for (final String line : lines ) {
141141 vectors .add (extractVector (line ));
142142 }
143143 return vectors ;
144144 }
145145
146- private NamedVector extractVector (String line ) {
146+ private NamedVector extractVector (final String line ) {
147147 final String [] items = line .split ("," );
148- final String category = extractCategory (Integer .parseInt (items [0 ]));
148+ final String numberOfVotes = items [0 ];
149+ final String time = items [1 ];
150+ final String numberOfWordInTitle = items [2 ];
151+ final String title = items [3 ];
152+ final String theRootDomain = items [4 ];
153+
154+ final String category = extractCategory (Integer .parseInt (numberOfVotes ));
155+
149156 final NamedVector vector = new NamedVector (new RandomAccessSparseVector (noOfFeatures ), category );
150- final Calendar cal = Calendar .getInstance (TimeZone .getTimeZone ("GMT" ));
151- cal .setTimeInMillis (Long .parseLong (items [1 ]) * 1000 );
152157
158+ final Calendar cal = Calendar .getInstance (TimeZone .getTimeZone ("GMT" ));
159+ cal .setTimeInMillis (Long .parseLong (time ) * 1000 );
153160 vector .set (0 , cal .get (Calendar .HOUR_OF_DAY )); // hour of day
154- vector .set (1 , Integer .parseInt (items [2 ])); // number of words in the title
155- domainEncoder .addToVector (items [4 ], vector );
156- final String [] words = items [3 ].split (" " );
161+
162+ vector .set (1 , Integer .parseInt (numberOfWordInTitle )); // number of words in the title
163+
164+ domainEncoder .addToVector (theRootDomain , vector );
165+ final String [] words = title .split (" " );
157166 // titleEncoder.setProbes(words.length);
167+
168+ // TODO: use a Java 8 stream with filter and remove the 1 and 2 character words; example: "a", "of", "to"
158169 for (final String word : words ) {
159170 titleEncoder .addToVector (word , vector );
160171 }
161172 return vector ;
162173 }
163174
164- private String extractCategory (int score ) {
175+ private String extractCategory (final int score ) {
165176 return (score < MIN_SCORE ) ? "BAD" : "GOOD" ;
166177 }
167178
0 commit comments