@@ -8,58 +8,105 @@ Linear and quadratic discriminant analysis

Linear discriminant analysis (:class:`lda.LDA`) and
quadratic discriminant analysis (:class:`qda.QDA`)
-are two classic classifiers, with, as their names suggest, a linear and a
+are two standard classifiers, with, as their names suggest, a linear and a
quadratic decision surface, respectively.

These classifiers are attractive because they have closed-form solutions that
-can be easily computed, are inherently multiclass,
-and have proven to work well in practice.
-Also there are no parameters to tune for these algorithms.
+can be easily computed, are inherently multiclass, have proven to work well in
+practice, and have no hyperparameters to tune.

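+As a minimal usage sketch (assuming the ``sklearn.lda`` and ``sklearn.qda``
+module paths behind the class references above), both classifiers follow the
+usual ``fit``/``predict`` estimator API::
+
+    >>> import numpy as np
+    >>> from sklearn.lda import LDA
+    >>> from sklearn.qda import QDA
+    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
+    >>> y = np.array([1, 1, 1, 2, 2, 2])
+    >>> print(LDA().fit(X, y).predict([[-0.8, -1]]))
+    [1]
+    >>> print(QDA().fit(X, y).predict([[-0.8, -1]]))
+    [1]
+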
.. |ldaqda| image:: ../auto_examples/classification/images/plot_lda_qda_001.png
    :target: ../auto_examples/classification/plot_lda_qda.html
    :scale: 80

.. centered:: |ldaqda|

-The plot shows decision boundaries for LDA and QDA. The bottom row
-demonstrates that LDA can only learn linear boundaries, while QDA can learn
+The plot shows decision boundaries for LDA and QDA. The first row shows that,
+when the class covariances are the same, LDA and QDA yield the same result
+(up to a small difference resulting from the implementation). The bottom row
+demonstrates that, in general, LDA can only learn linear boundaries, while
+QDA can learn
quadratic boundaries and is therefore more flexible.

.. topic:: Examples:

    :ref:`example_classification_plot_lda_qda.py`: Comparison of LDA and QDA on synthetic data.

-
Dimensionality reduction using LDA
==================================

-:class:`lda.LDA` can be used to perform supervised dimensionality reduction by
-projecting the input data to a subspace consisting of the most
-discriminant directions.
+:class:`lda.LDA` can be used to perform supervised dimensionality reduction, by
+projecting the input data to a linear subspace consisting of the directions
+which maximize the separation between classes (in a precise sense discussed in
+the mathematics section below). The dimension of the output is necessarily
+less than the number of classes, so this is in general a rather strong
+dimensionality reduction, and it only makes sense in a multiclass setting.
+
This is implemented in :func:`lda.LDA.transform`. The desired
dimensionality can be set using the ``n_components`` constructor
parameter. This parameter has no influence on :func:`lda.LDA.fit` or :func:`lda.LDA.predict`.

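+As a small sketch (assuming the ``sklearn.lda`` module path used above and the
+Iris data bundled with scikit-learn, which has three classes), the reduced
+representation is obtained with::
+
+    >>> from sklearn.datasets import load_iris
+    >>> from sklearn.lda import LDA
+    >>> iris = load_iris()
+    >>> # three classes, so at most two discriminant directions are kept
+    >>> lda = LDA(n_components=2).fit(iris.data, iris.target)
+    >>> lda.transform(iris.data).shape
+    (150, 2)
+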
+.. topic:: Examples:
+
+    :ref:`example_decomposition_plot_pca_vs_lda.py`: Comparison of LDA and PCA
+    for dimensionality reduction of the Iris dataset
+
+Mathematical formulation of the LDA and QDA classifiers
+=======================================================
+
+Both LDA and QDA can be derived from simple probabilistic models
+which model the class conditional distribution of the data :math:`P(X|y=k)`
+for each class :math:`k`. Predictions can then be obtained by using Bayes' rule:
+
+.. math::
+    P(y=k | X) = \frac{P(X | y=k) P(y=k)}{P(X)} = \frac{P(X | y=k) P(y=k)}{\sum_{l} P(X | y=l) \cdot P(y=l)}
+
+and we select the class :math:`k` which maximizes this conditional probability.
+
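+Equivalently, since the denominator :math:`P(X)` does not depend on :math:`k`,
+this selection can be written as an argmax over the numerators alone:
+
+.. math::
+    \hat{y} = \arg\max_k P(y=k | X) = \arg\max_k P(X | y=k) P(y=k)
+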
+More specifically, for linear and quadratic discriminant analysis,
+:math:`P(X|y)` is modelled as a multivariate Gaussian distribution with density:

-Mathematical Idea
-=================
+.. math:: p(X | y=k) = \frac{1}{(2\pi)^{n/2} |\Sigma_k|^{1/2}} \exp\left(-\frac{1}{2} (X-\mu_k)^t \Sigma_k^{-1} (X-\mu_k)\right)

-Both methods work by modeling the class conditional distribution of the data :math:`P(X|y=k)`
-for each class :math:`k`. Predictions can be obtained by using Bayes' rule:
+To use this model as a classifier, we just need to estimate from the training
+data the class priors :math:`P(y=k)` (by the proportion of instances of class
+:math:`k`), the class means :math:`\mu_k` (by the empirical sample class means)
+and the covariance matrices (either by the empirical sample class covariance
+matrices, or by a regularized estimator: see the section on shrinkage below).
+
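+These estimates are exposed on the fitted estimator. A small sketch (assuming
+the ``priors_`` and ``means_`` attributes of :class:`lda.LDA` and the Iris
+data)::
+
+    >>> from sklearn.datasets import load_iris
+    >>> from sklearn.lda import LDA
+    >>> iris = load_iris()
+    >>> clf = LDA().fit(iris.data, iris.target)
+    >>> clf.priors_.shape   # estimated class priors P(y=k)
+    (3,)
+    >>> clf.means_.shape    # estimated class means mu_k, one row per class
+    (3, 4)
+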
+In the case of LDA, the Gaussians for each class are assumed to share the same
+covariance matrix: :math:`\Sigma_k = \Sigma` for all :math:`k`. This leads to
+linear decision surfaces between classes, as can be seen by comparing the
+log-probability ratios :math:`\log[P(y=k | X) / P(y=l | X)]`:

.. math::
-    P(y | X) = P(X | y) \cdot P(y) / P(X) = P(X | y) \cdot P(Y) / ( \sum_{y'} P(X | y') \cdot p(y'))
+    \log\left(\frac{P(y=k|X)}{P(y=l | X)}\right) = 0 \Leftrightarrow (\mu_k-\mu_l)^t \Sigma^{-1} X = \frac{1}{2} (\mu_k^t \Sigma^{-1} \mu_k - \mu_l^t \Sigma^{-1} \mu_l)
+
+In the case of QDA, there are no assumptions on the covariance matrices
+:math:`\Sigma_k` of the Gaussians, leading to quadratic decision surfaces.
+See [#1]_ for more details.
+
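+The following is a minimal NumPy sketch of this decision rule (an illustration
+only, not the actual :class:`lda.LDA`/:class:`qda.QDA` implementation): given
+estimated class means, covariance matrices and priors, it scores each class by
+:math:`\log P(X|y=k) + \log P(y=k)` and picks the largest::
+
+    import numpy as np
+
+    def gaussian_log_density(X, mean, cov):
+        # log of the multivariate Gaussian density modelling P(X | y=k)
+        d = mean.shape[0]
+        diff = X - mean
+        cov_inv = np.linalg.inv(cov)
+        _, logdet = np.linalg.slogdet(cov)
+        quad = np.einsum('ij,jk,ik->i', diff, cov_inv, diff)
+        return -0.5 * (d * np.log(2 * np.pi) + logdet + quad)
+
+    def discriminant_predict(X, means, covs, priors):
+        # Bayes' rule: pick the class k maximizing log P(X|y=k) + log P(y=k).
+        # With one covariance per class this is the QDA rule; with a single
+        # shared covariance matrix it reduces to the LDA rule.
+        scores = np.column_stack([
+            gaussian_log_density(X, m, c) + np.log(p)
+            for m, c, p in zip(means, covs, priors)])
+        return scores.argmax(axis=1)
+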
+.. note:: **Relation with Gaussian Naive Bayes**
+
+    If in the QDA model one assumes that the covariance matrices are diagonal,
+    then the inputs are assumed to be conditionally independent in each class,
+    and the resulting classifier is equivalent to the Gaussian Naive Bayes
+    classifier :class:`GaussianNB`.
+
+Mathematical formulation of LDA dimensionality reduction
+========================================================
+
+To understand the use of LDA in dimensionality reduction, it is useful to start
+with a geometric reformulation of the LDA classification rule explained above.
+We write :math:`K` for the total number of target classes. Since in LDA we
+assume that all classes have the same estimated covariance :math:`\Sigma`, we
+can rescale the data so that this covariance is the identity:

-In linear and quadratic discriminant analysis, :math:`P(X|y)`
-is modelled as a Gaussian distribution.
-In the case of LDA, the Gaussians for each class are assumed to share the same covariance matrix.
-This leads to a linear decision surface, as can be seen by comparing the the log-probability rations
-:math:`log[P(y=k | X) / P(y=l | X)]`.
+.. math:: X^* = D^{-1/2}U^t X\text{ with }\Sigma = UDU^t

-In the case of QDA, there are no assumptions on the covariance matrices of the Gaussians,
-leading to a quadratic decision surface.
+Then one can show that classifying a data point after scaling is equivalent to
+finding the estimated class mean :math:`\mu^*_k` which is closest to the data
+point in the Euclidean distance. But this can be done just as well after
+projecting on the :math:`K-1` dimensional affine subspace :math:`H_K` generated
+by all the :math:`\mu^*_k`. This shows that, implicit in the LDA classifier,
+there is a dimensionality reduction by linear projection onto a :math:`K-1`
+dimensional space.

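+A minimal NumPy sketch of this rescaling and nearest-mean rule (an illustration
+under the equal-priors assumption made implicitly above, not the actual
+:class:`lda.LDA` implementation; ``Sigma`` and ``means`` stand for the shared
+covariance estimate and the estimated class means)::
+
+    import numpy as np
+
+    def whiten(Z, Sigma):
+        # X* = D^{-1/2} U^t X  with  Sigma = U D U^t (rows of Z are samples)
+        eigvals, U = np.linalg.eigh(Sigma)
+        return Z.dot(U) / np.sqrt(eigvals)
+
+    def nearest_mean_predict(X, Sigma, means):
+        # after rescaling, assign each sample to the closest transformed
+        # class mean mu*_k in Euclidean distance
+        Xs, Ms = whiten(X, Sigma), whiten(means, Sigma)
+        d2 = ((Xs[:, None, :] - Ms[None, :, :]) ** 2).sum(axis=-1)
+        return d2.argmin(axis=1)
+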
+We can reduce the dimension even further, to a chosen :math:`L`, by projecting
+onto the linear subspace :math:`H_L` which maximizes the variance of the
+:math:`\mu^*_k` after projection (in effect, we are doing a form of PCA on the
+transformed class means :math:`\mu^*_k`). This :math:`L` corresponds to the
+``n_components`` parameter used in the :func:`lda.LDA.transform` method. See
+[#1]_ for more details.

Shrinkage
=========
@@ -70,7 +117,7 @@ features. In this scenario, the empirical sample covariance is a poor
estimator. Shrinkage LDA can be used by setting the ``shrinkage`` parameter of
the :class:`lda.LDA` class to 'auto'. This automatically determines the
optimal shrinkage parameter in an analytic way following the lemma introduced
-by Ledoit and Wolf. Note that currently shrinkage only works when setting the
+by Ledoit and Wolf [#2]_. Note that currently shrinkage only works when setting the
``solver`` parameter to 'lsqr' or 'eigen'.

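+A minimal sketch (assuming the ``sklearn.lda`` module path used above)::
+
+    >>> from sklearn.lda import LDA
+    >>> clf = LDA(solver='lsqr', shrinkage='auto')  # Ledoit-Wolf estimate
+    >>> clf = LDA(solver='lsqr', shrinkage=0.5)     # manually chosen amount
+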
The ``shrinkage`` parameter can also be manually set between 0 and 1. In
@@ -111,7 +158,8 @@ a high number of features.

.. topic:: References:

-    Hastie T, Tibshirani R, Friedman J. The Elements of Statistical Learning. Springer, 2009.
+    .. [#1] "The Elements of Statistical Learning", Hastie T., Tibshirani R.,
+        Friedman J., Section 4.3, p.106-119, 2008.

-    Ledoit O, Wolf M. Honey, I Shrunk the Sample Covariance Matrix. The Journal of Portfolio
+    .. [#2] Ledoit O., Wolf M. "Honey, I Shrunk the Sample Covariance Matrix", The Journal of Portfolio
        Management 30(4), 110-119, 2004.