Commit f3c5d9c

Merge pull request dmlc#675 from pommedeterresautee/master
Generate new features based on tree leaves
2 parents: 162e91c + c1b2d9c

File tree: 6 files changed, +188 −9 lines changed

R-package/NAMESPACE

Lines changed: 2 additions & 0 deletions

@@ -5,6 +5,7 @@ export(setinfo)
 export(slice)
 export(xgb.DMatrix)
 export(xgb.DMatrix.save)
+export(xgb.create.features)
 export(xgb.cv)
 export(xgb.dump)
 export(xgb.importance)
@@ -25,6 +26,7 @@ importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(Matrix,dgeMatrix)
 importFrom(Matrix,cBind)
 importFrom(Matrix,colSums)
+importFrom(Matrix,sparse.model.matrix)
 importFrom(Matrix,sparseVector)
 importFrom(data.table,":=")
 importFrom(data.table,as.data.table)

R-package/R/xgb.create.features.R

Lines changed: 91 additions & 0 deletions

@@ -0,0 +1,91 @@
+#' Create new features from a previously learned model
+#'
+#' May improve learning by adding new features to the training data, derived from the decision trees of a previously learned model.
+#'
+#' @importFrom magrittr %>%
+#' @importFrom Matrix cBind
+#' @importFrom Matrix sparse.model.matrix
+#'
+#' @param model decision tree boosting model learned on the original data
+#' @param training.data original data (usually provided as a \code{dgCMatrix} matrix)
+#'
+#' @return \code{dgCMatrix} matrix including both the original data and the new features.
+#'
+#' @details
+#' This function is inspired by Section 3.1 of the paper:
+#'
+#' \strong{"Practical Lessons from Predicting Clicks on Ads at Facebook"}
+#'
+#' \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yanxin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
+#' Joaquin Quiñonero Candela)}
+#'
+#' International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014
+#'
+#' \url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
+#'
+#' Excerpt explaining the method:
+#'
+#' "\emph{We found that boosted decision trees are a powerful and very
+#' convenient way to implement non-linear and tuple transformations
+#' of the kind we just described. We treat each individual
+#' tree as a categorical feature that takes as value the
+#' index of the leaf an instance ends up falling in. We use
+#' 1-of-K coding of this type of features.
+#'
+#' For example, consider the boosted tree model in Figure 1 with 2 subtrees,
+#' where the first subtree has 3 leafs and the second 2 leafs. If an
+#' instance ends up in leaf 2 in the first subtree and leaf 1 in
+#' second subtree, the overall input to the linear classifier will
+#' be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries
+#' correspond to the leaves of the first subtree and last 2 to
+#' those of the second subtree.
+#'
+#' [...]
+#'
+#' We can understand boosted decision tree
+#' based transformation as a supervised feature encoding that
+#' converts a real-valued vector into a compact binary-valued
+#' vector. A traversal from root node to a leaf node represents
+#' a rule on certain features.}"
+#'
+#' @examples
+#' data(agaricus.train, package='xgboost')
+#' data(agaricus.test, package='xgboost')
+#' dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
+#' dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
+#'
+#' param <- list(max.depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
+#' nround <- 4
+#'
+#' bst <- xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
+#'
+#' # Model accuracy without new features
+#' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+#'
+#' # Convert previous features to one-hot encoding
+#' new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
+#' new.features.test <- xgb.create.features(model = bst, agaricus.test$data)
+#'
+#' # Learning with new features
+#' new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
+#' new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
+#' watchlist <- list(train = new.dtrain)
+#' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2, watchlist = watchlist)
+#'
+#' # Model accuracy with new features
+#' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+#'
+#' # Here the accuracy was already good and is now perfect.
+#' cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))
+#'
+#' @export
+xgb.create.features <- function(model, training.data){
+  pred_with_leaf <- predict(model, training.data, predleaf = TRUE)
+  cols <- list()
+  # one column of leaf indices per boosted tree
+  for(i in 1:ncol(pred_with_leaf)){
+    # the observed leaf indices are enough as factor levels for the purpose of adding features
+    leaf.id <- sort(unique(pred_with_leaf[,i]))
+    cols[[i]] <- factor(x = pred_with_leaf[,i], levels = leaf.id)
+  }
+  cBind(training.data, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
+}
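
To make the 1-of-K leaf encoding concrete, here is a minimal standalone sketch (not part of the commit) that reproduces the paper's Figure 1 example with the same Matrix tools the new function relies on: two subtrees, the first with 3 leaves and the second with 2, and an instance falling in leaf 2 of the first subtree and leaf 1 of the second.

library(Matrix)

# Leaf index of a single instance in each subtree (values from the paper's example)
leaf.tree1 <- factor(2, levels = 1:3)  # the first subtree has 3 leaves
leaf.tree2 <- factor(1, levels = 1:2)  # the second subtree has 2 leaves

# "- 1" drops the intercept so every leaf gets its own indicator column;
# one encoded block per tree, concatenated column-wise
encoded <- cBind(sparse.model.matrix(~ leaf.tree1 - 1),
                 sparse.model.matrix(~ leaf.tree2 - 1))
as.matrix(encoded)  # the binary vector [0, 1, 0, 1, 0]

One caveat worth noting: when several factors share a single formula, as in xgb.create.features above, R applies full dummy coding only to the first factor and treatment contrasts to the rest unless contrasts.arg is overridden; encoding each tree in its own formula, as in this sketch, sidesteps that.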

R-package/R/xgb.importance.R

Lines changed: 1 addition & 2 deletions

@@ -1,7 +1,6 @@
 #' Show importance of features in a model
 #'
-#' Read a xgboost model text dump.
-#' Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now).
+#' Create a \code{data.table} of the most important features of a model.
 #'
 #' @importFrom data.table data.table
 #' @importFrom data.table setnames

R-package/demo/predict_leaf_indices.R

Lines changed: 5 additions & 5 deletions

@@ -25,14 +25,14 @@ pred_with_leaf = predict(bst, dtest, predleaf = TRUE)
 head(pred_with_leaf)
 
 create.new.tree.features <- function(model, original.features){
-  pred_with_leaf = predict(model, original.features, predleaf = TRUE)
+  pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
   cols <- list()
   for(i in 1:length(trees)){
     # max is not the real max, but it's not important for the purpose of adding features
-    max <- max(pred_with_leaf[,i])
-    cols[[i]] <- factor(x = pred_with_leaf[,i], level = seq(to = max))
+    leaf.id <- sort(unique(pred_with_leaf[,i]))
+    cols[[i]] <- factor(x = pred_with_leaf[,i], levels = leaf.id)
   }
-  cBind(original.features, sparse.model.matrix( ~ ., as.data.frame(cols)))
+  cBind(original.features, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
 }
 
 # Convert previous features to one hot encoding
@@ -49,4 +49,4 @@ bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread =
 accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
 
 # Here the accuracy was already good and is now perfect.
-print(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!"))
+cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))
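
A note (not from the commit) on why the formula changed from "~ ." to "~ . -1": with an intercept, R's default treatment contrasts drop the first level of every factor, so the first leaf of each tree would lose its indicator column. A minimal sketch on a single three-leaf tree:

library(Matrix)

leaves <- data.frame(tree1 = factor(c(1, 2, 3)))

# With an intercept the first leaf is absorbed into "(Intercept)"
colnames(sparse.model.matrix(~ ., leaves))      # "(Intercept)" "tree12" "tree13"

# "-1" removes the intercept and keeps an indicator column for every leaf
colnames(sparse.model.matrix(~ . - 1, leaves))  # "tree11" "tree12" "tree13"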

R-package/man/xgb.create.features.Rd

Lines changed: 88 additions & 0 deletions
Generated file; diff not rendered by default.

R-package/man/xgb.importance.Rd

Lines changed: 1 addition & 2 deletions
Generated file; diff not rendered by default.
