Skip to content

Commit 6a45931

Browse files
authored
Merge pull request microsoft#101 from ZhouFang928/master
New version of telco customer churn demo
2 parents d9254d0 + b93105d commit 6a45931

21 files changed

+22327
-0
lines changed

samples/features/r-services/Telco Customer Churn v1/Data/edw_cdr.csv

Lines changed: 20469 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
"state","latitude","longitude"
2+
AK,61.3850,-152.2683
3+
AL,32.7990,-86.8073
4+
AR,34.9513,-92.3809
5+
AS,14.2417,-170.7197
6+
AZ,33.7712,-111.3877
7+
CA,36.1700,-119.7462
8+
CO,39.0646,-105.3272
9+
CT,41.5834,-72.7622
10+
DC,38.8964,-77.0262
11+
DE,39.3498,-75.5148
12+
FL,27.8333,-81.7170
13+
GA,32.9866,-83.6487
14+
HI,21.1098,-157.5311
15+
IA,42.0046,-93.2140
16+
ID,44.2394,-114.5103
17+
IL,40.3363,-89.0022
18+
IN,39.8647,-86.2604
19+
KS,38.5111,-96.8005
20+
KY,37.6690,-84.6514
21+
LA,31.1801,-91.8749
22+
MA,42.2373,-71.5314
23+
MD,39.0724,-76.7902
24+
ME,44.6074,-69.3977
25+
MI,43.3504,-84.5603
26+
MN,45.7326,-93.9196
27+
MO,38.4623,-92.3020
28+
MP,14.8058,145.5505
29+
MS,32.7673,-89.6812
30+
MT,46.9048,-110.3261
31+
NC,35.6411,-79.8431
32+
ND,47.5362,-99.7930
33+
NE,41.1289,-98.2883
34+
NH,43.4108,-71.5653
35+
NJ,40.3140,-74.5089
36+
NM,34.8375,-106.2371
37+
NV,38.4199,-117.1219
38+
NY,42.1497,-74.9384
39+
OH,40.3736,-82.7755
40+
OK,35.5376,-96.9247
41+
OR,44.5672,-122.1269
42+
PA,40.5773,-77.2640
43+
PR,18.2766,-66.3350
44+
RI,41.6772,-71.5101
45+
SC,33.8191,-80.9066
46+
SD,44.2853,-99.4632
47+
TN,35.7449,-86.7489
48+
TX,31.1060,-97.6475
49+
UT,40.1135,-111.8535
50+
VA,37.7680,-78.2057
51+
VI,18.0001,-64.8199
52+
VT,44.0407,-72.7093
53+
WA,47.3917,-121.5708
54+
WI,44.2563,-89.6385
55+
WV,38.4680,-80.9696
56+
WY,42.7475,-107.2085
49.4 KB
Loading
42.9 KB
Loading
126 KB
Loading
1.67 MB
Loading
673 KB
Loading
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
**Instructions**
2+
3+
4+
- Run the telcoChurn-main.R to drive the R demo
5+
- Run the telcoChurn-modelComparison.R to compare different algorithms that we tried to build churn models
6+
7+
8+
9+
----------
10+
**Description**
11+
12+
- **telcoChurn-setUp.R** - Setting up relevant R packages
13+
- **telcoChurn-evaluate.R** - Defining pre-functions for model evaluation
14+
- **telcoChurn-dataExploration.R** - Creating a Shiny application to explore and visualize the data
15+
- **telcoChurn-dataPreparation.R** - Defining functions to do data pre-processing and spliting in order to generate suitable training and testing data sets
16+
- **telcoChurn-trainModel.R** - Defining a function to train the telco churn model with rxDForest algorithm
17+
- **telcoChurn-main.R** - Main R file driving the demo execution
18+
- **telcoChurn-modelComparison.R** - R file to build and compare different tree-based classification models, including CRAN R algorithms - randomForest and xgboost, RevoScaleR algorithms – rxDForest and rxBTrees
19+
20+
----------
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
####################################################################################################
2+
## Title: Telco Customer Churn
3+
## Description: Data Exploration and Visualization
4+
## Author: Microsoft
5+
####################################################################################################
6+
7+
library(shiny)
8+
library(leaflet)
9+
library(jsonlite)
10+
library(dplyr)
11+
library(ggplot2)
12+
13+
## Load data from SQL
14+
# cdrDF <- rxImport(inData = cdrSQL)
15+
16+
## Load data from local
17+
cdrFile <- file.path(wd, "Data", "edw_cdr.csv")
18+
cdrDF <- read.csv(file = cdrFile, header = TRUE, sep = ",")
19+
20+
latlonFile <- file.path(wd, "Data", "state_latlon.csv")
21+
latlonDF <- read.csv(file = latlonFile, header = TRUE, sep = ",")
22+
23+
24+
data <- cdrDF %>%
25+
group_by(state) %>%
26+
summarise(complaintsbystate = sum(as.numeric(numberofcomplaints)),
27+
churnbystate = sum(as.numeric(churn))) %>%
28+
mutate(lab = paste0("<center>", "state,", state, ": ", "<br>",
29+
"complaintsbystate,", complaintsbystate, "<br>",
30+
"churnbystate,", churnbystate, "</center>")) %>%
31+
left_join(cdrDF, by = "state") %>%
32+
left_join(latlonDF, by = "state")
33+
34+
ui <- fluidPage(
35+
tags$style(HTML("
36+
@import url('https://fonts.googleapis.com/css?family=Poppins');
37+
38+
body {
39+
40+
font-family: 'Poppins', 'Lucida Grande', Verdana, Lucida, Helvetica, Arial, Calibri, sans-serif;
41+
color: rgb(0,0,0);
42+
background-color: #d2d2d2;
43+
}
44+
")),
45+
46+
titlePanel("Telco Customer Churn"),
47+
48+
# Sidebar with a slider input for number of bins
49+
sidebarLayout(
50+
sidebarPanel(
51+
sliderInput("sc", "Scale size of circles (also redraws map to show only the last added state)",
52+
min = 0.5, max = 5, value = 1, step = 0.1),
53+
p(),
54+
selectInput("state", "Select a state to add to the map",
55+
choices = c("", data$state), selected = "",
56+
size = , selectize = FALSE),
57+
actionButton("clear1", "Clear all states"),
58+
p(),
59+
p("Proportion of customer churn"),
60+
plotOutput("MyPlot1", height = "200px"),
61+
p(),
62+
p("Impact of education on churn"),
63+
plotOutput("MyPlot2", height = "200px"),
64+
p(),
65+
p("Impact of call failure rate on churn"),
66+
plotOutput("MyPlot3", height = "200px"),
67+
h2("About"),
68+
HTML("<p>Created by Fang Zhou with R and Shiny leaflet. R users can download the
69+
cleaned and tidy call detail record data from <a href = 'https://github.com/Microsoft/sql-server-samples/tree/master/samples/features/r-services/Telco%20Customer%20Churn'>
70+
https://github.com/Microsoft/sql-server-samples/tree/master/samples/features/r-services/Telco%20Customer%20Churn</a>.
71+
The latitute and longitute for each USA state can be found from <a href = 'http://dev.maxmind.com/geoip/legacy/codes/state_latlon/'>
72+
http://dev.maxmind.com/geoip/legacy/codes/state_latlon/</a>.")
73+
),
74+
75+
76+
mainPanel(
77+
leafletOutput("MyMap", height = 1000)
78+
79+
)
80+
)
81+
)
82+
83+
server <- function(input, output, session) {
84+
85+
the_data_state <- reactive({
86+
tmp <- data %>%
87+
filter(state == input$state)
88+
89+
if (input$state != "") {
90+
thecol <- data.frame(data)[data$state == input$state, "colour"]
91+
} else {
92+
tmp <- data[1,]
93+
thecol <- NULL
94+
95+
}
96+
97+
return(list(df = tmp, thecol = thecol))
98+
})
99+
100+
output$MyMap <- renderLeaflet({
101+
leaflet() %>%
102+
addProviderTiles("Stamen.Watercolor") %>%
103+
addProviderTiles("Stamen.TonerLabels") %>%
104+
fitBounds(-120, 30, -60, 50)
105+
})
106+
107+
observe({
108+
leafletProxy("MyMap", data = the_data_state()$df) %>%
109+
addCircleMarkers( ~ longitude,
110+
~ latitude,
111+
color = the_data_state()$thecol,
112+
radius = ~churnbystate * 0.1 * input$sc,
113+
popup = ~lab)
114+
})
115+
116+
observe({
117+
x <- input$clear1
118+
updateSelectInput(session, "state", selected = "")
119+
leafletProxy("MyMap") %>% clearMarkers()
120+
})
121+
122+
observe({
123+
x <- input$sc
124+
leafletProxy("MyMap") %>% clearMarkers()
125+
})
126+
127+
128+
output$MyPlot1 <- renderPlot({
129+
cdrDF %>%
130+
ggplot(aes(x = factor(1), fill = factor(churn))) +
131+
geom_bar(width = 1) +
132+
coord_polar(theta = "y") +
133+
theme_minimal()
134+
})
135+
136+
output$MyPlot2 <- renderPlot({
137+
cdrDF %>%
138+
group_by(month, education) %>%
139+
summarize(countofchurn = sum(as.numeric(churn))) %>%
140+
ggplot(aes(x = month, y = countofchurn,
141+
group = education, fill = education)) +
142+
geom_bar(stat = "identity", position = position_dodge()) +
143+
labs(x = "month", y = "Counts of churn") +
144+
theme_minimal()
145+
})
146+
147+
output$MyPlot3 <- renderPlot({
148+
data %>%
149+
group_by(month, callfailurerate) %>%
150+
summarize(countofchurn = sum(as.numeric(churn))) %>%
151+
ggplot(aes(x = month, y = countofchurn,
152+
group = factor(callfailurerate), fill = factor(callfailurerate))) +
153+
geom_bar(stat = "identity", position = position_dodge()) +
154+
labs(x = "month", y = "Counts of churn") +
155+
theme_minimal()
156+
})
157+
}
158+
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
####################################################################################################
2+
## Title: Telco Customer Churn
3+
## Description: Data Preparation
4+
## Author: Microsoft
5+
## Note: Prepare the training and testing data sets by pre-processing and spliting on raw data
6+
####################################################################################################
7+
8+
dataPreparation <- function(sqlSettings, trainTable, testTable) {
9+
sqlConnString <- sqlSettings$connString
10+
11+
## Query necessary columns from the call detail record table
12+
dataVars <- rxGetVarNames(cdrSQL)
13+
dataVars <- dataVars[!dataVars %in% c("year", "month")]
14+
dataVars <- paste(dataVars, collapse = ", ")
15+
dataQuery <- paste("select", dataVars, "from", inputTable)
16+
17+
## Create sql server data sources
18+
inputDataSQL = RxSqlServerData(sqlQuery = dataQuery,
19+
connectionString = sqlConnString,
20+
colInfo = cdrColInfo)
21+
trainDataSQL <- RxSqlServerData(connectionString = sqlConnString,
22+
table = trainTable,
23+
colInfo = cdrColInfo)
24+
testDataSQL <- RxSqlServerData(connectionString = sqlConnString,
25+
table = testTable,
26+
colInfo = cdrColInfo)
27+
28+
## Data pre-processing: cleaning and splitting followed by SMOTE
29+
rxExec(preProcess, inData = inputDataSQL, outData1 = trainDataSQL, outData2 = testDataSQL)
30+
}
31+
32+
preProcess <- function(inData, outData1, outData2) {
33+
## Clean missing data
34+
## Remove duplicate rows
35+
cdrDF <- rxDataStep(inData = inData,
36+
removeMissings = TRUE,
37+
overwrite = TRUE)
38+
cdrDF <- cdrDF[!duplicated(cdrDF),]
39+
40+
## Split data
41+
set.seed(1234)
42+
splitFile <- rxSplit(inData = cdrDF,
43+
outFilesBase = "trainTestData",
44+
splitByFactor = "ind",
45+
transforms = list(ind = factor(sample(0:1, size = .rxNumRows, replace = TRUE, prob = c(0.3, 0.7)),
46+
levels = 0:1,
47+
labels = c("Test", "Train"))),
48+
overwrite = TRUE)
49+
trainFile <- splitFile[[2]]
50+
testFile <- splitFile[[1]]
51+
52+
## SMOTE on training data
53+
trainDF <- rxDataStep(inData = trainFile, varsToDrop = c("ind"))
54+
testDF <- rxDataStep(inData = testFile, varsToDrop = c("ind"))
55+
56+
library(unbalanced)
57+
trainVars <- names(trainDF)
58+
trainVarsInd <- trainVars %in% c("churn")
59+
smotetrain <- ubSMOTE(X = trainDF[!trainVarsInd], Y = trainDF$churn,
60+
perc.over = 200, perc.under = 500,
61+
k = 3, verbose = TRUE)
62+
smotetrainDF <- cbind(smotetrain$X, smotetrain$Y)
63+
names(smotetrainDF)[names(smotetrainDF) == "smotetrain$Y"] <- "churn"
64+
trainDF <- smotetrainDF
65+
66+
## Load final training data and testing data into SQL
67+
rxDataStep(inData = trainDF, outFile = outData1, overwrite = TRUE)
68+
rxDataStep(inData = testDF, outFile = outData2, overwrite = TRUE)
69+
}

0 commit comments

Comments
 (0)