Customer Churn Prediction Analysis in R

  1. Introduction
    1. About Dataset
  2. Analysis in R Studio
    1. Install and Load Package
    2. Load Data
    3. Data Preprocessing
      1. Check data
      2. Change column names to lowercase and Check missing value
      3. Create a new data frame without the customerid and techsupport columns and remove all rows containing “None” values
      4. Convert character data type to factor data type
    4. Exploratory Data Analysis
      1. Create the individual plots by ggplot
      2. Arrange the plots in a grid
    5. Build Machine Learning Model
      1. Split Data
      2. Logistic Regression with K-Fold CV
        1. Train Model
        2. Test Model
      3. Decision Tree with Repeated K-Fold CV
        1. Train Model
        2. Test Model
      4. Random Forest with Repeated K-Fold CV
        1. Train Model
        2. Test Model
    6. Conclusion Accuracy

Introduction

Customer Churn Prediction in the Telecom Industry by Kaggle

About Dataset

This dataset provides a comprehensive view of customer behavior and churn in the telecom industry. It includes detailed information on customer demographics, service usage, and various indicators that are critical for analyzing customer retention and churn.

  • CustomerID : Unique identifier for each customer.
  • Age : Age of the customer, reflecting their demographic profile.
  • Gender : Gender of the customer (Male or Female).
  • Tenure : Duration (in months) the customer has been with the service provider.
  • MonthlyCharges : The monthly fee charged to the customer.
  • ContractType : Type of contract the customer is on (Month-to-Month, One-Year, Two-Year).
  • InternetService: Type of internet service subscribed to (DSL, Fiber Optic, None).
  • TechSupport : Whether the customer has tech support (Yes or No).
  • TotalCharges : Total amount charged to the customer (calculated as MonthlyCharges * Tenure).
  • Churn : Target variable indicating whether the customer has churned (Yes or No).

Analysis in R Studio

Install and Load Package

# Packages used throughout the analysis.
# NOTE: the original installed tidyverse/caret/gridExtra but attached glue
# without ever installing it; glue is now included in the package list.
pkgs <- c("tidyverse", "caret", "gridExtra", "glue")

# Install only the packages that are missing, so re-running the script
# does not reinstall everything from scratch.
missing <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing) > 0) {
  install.packages(missing)
}

# Attach all packages
for (pkg in pkgs) {
  library(pkg, character.only = TRUE)
}

Load Data

df <- read_csv("customer_churn_data.csv")

Data Preprocessing

Check data

# Quick structural overview, row previews, column names, and summary stats
df %>% glimpse()
df %>% head()
df %>% tail()
df %>% names()
df %>% summary()

Change column names to lowercase and Check missing value

# Normalize all column names to lowercase for easier reference downstream
colnames(df) <- tolower(colnames(df))
colnames(df)

# Total number of missing values across the entire data frame
sum(is.na(df))

Create a new data frame without the customerid and techsupport columns and remove all rows containing “None” values

# Distribution of internetservice BEFORE cleaning
# (count() replaces the redundant select + group_by + count chain)
df %>% count(internetservice)

# Drop two columns by position and remove rows with no internet service.
# NOTE(review): per the data dictionary above, position 1 is customerid and
# position 9 is totalcharges — NOT techsupport as the section heading says.
# Confirm which column is actually intended before renaming by position;
# the positional selection is kept unchanged here to preserve behavior.
df <- df %>%
  select(-1, -9) %>%
  filter(internetservice != "None")

# Confirm the "None" rows are gone
df %>% count(internetservice)

Convert character data type to factor data type

# Convert every character column to a factor so the modeling functions
# (glm/rpart/randomForest via caret) treat them as categorical predictors.
# across(where(...)) replaces the superseded mutate_if().
df <- df %>%
  mutate(across(where(is.character), as.factor))
glimpse(df)

Exploratory Data Analysis

Create the individual plots by ggplot

# Helper: dodged bar chart of a categorical column, filled by churn status.
# {{ }} (tidy evaluation) forwards the bare column name into aes().
churn_bar <- function(column, title) {
  ggplot(df, aes({{ column }}, fill = churn)) +
    geom_bar(position = position_dodge()) +
    theme_minimal() +
    labs(title = title) +
    theme(plot.title = element_text(hjust = 0.5))
}

p1 <- churn_bar(gender, "Gender by Churn")
p2 <- churn_bar(contracttype, "Contract Type by Churn")
p3 <- churn_bar(internetservice, "Internet Service by Churn")

# Boxplot: monthly charges split by churn status
p4 <- ggplot(df, aes(churn, monthlycharges, fill = churn)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Monthly Charges vs. Churn") +
  theme(plot.title = element_text(hjust = 0.5))

Arrange the plots in a grid

grid.arrange(p1, p2, p3, p4, ncol = 2)

Build Machine Learning Model

Split Data

Split the data into 80% training and 20% test sets.

# Reproducible 80/20 train/test split
set.seed(42)
n <- nrow(df)
# floor() makes the sample size an explicit integer instead of relying on
# sample()'s implicit truncation of 0.8 * n; seq_len(n) is safe even if
# n were 0 (1:n would yield c(1, 0)).
id <- sample(seq_len(n), size = floor(0.8 * n), replace = FALSE)
train_data <- df[id, ]
test_data <- df[-id, ]

Logistic Regression with K-Fold CV

Train Model
# Logistic regression with 5-fold cross-validation
set.seed(42)
cv_ctrl <- trainControl(method = "cv", number = 5)
logit_model <- train(
  churn ~ .,
  data = train_data,
  method = "glm",
  trControl = cv_ctrl
)
# Cross-validated accuracy expressed as a percentage
accuracy_logit_train <- 100 * round(logit_model$results$Accuracy, 4)
accuracy_logit_train_p <- glue("{accuracy_logit_train}%")
Test Model
# Predict churn on the hold-out set
p_logit <- predict(logit_model, newdata = test_data)

# Precision/recall confusion matrix, treating "Yes" as the positive class
confusionMatrix(p_logit, test_data$churn, positive = "Yes", mode = "prec_recall")

# Hold-out accuracy as a percentage string
accuracy_logit <- 100 * round(mean(p_logit == test_data$churn), 4)
accuracy_logit_p <- glue("{accuracy_logit}%")

Decision Tree with Repeated K-Fold CV

Train Model
# Decision tree with 5-fold cross-validation repeated 5 times
set.seed(42)
repeated_cv <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 5,
  verboseIter = TRUE
)
tree_model <- train(
  churn ~ .,
  data = train_data,
  method = "rpart",
  trControl = repeated_cv
)
# Best CV accuracy across the tuned complexity parameters, as a percentage
accuracy_tree_train <- 100 * round(max(tree_model$results$Accuracy), 4)
accuracy_tree_train_p <- glue("{accuracy_tree_train}%")
Test Model
# Predict churn on the hold-out set
p_tree <- predict(tree_model, newdata = test_data)

# Precision/recall confusion matrix, treating "Yes" as the positive class
confusionMatrix(p_tree, test_data$churn, positive = "Yes", mode = "prec_recall")

# Hold-out accuracy as a percentage string
accuracy_tree <- 100 * round(mean(p_tree == test_data$churn), 4)
accuracy_tree_p <- glue("{accuracy_tree}%")

Random Forest with Repeated K-Fold CV

Train Model
# Random forest with 5-fold cross-validation repeated 5 times
set.seed(42)
repeated_cv <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 5,
  verboseIter = TRUE
)
rf_model <- train(
  churn ~ .,
  data = train_data,
  method = "rf",
  trControl = repeated_cv
)
# Best CV accuracy across the tuned mtry values, as a percentage
accuracy_rf_train <- 100 * round(max(rf_model$results$Accuracy), 4)
accuracy_rf_train_p <- glue("{accuracy_rf_train}%")
Test Model
# Predict churn on the hold-out set
p_rf <- predict(rf_model, newdata = test_data)

# Precision/recall confusion matrix, treating "Yes" as the positive class
confusionMatrix(p_rf, test_data$churn, positive = "Yes", mode = "prec_recall")

# Hold-out accuracy as a percentage string
accuracy_rf <- 100 * round(mean(p_rf == test_data$churn), 4)
accuracy_rf_p <- glue("{accuracy_rf}%")

Conclusion Accuracy

# Side-by-side comparison of train vs. test accuracy for all three models
model_names <- c("Logistic Regression", "Decision Tree", "Random Forest")
train_acc <- c(accuracy_logit_train_p, accuracy_tree_train_p, accuracy_rf_train_p)
test_acc <- c(accuracy_logit_p, accuracy_tree_p, accuracy_rf_p)

conclusion <- data.frame(
  model = model_names,
  train_accuracy = train_acc,
  test_accuracy = test_acc
)

Comments

Leave a comment