Customer Churn Prediction Analysis in R

  1. Introduction
    1. About Dataset
  2. Analysis in R Studio
    1. Install and Load Package
    2. Load Data
    3. Data Preprocessing
      1. Check data
      2. Change column names to lowercase and Check missing value
      3. Create a new data frame without the customerid and techsupport columns and remove all rows containing “None” values
      4. Convert character data type to factor data type
    4. Exploratory Data Analysis
      1. Create the individual plots by ggplot
      2. Arrange the plots in a grid
    5. Build Machine Learning Model
      1. Split Data
      2. Logistic Regression with K-Fold CV
        1. Train Model
        2. Test Model
      3. Decision Tree with Repeated K-Fold CV
        1. Train Model
        2. Test Model
      4. Random Forest with Repeated K-Fold CV
        1. Train Model
        2. Test Model
    6. Conclusion Accuracy

Introduction

Customer Churn Prediction in the Telecom Industry by Kaggle

About Dataset

This dataset provides a comprehensive view of customer behavior and churn in the telecom industry. It includes detailed information on customer demographics, service usage, and various indicators that are critical for analyzing customer retention and churn.

  • CustomerID : Unique identifier for each customer.
  • Age : Age of the customer, reflecting their demographic profile.
  • Gender : Gender of the customer (Male or Female).
  • Tenure : Duration (in months) the customer has been with the service provider.
  • MonthlyCharges : The monthly fee charged to the customer.
  • ContractType : Type of contract the customer is on (Month-to-Month, One-Year, Two-Year).
  • InternetService: Type of internet service subscribed to (DSL, Fiber Optic, None).
  • TechSupport : Whether the customer has tech support (Yes or No).
  • TotalCharges : Total amount charged to the customer (calculated as MonthlyCharges * Tenure).
  • Churn : Target variable indicating whether the customer has churned (Yes or No).

Analysis in R Studio

Install and Load Package

# Packages used throughout the analysis.
# NOTE: the original installed tidyverse/caret/gridExtra but attached glue
# without ever installing it; glue is now included in the package list.
pkgs <- c("tidyverse", "caret", "gridExtra", "glue")

# Install only the packages that are missing, so re-running the script
# does not reinstall everything from scratch.
missing <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing) > 0) {
  install.packages(missing)
}

# Attach all packages
for (pkg in pkgs) {
  library(pkg, character.only = TRUE)
}

Load Data

df <- read_csv("customer_churn_data.csv")

Data Preprocessing

Check data

# Quick structural overview, row previews, column names, and summary stats
df %>% glimpse()
df %>% head()
df %>% tail()
df %>% names()
df %>% summary()

Change column names to lowercase and Check missing value

# Normalize all column names to lowercase for easier reference downstream
colnames(df) <- tolower(colnames(df))
colnames(df)

# Total number of missing values across the entire data frame
sum(is.na(df))

Create a new data frame without the customerid and techsupport columns and remove all rows containing “None” values

# Distribution of internetservice BEFORE cleaning
# (count() replaces the redundant select + group_by + count chain)
df %>% count(internetservice)

# Drop two columns by position and remove rows with no internet service.
# NOTE(review): per the data dictionary above, position 1 is customerid and
# position 9 is totalcharges — NOT techsupport as the section heading says.
# Confirm which column is actually intended before renaming by position;
# the positional selection is kept unchanged here to preserve behavior.
df <- df %>%
  select(-1, -9) %>%
  filter(internetservice != "None")

# Confirm the "None" rows are gone
df %>% count(internetservice)

Convert character data type to factor data type

# Convert every character column to a factor so the modeling functions
# (glm/rpart/randomForest via caret) treat them as categorical predictors.
# across(where(...)) replaces the superseded mutate_if().
df <- df %>%
  mutate(across(where(is.character), as.factor))
glimpse(df)

Exploratory Data Analysis

Create the individual plots by ggplot

# Helper: dodged bar chart of a categorical column, filled by churn status.
# {{ }} (tidy evaluation) forwards the bare column name into aes().
churn_bar <- function(column, title) {
  ggplot(df, aes({{ column }}, fill = churn)) +
    geom_bar(position = position_dodge()) +
    theme_minimal() +
    labs(title = title) +
    theme(plot.title = element_text(hjust = 0.5))
}

p1 <- churn_bar(gender, "Gender by Churn")
p2 <- churn_bar(contracttype, "Contract Type by Churn")
p3 <- churn_bar(internetservice, "Internet Service by Churn")

# Boxplot: monthly charges split by churn status
p4 <- ggplot(df, aes(churn, monthlycharges, fill = churn)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Monthly Charges vs. Churn") +
  theme(plot.title = element_text(hjust = 0.5))

Arrange the plots in a grid

grid.arrange(p1, p2, p3, p4, ncol = 2)

Build Machine Learning Model

Split Data

Split the data into 80% training and 20% test sets.

# Reproducible 80/20 train/test split
set.seed(42)
n <- nrow(df)
# floor() makes the sample size an explicit integer instead of relying on
# sample()'s implicit truncation of 0.8 * n; seq_len(n) is safe even if
# n were 0 (1:n would yield c(1, 0)).
id <- sample(seq_len(n), size = floor(0.8 * n), replace = FALSE)
train_data <- df[id, ]
test_data <- df[-id, ]

Logistic Regression with K-Fold CV

Train Model
# Logistic regression with 5-fold cross-validation
set.seed(42)
cv_ctrl <- trainControl(method = "cv", number = 5)
logit_model <- train(
  churn ~ .,
  data = train_data,
  method = "glm",
  trControl = cv_ctrl
)
# Cross-validated accuracy expressed as a percentage
accuracy_logit_train <- 100 * round(logit_model$results$Accuracy, 4)
accuracy_logit_train_p <- glue("{accuracy_logit_train}%")
Test Model
# Predict churn on the hold-out set
p_logit <- predict(logit_model, newdata = test_data)

# Precision/recall confusion matrix, treating "Yes" as the positive class
confusionMatrix(p_logit, test_data$churn, positive = "Yes", mode = "prec_recall")

# Hold-out accuracy as a percentage string
accuracy_logit <- 100 * round(mean(p_logit == test_data$churn), 4)
accuracy_logit_p <- glue("{accuracy_logit}%")

Decision Tree with Repeated K-Fold CV

Train Model
# Decision tree with 5-fold cross-validation repeated 5 times
set.seed(42)
repeated_cv <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 5,
  verboseIter = TRUE
)
tree_model <- train(
  churn ~ .,
  data = train_data,
  method = "rpart",
  trControl = repeated_cv
)
# Best CV accuracy across the tuned complexity parameters, as a percentage
accuracy_tree_train <- 100 * round(max(tree_model$results$Accuracy), 4)
accuracy_tree_train_p <- glue("{accuracy_tree_train}%")
Test Model
# Predict churn on the hold-out set
p_tree <- predict(tree_model, newdata = test_data)

# Precision/recall confusion matrix, treating "Yes" as the positive class
confusionMatrix(p_tree, test_data$churn, positive = "Yes", mode = "prec_recall")

# Hold-out accuracy as a percentage string
accuracy_tree <- 100 * round(mean(p_tree == test_data$churn), 4)
accuracy_tree_p <- glue("{accuracy_tree}%")

Random Forest with Repeated K-Fold CV

Train Model
# Random forest with 5-fold cross-validation repeated 5 times
set.seed(42)
repeated_cv <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 5,
  verboseIter = TRUE
)
rf_model <- train(
  churn ~ .,
  data = train_data,
  method = "rf",
  trControl = repeated_cv
)
# Best CV accuracy across the tuned mtry values, as a percentage
accuracy_rf_train <- 100 * round(max(rf_model$results$Accuracy), 4)
accuracy_rf_train_p <- glue("{accuracy_rf_train}%")
Test Model
# Predict churn on the hold-out set
p_rf <- predict(rf_model, newdata = test_data)

# Precision/recall confusion matrix, treating "Yes" as the positive class
confusionMatrix(p_rf, test_data$churn, positive = "Yes", mode = "prec_recall")

# Hold-out accuracy as a percentage string
accuracy_rf <- 100 * round(mean(p_rf == test_data$churn), 4)
accuracy_rf_p <- glue("{accuracy_rf}%")

Conclusion Accuracy

# Side-by-side comparison of train vs. test accuracy for all three models
model_names <- c("Logistic Regression", "Decision Tree", "Random Forest")
train_acc <- c(accuracy_logit_train_p, accuracy_tree_train_p, accuracy_rf_train_p)
test_acc <- c(accuracy_logit_p, accuracy_tree_p, accuracy_rf_p)

conclusion <- data.frame(
  model = model_names,
  train_accuracy = train_acc,
  test_accuracy = test_acc
)

Comments

Leave a comment