Student Performance Analysis and Prediction in R

  1. Introduction
    1. About Dataset
  2. Analysis in R Studio
    1. Install and Load Packages
    2. Load Data
    3. Data Preprocessing
      1. Change column names to lowercase and check missing values
      2. Delete Missing Values
      3. Select only columns that are NOT character data type
      4. Convert character data type to factor data type
    4. Exploratory Data Analysis
      1. Create the individual plots with ggplot
      2. Arrange the plots in a grid
      3. Correlation Matrix
    5. Build Machine Learning Model
      1. Split Data
      2. Linear regression with K-Fold CV
        1. Train Model
        2. Test Model
      3. KNN with K-Fold CV
        1. Train Model
        2. Test Model
      4. Random Forest with K-Fold CV
        1. Train Model
        2. Test Model
    6. Conclusion: RMSE Comparison

    Introduction

    Student Performance Factors dataset, available on Kaggle

    About Dataset

    This dataset provides a comprehensive overview of various factors affecting student performance in exams. It includes information on study habits, attendance, parental involvement, and other aspects influencing academic success.

    • Hours_Studied: Number of hours spent studying per week.
    • Sleep_Hours: Average number of hours of sleep per night.
    • Previous_Scores: Scores from previous exams.
    • Family_Income: Family income level (Low, Medium, High).
    • Parental_Education_Level: Highest education level of parents (High School, College, Postgraduate).
    • Distance_from_Home: Distance from home to school (Near, Moderate, Far).
    • Peer_Influence: Influence of peers on academic performance.
    • Gender: Gender of the student (Male, Female).
    • Exam_Score: Final exam score.
    • Attendance: Percentage of classes attended.
    • Tutoring_Sessions: Number of tutoring sessions attended per month.

    Analysis in R Studio

    Install and Load Packages

    # Install packages (only needed once)
    install.packages("tidyverse")   # data wrangling and ggplot2
    install.packages("caret")       # model training and cross-validation
    install.packages("gridExtra")   # arranging several ggplot objects in a grid
    install.packages("metan")       # correlation matrix utilities
    
    # Load packages
    library("tidyverse")
    library("caret")
    library("gridExtra")
    library("metan")
    
    

    Load Data

    df <- read_csv("StudentPerformanceFactors.csv")
    

    Data Preprocessing

    # Inspect structure, first/last rows, and numeric summaries
    glimpse(df)
    head(df)
    tail(df)
    summary(df %>%
              select_if(is.numeric))
    
    

    Change column names to lowercase and check missing values

    # Standardize column names to lowercase
    names(df) <- tolower(names(df))
    names(df)
    
    # Total number of missing values in the data frame
    sum(is.na(df))
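    
    To see which columns the missing values sit in, a per-column count is a small optional check:
    
    # Number of missing values in each column
    colSums(is.na(df))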
    

    Delete Missing Values

    # Drop rows with any missing value and confirm none remain
    df <- na.omit(df)
    sum(is.na(df))
    

    Select only columns that are NOT character data type

    # Keep only the non-character (numeric) columns; used later for the correlation matrix
    df_numeric <- df %>%
      select_if(negate(is.character))
    

    Convert character data type to factor data type

    df$family_income <- factor(df$family_income,
                               levels = c("Low", "Medium", "High"),
                               ordered = TRUE)
    df$parental_education_level <- factor(df$parental_education_level,
                                          levels = c("High School", "College", "Postgraduate"),
                                          ordered = TRUE)
    df$gender <- as.factor(df$gender)
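    
    A quick optional check that the level ordering of the new factors was applied as intended:
    
    # Levels print in their defined order
    levels(df$family_income)
    levels(df$parental_education_level)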
    
    

    Exploratory Data Analysis

    Create the individual plots with ggplot

    p1 <- ggplot(df, aes(hours_studied, fill = gender)) +
      geom_bar() +
      theme_minimal() +
      facet_grid(gender ~ .) +
      labs(title = "Hours studied per week by Gender",
           x = "Hours studied") +
      theme(plot.title = element_text(hjust = 0.5))
    
    
    p2 <- ggplot(df, aes(sleep_hours, fill = gender)) + 
      geom_bar(position = position_dodge()) +
      theme_minimal() +
      labs(title = "Average number of hours of sleep per night by Gender",
         x = "Sleep hours") + 
      theme(plot.title = element_text(hjust = 0.5))
    
    
    p3 <- ggplot(df, aes(parental_education_level, fill = gender)) +
      geom_bar(position = position_dodge()) +
      theme_minimal() +
      labs(title = "Parental education level by Gender",
           x = "Parental education level") + 
      theme(plot.title = element_text(hjust = 0.5))
    
    
    p4 <- ggplot(df, aes(family_income, fill = gender)) +
      geom_bar(position = position_dodge()) +
      theme_minimal() +
      labs(title = "Family income by Gender",
           x = "Family income") + 
      theme(plot.title = element_text(hjust = 0.5))
    
    
    p5 <- ggplot(df, aes(peer_influence, fill = gender)) +
      geom_bar(position = position_dodge()) +
      theme_minimal() +
      labs(title = "Influence of peers by Gender",
           x = "Peer influence") + 
      theme(plot.title = element_text(hjust = 0.5))
    
    
    p6 <- ggplot(df, aes(exam_score, fill = gender)) +
      geom_histogram(color = "white") +
      theme_minimal() +
      facet_grid(gender ~ .) +
      labs(title = "Exam score by Gender",
           x = "Exam score") + 
      theme(plot.title = element_text(hjust = 0.5))
    

    Arrange the plots in a grid

    grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)
    

    Correlation Matrix
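    
    The numeric columns collected in df_numeric can be used here. A minimal sketch, assuming the metan package loaded above was intended for its corr_coef() helper; base R's cor() gives the plain matrix:
    
    # Correlation matrix of the numeric variables (base R)
    round(cor(df_numeric), 2)
    
    # Heatmap-style view via metan (assumed usage of corr_coef() and its plot method)
    corr_coef(df_numeric) %>%
      plot()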

    Build Machine Learning Model

    Split Data

    The data are split into 80 % training and 20 % test sets.

    set.seed(42) 
    n <- nrow(df)
    id <- sample(1:n, size = 0.8*n, replace = FALSE)
    train_data <- df[id, ]
    test_data <- df[-id, ]
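    
    An optional sanity check on the resulting partition sizes:
    
    # Roughly 80 % / 20 % of the rows
    nrow(train_data)
    nrow(test_data)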
    

    Linear regression with K-Fold CV

    Train Model
    set.seed(42)
    ctrl <- trainControl(method = "cv",
                         number = 5,
                         verboseIter = TRUE)
    lm_model <- train(exam_score ~ previous_scores + hours_studied + attendance + tutoring_sessions,
                      data = train_data,
                      method = "lm",
                      trControl = ctrl)
    
    
    Test Model
    p_lm <- predict(lm_model, newdata = test_data)
    
    # RMSE on the held-out test set
    rmse_lm <- sqrt(mean((p_lm - test_data$exam_score)^2))
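    
    caret also ships an RMSE() helper that computes the same quantity and could replace the manual formula:
    
    # Equivalent to the sqrt(mean(...)) expression above
    RMSE(p_lm, test_data$exam_score)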
    
    
    
    
    

    KNN with K-Fold CV

    Train Model
    set.seed(42)
    ctrl <- trainControl(method = "cv",
                         number = 5,
                         verboseIter = TRUE)
    knn_model <- train(exam_score ~ previous_scores + hours_studied + attendance + tutoring_sessions,
                       data = train_data,
                       method = "knn",
                       trControl = ctrl,
                       tuneLength = 4)
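    
    Because KNN is distance based, predictors on very different scales can dominate the distance computation. A commonly used variant (an optional addition, not part of the original workflow) centers and scales the predictors inside train():
    
    # Same model with standardized predictors (preProcess is a built-in caret option)
    knn_scaled <- train(exam_score ~ previous_scores + hours_studied + attendance + tutoring_sessions,
                        data = train_data,
                        method = "knn",
                        preProcess = c("center", "scale"),
                        trControl = ctrl,
                        tuneLength = 4)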
    
    Test Model
    p_knn <- predict(knn_model, newdata = test_data)
    
    # RMSE on the held-out test set
    rmse_knn <- sqrt(mean((p_knn - test_data$exam_score)^2))
    

    Random Forest with K-Fold CV

    Train Model
    set.seed(42)
    ctrl <- trainControl(method = "cv",
                         number = 5,
                         verboseIter = TRUE)
    rf_model <- train(exam_score ~ previous_scores + hours_studied + attendance + tutoring_sessions,
                      data = train_data,
                      method = "rf",
                      trControl = ctrl)
    
    Test Model
    p_rf <- predict(rf_model, newdata = test_data)
    
    # RMSE on the held-out test set
    rmse_rf <- sqrt(mean((p_rf - test_data$exam_score)^2))
    

    Conclusion: RMSE Comparison

    The rmse_train column holds the RMSE reported by caret during training (the cross-validation resampling RMSE), and rmse_test holds the RMSE computed on the held-out test set above.
    
    conclusion <- data.frame(
      model = c("Linear regression with K-Fold CV", "KNN with K-Fold CV", "Random Forest with K-Fold CV"),
      rmse_train = c(2.543691, 2.698024, 2.677984),
      rmse_test  = c(2.218041, 2.397026, 2.383069)
    )
    
    print(conclusion)
    
    Linear regression achieves the lowest RMSE on both the cross-validation folds and the held-out test set, so it is the preferred model of the three for this data.
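    
    Rather than typing the training RMSEs in by hand, they could be pulled from the fitted caret objects; a minimal sketch using caret's getTrainPerf() helper:
    
    # Cross-validated RMSE of the final model selected for each algorithm
    getTrainPerf(lm_model)$TrainRMSE
    getTrainPerf(knn_model)$TrainRMSE
    getTrainPerf(rf_model)$TrainRMSE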
    

