This video shows a comparison of Missing Data Imputation & Listwise Deletion in the R programming language. Check out my online course on Missing Data Imputation in R: statisticsglobe.com/online-workshop-missing-data-i…
R code of this video:
install.packages("ggplot2") # Install & load ggplot2
library(ggplot2)
install.packages("mice") # Install & load mice
library(mice)
# Simulate the complete ("true") data set ----

# Fix the RNG state so every run draws the same values.
set.seed(875526)

# Number of simulated observations.
n_sample <- 1e5

# Predictors: each one is standard-normal noise plus a linear combination of
# the predictors created before it, rounded to two decimals.
x1 <- round(rnorm(n = n_sample), digits = 2)
x2 <- round(rnorm(n = n_sample) + 0.1 * x1, digits = 2)
x3 <- round(rnorm(n = n_sample) - 0.4 * x1 - 0.1 * x2, digits = 2)
x4 <- round(
  rnorm(n = n_sample) - 0.2 * x1 - 0.3 * x2 + 0.1 * x3,
  digits = 2
)

# Target variable: noise plus a linear combination of all four predictors.
y <- round(
  rnorm(n = n_sample) + 0.2 * x1 + 0.1 * x2 - 0.3 * x3 + 0.1 * x4,
  digits = 2
)

# Collect everything in one data frame without any missing values and show
# its first rows.
df_true <- data.frame(x1 = x1, x2 = x2, x3 = x3, x4 = x4, y = y)
head(df_true)
# Insert missing values into y ----

# MCAR: a uniformly random 25% of the rows lose their y value; the selection
# does not depend on any variable in the data.
# seq_len() is used instead of 1:n_sample so the index vector stays correct
# even for a sample size of zero.
df_mcar <- df_true
df_mcar$y[sample(seq_len(n_sample), size = 0.25 * n_sample)] <- NA
head(df_mcar)

# MAR: whether y is missing depends on the observed predictors x1..x4 (plus
# random noise), but not on y itself.
df_mar <- df_true

# Missingness score: linear combination of the predictors plus noise.
rm_miss_mar <- -0.8 * x1 - 0.7 * x2 + 0.9 * x3 - 0.8 * x4 + rnorm(n_sample)

# Response indicator: 1 = y observed, 0 = y missing. Rows whose score lies
# below the first quartile are flagged as missing. The quartile is requested
# explicitly with probs = 0.25 rather than by position in the default
# quantile() output.
dummy_miss_mar <- rep(1, n_sample)
dummy_miss_mar[
  rm_miss_mar < quantile(rm_miss_mar, probs = 0.25, names = FALSE)
] <- 0
df_mar$y[dummy_miss_mar == 0] <- NA
head(df_mar)
# Listwise deletion ----

# Drop every row that contains at least one NA.
df_mcar_list <- na.omit(df_mcar)
df_mar_list <- na.omit(df_mar)

# Number of rows and columns left after the deletion.
dim(df_mcar_list)
dim(df_mar_list)

# Mean of y: true data first, then the two reduced data sets.
round(mean(df_true[["y"]]), digits = 3)
round(mean(df_mcar_list[["y"]]), digits = 3)
round(mean(df_mar_list[["y"]]), digits = 3)

# Overlay the density of y for the true data and both reduced data sets.
# The constant strings inside aes() become the legend labels.
ggplot() +
  geom_density(mapping = aes(x = y, color = "True Values"), data = df_true) +
  geom_density(mapping = aes(x = y, color = "MCAR"), data = df_mcar_list) +
  geom_density(mapping = aes(x = y, color = "MAR"), data = df_mar_list) +
  guides(color = guide_legend(title = NULL)) +
  ggtitle(label = "Densities after Listwise Deletion")
# Imputation with mice ----

# Run mice with a single imputation (m = 1) on each incomplete data set and
# extract the completed data frame.
df_mcar_mice <- complete(
  mice(data = df_mcar, m = 1)
)
df_mar_mice <- complete(
  mice(data = df_mar, m = 1)
)

# Number of rows and columns after imputation.
dim(df_mcar_mice)
dim(df_mar_mice)

# Mean of y: true data first, then the two imputed data sets.
round(mean(df_true[["y"]]), digits = 3)
round(mean(df_mcar_mice[["y"]]), digits = 3)
round(mean(df_mar_mice[["y"]]), digits = 3)

# Overlay the density of y for the true data and both imputed data sets.
# The constant strings inside aes() become the legend labels.
ggplot() +
  geom_density(mapping = aes(x = y, color = "True Values"), data = df_true) +
  geom_density(mapping = aes(x = y, color = "MCAR"), data = df_mcar_mice) +
  geom_density(mapping = aes(x = y, color = "MAR"), data = df_mar_mice) +
  guides(color = guide_legend(title = NULL)) +
  ggtitle(label = "Densities after mice Imputation")
Follow me on Social Media:
LinkedIn – Joachim Schork Profile: www.linkedin.com/in/joachim-schork/
LinkedIn – Statistics Globe Page: www.linkedin.com/company/statisticsglobe/
LinkedIn – R Programming Group for Discussions & Questions: www.linkedin.com/groups/12555223/
LinkedIn – Python Programming Group for Discussions & Questions: www.linkedin.com/groups/12673534/
X (Formerly Twitter): x.com/JoachimSchork
Facebook – Joachim Schork Profile: www.facebook.com/joachim.schork/
Facebook – Statistics Globe Page: www.facebook.com/statisticsglobecom/
Facebook – R Programming Group for Discussions & Questions: www.facebook.com/groups/statisticsglobe
Facebook – Python Programming Group for Discussions & Questions: www.facebook.com/groups/statisticsglobepython
Instagram: www.instagram.com/statisticsglobecom/
TikTok: www.tiktok.com/@statisticsglobe
コメント