forked from dataprofessor/code
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dhfr-handling-missing-data.R
96 lines (54 loc) · 1.87 KB
/
dhfr-handling-missing-data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
####################################
# Data Professor #
# http://youtube.com/dataprofessor #
# http://github.com/dataprofessor #
####################################
# 1. Loading the DHFR data
library(RCurl)
# Fetch the raw CSV text over HTTPS with RCurl::getURL() and parse it into a
# data frame.
dhfr <- read.csv(
  text = getURL(
    "https://raw.githubusercontent.com/dataprofessor/data/master/dhfr.csv"
  )
)
# View() opens a spreadsheet-style viewer and needs a GUI session, so it is
# only called interactively; a non-interactive run (e.g. Rscript) skips it
# instead of failing where no viewer is available.
if (interactive()) {
  View(dhfr)
}
# 2. Check for missing data
# Total number of NA cells in the whole data frame (0 means the data is clean).
sum(is.na(dhfr))
# 3. If data is clean, randomly introduce NA to the dataset
# Randomly replace cells of a data frame (or matrix) with NA.
#
# Arguments:
#   data  A two-dimensional object (data frame or matrix) to inject missing
#         values into.
#   n     Number of cells to set to NA. Values below zero are treated as zero,
#         fractional values are rounded up, and the count is capped at the
#         total number of cells in `data`.
#
# Returns:
#   A copy of `data` in which `n` distinct, randomly chosen cells (after the
#   adjustments above) have been set to NA. Cells that were already NA may be
#   chosen, so the total NA count can differ if `data` was not clean.
#
# Cells are drawn WITHOUT replacement. Drawing one random row/column pair per
# iteration can pick the same cell twice and silently yield fewer than `n`
# missing values.
na.gen <- function(data, n) {
  stopifnot(
    length(dim(data)) == 2L,
    is.numeric(n), length(n) == 1L, !is.na(n)
  )

  n_row <- nrow(data)
  n_col <- ncol(data)
  # as.numeric() avoids integer overflow for very large inputs.
  n_cells <- as.numeric(n_row) * n_col
  n_pick <- min(max(ceiling(n), 0), n_cells)
  if (n_pick == 0) {
    return(data)
  }

  # Draw linear (column-major) cell positions, then map each to a row/column.
  cells <- sample.int(n_cells, n_pick)
  rows <- (cells - 1) %% n_row + 1
  cols <- (cells - 1) %/% n_row + 1

  # Assign one column at a time so every column keeps its own type.
  for (j in unique(cols)) {
    data[rows[cols == j], j] <- NA
  }
  data
}
# Before introducing NA to the dataset, leave the Y class label (output
# variable) out so that only the remaining columns can receive missing values.
# NOTE(review): this drops the FIRST column, assumed to be Y -- confirm against
# the CSV header.
dhfr <- dhfr[, -1]

# Inject 100 missing values. Only ONE call is executed: running several of the
# equivalent calls in a row would stack further NAs on top of the first batch.
dhfr <- na.gen(dhfr, 100)
# Equivalent call with named arguments (order is irrelevant when named):
# dhfr <- na.gen(n = 100, data = dhfr)

# Exercise: the call below produces an error, why?
# (Hint: unnamed arguments are matched by position, so 100 is taken as `data`
# and the data frame as `n`.) It is wrapped in try() so the error message is
# still shown without aborting the rest of the script; because the call fails,
# the assignment never happens and dhfr is left unchanged.
try(dhfr <- na.gen(100, dhfr))
# 4. Check again for missing data
# Total number of NA cells across the whole data frame
sum(is.na(dhfr))
# Number of NA cells in each column
colSums(is.na(dhfr))
# Compact overview of the structure of every column
str(dhfr)
# Lists rows with missing data: keep only the rows that contain at least one NA
missingdata <- dhfr[!complete.cases(dhfr), ]
# Count the NA cells inside those incomplete rows
sum(is.na(missingdata))
# If above sum is 0, this means that there is no missing data and proceed to modeling.
# If above sum is greater than 0, then proceed to # 5
# 5. Handling the missing data. There are 2 options, decide and choose only 1
# 5.1. Simply delete all entries with missing data
# na.omit() drops every row that has an NA in any column
clean.data <- na.omit(dhfr)
# Sanity check: no NA should remain once the incomplete rows are removed
sum(is.na(clean.data))
# 5.2. Imputation: Replace missing values with a per-column summary statistic

# Fill the NA cells of every numeric column of `data` with FUN(column).
#
# Arguments:
#   data  A data frame.
#   FUN   Summary function called as FUN(x, na.rm = TRUE), e.g. mean or median.
#
# Returns:
#   A copy of `data` in which the NAs of each numeric column are replaced by
#   FUN of that column's non-missing values; non-numeric columns are untouched.
impute_numeric_cols <- function(data, FUN) {
  # vapply() always yields a logical vector, even for a zero-column data frame,
  # where sapply() would return an empty list that which() rejects.
  numeric_cols <- which(vapply(data, is.numeric, logical(1)))
  for (i in numeric_cols) {
    data[is.na(data[, i]), i] <- FUN(data[, i], na.rm = TRUE)
  }
  data
}

# MEAN
dhfr.impute <- impute_numeric_cols(dhfr, mean)
sum(is.na(dhfr.impute))

# MEDIAN (starts again from dhfr, so it replaces the mean-imputed result above)
dhfr.impute <- impute_numeric_cols(dhfr, median)
sum(is.na(dhfr.impute))