forked from dataprofessor/code
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dhfr-parallel-speed-up.R
111 lines (78 loc) · 2.71 KB
/
dhfr-parallel-speed-up.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
####################################
# Data Professor #
# http://youtube.com/dataprofessor #
# http://github.com/dataprofessor #
####################################
# Load required libraries
library(datasets) # Provides several built-in data sets
library(caret)    # Machine learning toolkit: Classification And REgression Training

# Load the dhfr data set (dihydrofolate reductase inhibition data)
data(dhfr)

# Count missing values across the whole data set (0 means none)
sum(is.na(dhfr))

# Fix the random seed so the data split below is reproducible
set.seed(100)

# Stratified random split on the class label Y: 80% training / 20% testing
TrainingIndex <- createDataPartition(dhfr$Y, p = 0.8, list = FALSE)
TrainingSet <- dhfr[TrainingIndex, ]  # Training set
TestingSet <- dhfr[-TrainingIndex, ]  # Test set
###############################
# Random forest

# Baseline: train a random forest WITHOUT parallel processing and time it
t_start <- proc.time()
Model <- train(Y ~ .,
  data = TrainingSet, # Build model using training set
  method = "rf"       # Learning algorithm: random forest
)
t_stop <- proc.time()

# Elapsed CPU/wall time for the sequential run
elapsed <- t_stop - t_start
print(elapsed)
# Use doParallel to speed up training
# https://topepo.github.io/caret/parallel-processing.html
library(doParallel)

# Spin up a local cluster of 5 worker processes and register it as the
# foreach parallel backend; caret's train() picks it up automatically.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)

start.time <- proc.time()
Model <- train(Y ~ .,
data = TrainingSet, # Build model using training set
method = "rf" # Learning algorithm
)
stop.time <- proc.time()
run.time <- stop.time - start.time
print(run.time)

stopCluster(cl)
# Restore the sequential backend. Without this, the stopped cluster stays
# registered and later foreach/caret calls (including the train() calls
# below) would try to use the dead workers and error out.
registerDoSEQ()
##########################
# Tuned run WITHOUT parallel processing:
# search mtry over the values 5, 10, 15 instead of caret's default grid
t_start <- proc.time()
Model <- train(Y ~ .,
  data = TrainingSet,                            # Build model using training set
  method = "rf",                                 # Learning algorithm
  tuneGrid = data.frame(mtry = seq(5, 15, by = 5)) # Candidate mtry values
)
t_stop <- proc.time()

# Elapsed time for the sequential tuned run
elapsed <- t_stop - t_start
print(elapsed)
# Using doParallel: same tuned run, but trained on a parallel backend
library(doParallel)

# Start a 5-worker local cluster and register it with foreach/caret
cl <- makePSOCKcluster(5)
registerDoParallel(cl)

start.time <- proc.time()
Model <- train(Y ~ .,
data = TrainingSet, # Build model using training set
method = "rf", # Learning algorithm
tuneGrid = data.frame(mtry = seq(5,15, by=5))
)
stop.time <- proc.time()
run.time <- stop.time - start.time
print(run.time)

stopCluster(cl)
# Re-register the sequential backend so later foreach/caret calls do not
# attempt to dispatch work to the now-stopped cluster.
registerDoSEQ()
##########################
# Apply model for prediction

# Predictions on the TRAINING set — an optimistic (resubstitution) estimate
Model.training <- predict(Model, TrainingSet) # Apply model to make prediction on Training set

# Model performance (displays confusion matrix and statistics)
Model.training.confusion <- confusionMatrix(Model.training, TrainingSet$Y)
print(Model.training.confusion)

# Predictions on the held-out TEST set — an honest performance estimate.
# TestingSet was created by the split at the top but previously never used.
Model.testing <- predict(Model, TestingSet)
Model.testing.confusion <- confusionMatrix(Model.testing, TestingSet$Y)
print(Model.testing.confusion)

# Feature importance
Importance <- varImp(Model)
plot(Importance, top = 25) # Show only the 25 most important features
plot(Importance, col = "red")