We provide a detailed demo of the usage for the package. This package implements the sure independence screening algorithm.
/private/var/folders/6k/mkz8wbnx2gb59d41543gq83h0000gn/T/RtmpXDwynq/Rbuild5e73174443c/SIS/vignettes/sis-demo.R # Sure Independence Screening{#sis}
SIS can be installed from Github or CRAN.
/private/var/folders/6k/mkz8wbnx2gb59d41543gq83h0000gn/T/RtmpXDwynq/Rbuild5e73174443c/SIS/vignettes/sis-demo.R
Then we can load the package, as well as other relevant packages:
library(SIS)
library(pROC)
#> Type 'citation("pROC")' for a citation.
#>
#> Attaching package: 'pROC'
#> The following objects are masked from 'package:stats':
#>
#> cov, smooth, var/private/var/folders/6k/mkz8wbnx2gb59d41543gq83h0000gn/T/RtmpXDwynq/Rbuild5e73174443c/SIS/vignettes/sis-demo.R
We will show in this section how to use the SIS package.
First, we generate a linear model with the first five predictors as signals.
set.seed(0)
n = 400
p = 50
rho = 0.5
corrmat = diag(rep(1 - rho, p)) + matrix(rho, p, p)
corrmat[, 4] = sqrt(rho)
corrmat[4, ] = sqrt(rho)
corrmat[4, 4] = 1
corrmat[, 5] = 0
corrmat[5, ] = 0
corrmat[5, 5] = 1
cholmat = chol(corrmat)
x = matrix(rnorm(n * p, mean = 0, sd = 1), n, p)
x = x %*% cholmat
# gaussian response
set.seed(1)
b = c(4, 4, 4, -6 * sqrt(2), 4/3)
y = x[, 1:5] %*% b + rnorm(n)/private/var/folders/6k/mkz8wbnx2gb59d41543gq83h0000gn/T/RtmpXDwynq/Rbuild5e73174443c/SIS/vignettes/sis-demo.R
Next, we apply the SIS screening without iteration.
# SIS without regularization
model10 = SIS(x, y, family = "gaussian", iter = FALSE)
# Getting the final selected variables after regularization step
model10$ix
#> [1] 3 2 1 5 4
# Getting the ranked list of variables from the screening step
model10$sis.ix0
#> [1] 3 2 1 5 29 9 31 47 24 30 46 7 11 25 43 35 42 18 48 6 21 32 10 50 22
#> [26] 4 8 44 34 36 26 41 33 17 19 16 13 14 12 23 45 49 39 20 15 27 38 28 40 37
# The top 10 ranked variables from the screening step
model10$ix0[1:10]
#> [1] 3 2 1 5 29 9 31 47 24 30/private/var/folders/6k/mkz8wbnx2gb59d41543gq83h0000gn/T/RtmpXDwynq/Rbuild5e73174443c/SIS/vignettes/sis-demo.R
Now, we apply the SIS screening with iteration and combined with SCAD penalty.
# SIS with regularization
model11 = SIS(x, y, family = "gaussian", penalty = "SCAD", iter = TRUE)
#> Iter 1 , screening: 3 2 1 5 29 9 31 47 24 30 46 7 11 25 43 35 42 18 48 6 21 32 10 50 22 4 8 44 34 36 26 41 33
#> Iter 1 , selection: 3 2 1 5 4
#> Iter 1 , conditional-screening: 24 13 45 42 16 37 27 25 39 47 7 26 23 6 35 21 44 36 31 30 11 40 19 48 15 18 46 38 10 33 28 49 8 22 14 32 12 41 20 43 17 34 50 29 9
#> Iter 2 , screening: 3 2 1 5 4 24 13 45 42 16 37 27 25 39 47 7 26 23 6 35 21 44 36 31 30 11 40 19 48 15 18 46 38 10 33 28 49 8 22 14 32 12 41 20 43 17 34 50 29 9
#> Iter 2 , selection: 3 2 1 5 4
#> Model already selected
# Getting the final selected variables
model10$ix
#> [1] 3 2 1 5 4
# The top 10 ranked variables for the final screening step
model11$ix0[1:10]
#> [1] 3 2 1 5 4 24 13 45 42 16
# The top 10 ranked variables for each screening step
lapply(model11$ix_list, f <- function(x) {
x[1:10]
})
#> [[1]]
#> [1] 3 2 1 5 29 9 31 47 24 30
#>
#> [[2]]
#> [1] 3 2 1 5 4 24 13 45 42 16/private/var/folders/6k/mkz8wbnx2gb59d41543gq83h0000gn/T/RtmpXDwynq/Rbuild5e73174443c/SIS/vignettes/sis-demo.R
set.seed(2)
feta <- x[, 1:5] %*% b
fprob <- exp(feta)/(1 + exp(feta))
y <- rbinom(n, 1, fprob)
model21 <- SIS(x, y, family = "binomial", tune = "bic")
#> Iter 1 , screening: 1 3 2 5 29 30 31 25 17 13 14 32 9 18 27 26 4 48 43 44 47 7 23 46 21 12 6 20 42 15 33 35 22
#> Iter 1 , selection: 1 3 2 5 4
#> Iter 1 , conditional-screening: 11 7 17 27 46 41 26 49 47 9 24 14 40 10 36 42 34 8 28 39 21 20 18 31 13 45 43 32 25 30 16 23 22 50 33 15 19 38 37 29 44 35 12 48 6
#> Iter 2 , screening: 1 3 2 5 4 11 7 17 27 46 41 26 49 47 9 24 14 40 10 36 42 34 8 28 39 21 20 18 31 13 45 43 32 25 30 16 23 22 50 33 15 19 38 37 29 44 35 12 48 6
#> Iter 2 , selection: 1 3 2 5 4
#> Model already selected
# Getting the final selected variables
model21$ix
#> [1] 1 3 2 5 4
# The top 10 ranked variables for the final screening step
model11$ix0[1:10]
#> [1] 3 2 1 5 4 24 13 45 42 16
# The top 10 ranked variables for each screening step
lapply(model11$ix_list, f <- function(x) {
x[1:10]
})
#> [[1]]
#> [1] 3 2 1 5 29 9 31 47 24 30
#>
#> [[2]]
#> [1] 3 2 1 5 4 24 13 45 42 16/private/var/folders/6k/mkz8wbnx2gb59d41543gq83h0000gn/T/RtmpXDwynq/Rbuild5e73174443c/SIS/vignettes/sis-demo.R
set.seed(4)
b <- c(4, 4, 4, -6 * sqrt(2), 4/3)
myrates <- exp(x[, 1:5] %*% b)
Sur <- rexp(n, myrates)
CT <- rexp(n, 0.1)
Z <- pmin(Sur, CT)
ind <- as.numeric(Sur <= CT)
y <- survival::Surv(Z, ind)
model41 <- SIS(x, y, family = "cox", penalty = "lasso", tune = "bic", varISIS = "aggr",
seed = 41)
#> Iter 1 , screening: 1 2 3 4 5 6 9 18 21 24 29 30 31 34 35 37 40 41 42 44 47 48
#> Iter 1 , selection: 1 2 3 4 5 6 9 18 21 24 29 30 31 35 37 40 41 42 44 47 48
#> Iter 1 , conditional-screening: 7 8 10 11 12 13 14 15 16 17 19 20 22 23 25 26 27 28 32 33 34 36 38 39 43 45 46 49 50
#> Iter 2 , screening: 1 2 3 4 5 6 9 18 21 24 29 30 31 35 37 40 41 42 44 47 48 7 8 10 11 12 13 14 15 16 17 19 20 22 23 25 26 27 28 32 33 34 36 38 39 43 45 46 49 50
#> Iter 2 , selection: 1 2 3 4 5 6 9 18 29 31 40 42 47 12 14 22 23 27 28 33 34 39 43 49 50
#> Iter 2 , conditional-screening: 7 8 10 11 13 15 16 17 19 20 21 24 25 26 30 32 35 36 37 38 41 44 45 46 48
#> Iter 3 , screening: 1 2 3 4 5 6 9 18 29 31 40 42 47 12 14 22 23 27 28 33 34 39 43 49 50 7 8 10 11 13 15 16 17 19 20 21 24 25 26 30 32 35 36 37 38 41 44 45 46 48
#> Iter 3 , selection: 1 2 3 4 5 6 9 18 29 31 40 42 47 12 14 22 23 27 28 33 34 39 43 49 50
#> Model already selected
model42 <- SIS(x, y, family = "cox", penalty = "lasso", tune = "bic", varISIS = "cons",
seed = 41)
#> Iter 1 , screening: 1 2 3 4 5 6 7 9 11 15 16 17 19 20 21 22 23 24 29 30 31 32 34 35 37 39 41 44 45 46 47 48 50
#> Iter 1 , selection: 1 2 3 4 5 6 9 22 23 24 29 31 34 37 39 46 47 48 50
#> Iter 1 , conditional-screening: 7 8 10 11 12 13 14 15 16 17 18 19 20 21 25 26 27 28 30 32 33 35 36 38 40 41 42 43 44 45 49
#> Iter 2 , screening: 1 2 3 4 5 6 9 22 23 24 29 31 34 37 39 46 47 48 50 7 8 10 11 12 13 14 15 16 17 18 19 20 21 25 26 27 28 30 32 33 35 36 38 40 41 42 43 44 45 49
#> Iter 2 , selection: 1 2 3 4 5 6 9 22 23 29 31 34 39 47 50 12 14 18 27 28 33 40 42 43 49
#> Iter 2 , conditional-screening: 7 8 10 11 13 15 16 17 19 20 21 24 25 26 30 32 35 36 37 38 41 44 45 46 48
#> Iter 3 , screening: 1 2 3 4 5 6 9 22 23 29 31 34 39 47 50 12 14 18 27 28 33 40 42 43 49 7 8 10 11 13 15 16 17 19 20 21 24 25 26 30 32 35 36 37 38 41 44 45 46 48
#> Iter 3 , selection: 1 2 3 4 5 6 9 22 23 29 31 34 39 47 50 12 14 18 27 28 33 40 42 43 49
#> Model already selected
model41$ix
#> [1] 1 2 3 4 5 6 9 18 29 31 40 42 47 12 14 22 23 27 28 33 34 39 43 49 50
model42$ix
#> [1] 1 2 3 4 5 6 9 22 23 29 31 34 39 47 50 12 14 18 27 28 33 40 42 43 49/private/var/folders/6k/mkz8wbnx2gb59d41543gq83h0000gn/T/RtmpXDwynq/Rbuild5e73174443c/SIS/vignettes/sis-demo.R
y <- as.factor(iris$Species)
noise <- matrix(rnorm(nrow(iris) * 200), nrow(iris), 200)
x <- cbind(as.matrix(iris[, -5]), noise)
model21 <- SIS(x, y, family = "multinom", penalty = "lasso")
#> Iter 1 , screening: 4 3 1 2
#> Iter 1 , selection: 4 3 1 2
#> Iter 1 , conditional-screening: 112 66 106
#> Iter 2 , screening: 4 3 1 2 112 66 106
#> Iter 2 , selection: 4 3 1 2 112 66 106
#> Maximum number of variables selected
# Getting the final selected variables
model21$ix
#> [1] 4 3 1 2 112 66 106
# The top 10 ranked variables for the final screening step
model21$ix0[1:10]
#> [1] 4 3 1 2 112 66 106 NA NA NA
# The top 10 ranked variables for each screening step
lapply(model21$ix_list, f <- function(x) {
x[1:10]
})
#> [[1]]
#> [1] 4 3 1 2 10 72 86 199 14 52
#>
#> [[2]]
#> [1] 4 3 1 2 112 66 106 46 49 100/private/var/folders/6k/mkz8wbnx2gb59d41543gq83h0000gn/T/RtmpXDwynq/Rbuild5e73174443c/SIS/vignettes/sis-demo.R
# Loading data: Gene expression data from 7129 genes and 38 patients with acute leukemias (27 in class acute lymphoblastic leukemia and 11 in class acute myeloid leukemia) from the microarray study of Golub et al. (1999). These data can be found in: http://wwwprod.broadinstitute.org/cgi-bin/cancer/datasets.cgi
# Getting the predictors and response variables
x_train <- as.matrix(leukemia.train[,1:7129])
y_train <- leukemia.train[,7130]
x_test <- as.matrix(leukemia.test[,1:7129])
y_test <- leukemia.test[,7130]
# Calling SIS and calculating the time taken for the algorithm to run
start.time <- Sys.time()
sis <- SIS(x_train, y_train, family = "binomial", penalty='enet',
tune='cv', nfolds = 10, iter = TRUE, iter.max = 10,
seed = 123, nsis=dim(x_train)[1]/2, standardize = TRUE,
boot_ci=FALSE)
#> Iter 1 , screening: 3320 4847 2020 1745 5039 1834 461 4196 3847 2288 1249 6201
#> Iter 1 , selection: 3320 4847 2020 1745 5039 1834 461 4196 3847 2288 1249 6201
#> Iter 1 , conditional-screening: 5954 1779 3328 2674 5788 4444 4025
#> Iter 2 , screening: 3320 4847 2020 1745 5039 1834 461 4196 3847 2288 1249 6201 5954 1779 3328 2674 5788 4444 4025
#> Iter 2 , selection: 3320 4847 2020 1745 5039 1834 461 4196 3847 2288 1249 6201 5954 1779 3328 2674 5788 4444 4025
#> Maximum number of variables selected
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
#> Time difference of 5.969336 secs/private/var/folders/6k/mkz8wbnx2gb59d41543gq83h0000gn/T/RtmpXDwynq/Rbuild5e73174443c/SIS/vignettes/sis-demo.R
# Getting the AUC in the test set
pred <- predict(sis, x_test, type = "class")
auc(pred[, 1], y_test)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
#> Area under the curve: 0.9348
# Bootstrap confidence intervals are omitted in the vignette build to
# keep the example deterministic and CRAN-stable./private/var/folders/6k/mkz8wbnx2gb59d41543gq83h0000gn/T/RtmpXDwynq/Rbuild5e73174443c/SIS/vignettes/sis-demo.R
# Loading data: Gene expression data from 12600 genes and 52 patients with prostate tumors and 50 normal specimens from the microarray study of Singh et al. (2002). These data can be found in: \source{http://wwwprod.broadinstitute.org/cgi-bin/cancer/datasets.cgi}
# Getting the predictors and response variables
x_train <- as.matrix(prostate.train[,1:12600])
y_train <- prostate.train[,12601]
x_test <- as.matrix(prostate.test[,1:12600])
y_test <- prostate.test[,12601]
# Calling SIS and calculating the time taken for the algorithm to run
start.time <- Sys.time()
sis <- SIS(x_train, y_train, family = "binomial", penalty='enet',
nfolds = 10, iter = TRUE, iter.max = 10,tune='cv',
seed = 123, nsis=dim(x_train)[1]/2, standardize = TRUE,
boot_ci=FALSE)
#> Iter 1 , screening: 6185 8965 4365 10138 6866 9172 8123 7067 12148 9050 9850 10494 8850 10956 288 10537 8631 12153 9034 7247 10553 8058 205 3794 7557 11942 9341 7520 3649 5890 3879 7905 299 6865
#> Iter 1 , selection: 6185 8965 4365 10138 6866 9172 8123 7067 12148 9050 9850 10494 8850 10956 288 10537 8631 12153 9034 7247 10553 8058 205 3794 11942 9341 7520 3649 5890 3879 7905 299 6865
#> Iter 1 , conditional-screening: 2515 1253 11091 512 1126 10213 8205 7217 3459 10615 2245 10137 7293 4903 2775 1536 8406 9240
#> Iter 2 , screening: 6185 8965 4365 10138 6866 9172 8123 7067 12148 9050 9850 10494 8850 10956 288 10537 8631 12153 9034 7247 10553 8058 205 3794 11942 9341 7520 3649 5890 3879 7905 299 6865 2515 1253 11091 512 1126 10213 8205 7217 3459 10615 2245 10137 7293 4903 2775 1536 8406 9240
#> Iter 2 , selection: 6185 8965 4365 10138 6866 9172 8123 7067 12148 9050 9850 10494 8850 10956 288 10537 8631 12153 9034 7247 10553 8058 205 3794 11942 9341 7520 3649 5890 3879 7905 299 6865 2515 1253 11091 512 1126 10213 8205 7217 3459 10615 2245 10137 7293 2775 8406 9240
#> Iter 2 , conditional-screening: 10949 9319
#> Iter 3 , screening: 6185 8965 4365 10138 6866 9172 8123 7067 12148 9050 9850 10494 8850 10956 288 10537 8631 12153 9034 7247 10553 8058 205 3794 11942 9341 7520 3649 5890 3879 7905 299 6865 2515 1253 11091 512 1126 10213 8205 7217 3459 10615 2245 10137 7293 2775 8406 9240 10949 9319
#> Iter 3 , selection: 6185 8965 4365 10138 6866 9172 8123 7067 12148 9050 9850 10494 8850 10956 288 10537 8631 12153 9034 7247 10553 8058 205 3794 11942 9341 7520 3649 5890 3879 7905 299 6865 2515 1253 11091 512 1126 10213 8205 7217 3459 10615 2245 10137 7293 2775 8406 9240 10949 9319
#> Maximum number of variables selected
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
#> Time difference of 2.093644 mins/private/var/folders/6k/mkz8wbnx2gb59d41543gq83h0000gn/T/RtmpXDwynq/Rbuild5e73174443c/SIS/vignettes/sis-demo.R
# Getting the AUC in the test set
pred <- predict(sis, x_test, type = "class")
auc(pred[, 1], y_test)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases
#> Area under the curve: 0.95
# Bootstrap confidence intervals are omitted in the vignette build to
# keep the example deterministic and CRAN-stable./private/var/folders/6k/mkz8wbnx2gb59d41543gq83h0000gn/T/RtmpXDwynq/Rbuild5e73174443c/SIS/vignettes/sis-demo.R