Hello,
I am trying to run a coxph model but get an error
Error in model.frame.default(formula = Surv(time, status) ~
selectedVarnames, :
variable lengths differ (found for 'selectedVarnames')
Of note the dataset is generated as part of using the glmnet for Lasso
regularization.
Glmnet takes in as input a matrix where all categorical variables have been
converted to binary factors with dummies.
As a result I have had to do some data manipulations that are probably the
origin of the error.
Original dataframe -> transform categorical to binary -> transform to
matrix
-> run glment -> get best features -> selected features + status + time
->
transform to dataframe -> run coxph
The coxph code (entire code below):
glmnet.cox <- coxph(Surv(time,status) ~ selectedVarnames,
data=reformat_dataSet, init=selectedBeta,iter=200)
Here is a description of the data (I can't attach the real thing as it is
protected health data):
selectedVar: 394X81 Double Matrix
reformat_dataSet: 394 obs. 83 varaibles (dataframe including 59 predicting
variables, time and status)
selectedBeta: numeric [81]
selectedVarnames:
"FABM0+FABM1+FABM2+FABM4+FABM4EOS+SEXF+SEXM+Age_at_Dx+Performance_Status0+Pe
rformance_Status2+Performance_Status3+Performance_Status4+AHD_typeabnormal
counts_MDS+AHD_typeALL_MDS+AHD_typecytopenia+AHD_typelow PLT+AHD_typelow
white
count+AHD_typeMPD+AHD_typepancytopenia+PRIOR_MALNO+PRIOR_MALYES+PRIOR_CHEMON
O+PRIOR_XRTNO+PRIOR_XRTYES+cyto_grpUNFAVORABLE+Cyto5+Cyto7+Cyto8+Cyto10+Cyto
11+Cyto12+ITDPOS+D835POS+ABS_BLST+BM_PROM+PB_MONO+PB_PROM+HGB+PLT+ALBUMIN+CR
EATININE+CD13+CD34+CD19+Fresh_or__CryoCryo+AKT1_2_3_pT308+ARC+ATF3+BECN1+BIR
C2+CAV1+CDKN2A+CTNNB1+FOXO3_S318_321+GAB2+GRP78+IGFBP2+INPP5D+JMJD6+JUN_pS73
+KIT+LYN+MAP2K1_2_pS217_221+NRP1+PIK3CA+PLAC1+PRKCA+PRKCD_pT507+PTEN_pS380T3
82T383+RAC1_2_3+RPS6KB1+SMAD1+SMAD2_pS245+SMAD5+SMAD5_pS463+STAT3+STMN1+TP53
+TRIM62+VASP+XPO1"
dataset: 394 obs. 270 variables
predict_matrix: 394X392 double matrix
THANK YOU!!!
The entire code (without the data though):
library("survival")
library("pec")
library ("glmnet")
library ("peperr")
library ("Hmisc")
cIndexCoxglmnet <- list()
for (i in 1:50){
train <- sample(1:nrow(dataset), nrow(dataset), replace = TRUE) ## random
sampling with replacement
trainSet <-dataset [train,]
testSet<-dataset [-train,]
cat ("\n","ITERATION:",i,"\n")
#creat Y (survival matrix) for glmnet
surv_obj <- Surv(trainSet$time,trainSet$status)
## tranform categorical variables into binary variables with dummy for
trainSet
predict_matrix <- model.matrix(~ ., data=trainSet,
contrasts.arg = lapply
(trainSet[,sapply(trainSet, is.factor)], contrasts, contrasts=FALSE))
## remove the statu/time variables from the predictor matrix (x) for
glmnet
predict_matrix <- subset (predict_matrix, select=c(-time,-status))
## create a glmnet cox object using lasso regularization and cross
validation
glmnet.cv <- cv.glmnet (predict_matrix, surv_obj, family="cox")
## get the glmnet model on the full dataset
glmnet.obj <- glmnet.cv$glmnet.fit
# find lambda index for the models with least partial likelihood deviance
(by cv.glmnet)
optimal.lambda <- glmnet.cv$lambda.min # For a more parsimoneous model
use lambda.1se
lambda.index <- which(glmnet.obj$lambda==optimal.lambda)
# take beta for optimal lambda
optimal.beta <- glmnet.obj$beta[,lambda.index]
# find non zero beta coef
nonzero.coef <- abs(optimal.beta)>0
selectedBeta <- optimal.beta[nonzero.coef]
# take only covariates for which beta is not zero
selectedVar <- predict_matrix[,nonzero.coef]
# create a dataframe for trainSet with time, status and selected variables
in binary representation for evaluation in pec
reformat_dataSet <- as.data.frame(cbind(surv_obj,selectedVar))
# names of selectedVars
selectedVarnames<-paste(colnames(selectedVar),collapse="+")
# create coxph object with pre-defined coefficients
glmnet.cox <- coxph(Surv(time,status) ~ selectedVarnames,
data=reformat_dataSet, init=selectedBeta,iter=200)
## create datasets based on the testSet fit for glmnet models for testing
in pec function
## !!!encountered tech problems in variables selection for unknown reason
so the code is somewhat redundant !!!
#creat Y (survival matrix) for glmnet
surv_obj_test <- Surv(testSet$time,testSet$status)
## tranform categorical variables into binary variables with dummy for
testSet
predict_matrix_test <- model.matrix(~ ., data=testSet,
contrasts.arg = lapply
(testSet[,sapply(testSet, is.factor)], contrasts, contrasts=FALSE))
## remove the statu/time variables from the predictor matrix (x) for
glmnet
predict_matrix_test <- subset (predict_matrix_test,
select=c(-time,-status))
## remove the statu/time variables from the predictor matrix (x) for
glmnet
selectedVar_test <- predict_matrix_test [,nonzero.coef]
# create a dataframe for trainSet with time, status and selected variables
in binary representation for evaluation in pec
reformat_testSet <- as.data.frame(cbind(surv_obj_test,selectedVar_test))
refrmat_matrix <- cbind(surv_obj_test,selectedVar_test)
## compute c-index (Harrell's) for cox-glmnet models
if (is.null(glmnet.cox)){
cIndexCoxglmnet <- c(cIndexCoxglmnet,NULL)
}else{
cIndexCoxglmnet <- c(cIndexCoxglmnet, rcorr.cens(predict(glmnet.cox,
reformat_testSet),Surv(reformat_testSet$time,reformat_testSet$status))[1])
}
}
[[alternative HTML version deleted]]