Description

HSIC-Lasso feature selection on the mRNA microArray data using censoring data

Loading of libraries

Libraries needed to run the code must be loaded first

Loading/Splitting training and testing sets

Data is separated into training and testing

######## Input ###############

Clinical_TrainingSet <- read_csv("/Volumes/GoogleDrive/My Drive/Documents/Life Science Informatics Master/Master thesis/Data/Second_data_split/TrainingSet/Clinical_TrainingSet.csv")
mRNAArray_TrainingSet <- read_csv("/Volumes/GoogleDrive/My Drive/Documents/Life Science Informatics Master/Master thesis/Data/Second_data_split/TrainingSet/mRNAArray_TrainingSet.csv")
Clinical_TestSet <- read_csv("/Volumes/GoogleDrive/My Drive/Documents/Life Science Informatics Master/Master thesis/Data/Second_data_split/TestSet/Clinical_TestSet.csv")
mRNAArray_TestSet <- read_csv("/Volumes/GoogleDrive/My Drive/Documents/Life Science Informatics Master/Master thesis/Data/Second_data_split/TestSet/mRNAArray_TestSet.csv")
indexes.train<-which(is.na(Clinical_TrainingSet$days_to_death), arr.ind=TRUE)
indexes.test<-which(is.na(Clinical_TestSet$days_to_death), arr.ind=TRUE)
training_time<-Clinical_TrainingSet$days_to_death
test_time<-Clinical_TestSet$days_to_death
for (i in indexes.train){
  training_time[i]<-Clinical_TrainingSet$days_to_last_followup[i]
}
for (i in indexes.test){
  test_time[i]<-Clinical_TestSet$days_to_last_followup[i]
}
total_time<-append(training_time,test_time)

n<-length(Clinical_TrainingSet$X1)
n.new<-length(Clinical_TestSet$X1)


#### Overall Survival Time Vector (N*1)
time<-total_time[1:n]
time<-log(time)
########### Event or Not Vector (N * 1) ####
censoring<-Clinical_TrainingSet$vital_status
######### mRNA (or miRNA expression values) (N*D) ###
mRNAArray_TrainingSet$X1<-NULL
Y.pre.train<-mRNAArray_TrainingSet

#### Overall Survival Time Vector (N*1)
time.new<-total_time[-(1:n)]
time.new<-log(time.new)
########### Event or Not Vector (N * 1) ####
censoring.new<-Clinical_TestSet$vital_status
######### mRNA (or miRNA expression values) (N*D) ###
mRNAArray_TestSet$X1<-NULL
Y.pre.test<-mRNAArray_TestSet

print(paste0("Number of patients in training set: ",as.character(n)))
## [1] "Number of patients in training set: 160"
print(paste0("Number of patients in Validation set: ",as.character(n.new)))
## [1] "Number of patients in Validation set: 261"

HSIC-Lasso feature selection

HSIC<- read_csv("/Volumes/GoogleDrive/My Drive/Documents/GitHub/MasterThesis/hsic_regression_verhaak_labels.csv", 
    col_types = cols(X1 = col_skip()))
HSIC<-HSIC$`0`
Y.pre.train<-Y.pre.train[,HSIC]
Y.pre.test<-Y.pre.test[,HSIC]

Data visualization

Censoring

table(censoring) %>%  
  kable(caption = "Censoring frequency for the training set") %>%
  kable_styling(bootstrap_options = c("striped", "hover","responsive"),full_width = T, position = "center")

Censoring frequency for the training set
censoring	Freq
0	6
1	154

table(censoring.new) %>%
  kable(caption = "Censoring frequency for the testing set") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = T, position = "center")

Censoring frequency for the testing set
censoring.new	Freq
0	48
1	213

Header of mRNA matrixes

  Y.pre.train[1:5,1:5] %>%
  kable(caption = "mRNA MicroArray data for the Training set " ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = T, position = "center")

mRNA MicroArray data for the Training set
DHRS2	RGS16	SCGB1A1	DYNLT3	H1F0
-0.5017290	-0.6184856	3.3178606	0.5223439	-1.3289853
-0.0513965	-1.6363537	-0.1685734	-0.0605536	1.0742208
1.7270245	0.3571045	-0.0483781	0.7516696	1.2437621
0.1040074	-1.7638347	5.7551348	-0.8997272	0.5488968
0.1947272	0.0395339	-0.2403982	-0.9498692	-3.3091073

  Y.pre.test[1:5,1:5] %>%
  kable(caption = "mRNA MicroArray data for the Testing set " ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = F, position = "center")

mRNA MicroArray data for the Testing set
DHRS2	RGS16	SCGB1A1	DYNLT3	H1F0
1.4784974	1.4062304	-0.0729953	0.9441281	2.0828987
-0.5685385	0.4179473	-0.4277503	-0.0472059	0.5573085
-0.1304195	0.0586602	-0.0007057	0.8285561	-0.0179193
-0.0522900	1.5930622	-0.1923557	1.5438299	1.1767911
-0.2443608	0.9798317	0.1303436	-0.9291382	-0.2394585

PCA of training and Validation set

          
Y.pre.train<-scale(Y.pre.train,scale=TRUE,center=TRUE)
Y.pre.test<-scale(Y.pre.test,scale=TRUE,center=TRUE)


totalmRNA<-rbind(Y.pre.train,Y.pre.test)
mRNA.pca<-as.data.frame(prcomp(totalmRNA,scale=TRUE, center= TRUE)$x[,1:2])
classes<-c()
for (i in 1:length(Clinical_TrainingSet$days_to_death)){
    classes<-append(classes,"Training")
  }
for (i in 1:length(Clinical_TestSet$days_to_death)){
    classes<-append(classes,"Validation")
  }

mRNA.pca$Classes<-classes
p1 <- ggplot(mRNA.pca, aes(x=PC1, y=PC2, colour=Classes)) + ggtitle(" PCA of the training and validation set \n GBM-TCGA Data set") + geom_point(shape=19) + labs(y = "PC1", x = "PC2", colour = "Classes") 
p1

png(file="HSIC-Lasso-Regression_files/PCA_Train&Test.pdf")
p1
dev.off()
## quartz_off_screen 
##                 2

Signature calculation

This chunk calculates SBC gene signature It’s based on the idea of Univariate testing of Survival Data features

######## Prefiltering of the Genes ############################### ###########################
######## Using Univariate Cox's Model for Ranking of the Survival Power of the Genes ########

  surv.obj <- Surv(time,censoring)
  coeff.sig <- c(0)
  
  pvalue.sig <- c(0)
  
  
  calcCox = function(x){
    q1 <- unlist(summary(coxph(surv.obj ~ ., data = as.data.frame(x))))
    return(q1$logtest.pvalue)
  }
  
  
  pvalue.sig <- apply(Y.pre.train,2,calcCox)
  
  
  ###### Adjusting p-values for Multiple Test Correction
  pvalue.sig.adj <- p.adjust(pvalue.sig, method = "fdr")
  
  #### As the number of features are quite variable choose first a very loose cut-off 
  
  signature.loose <- colnames(Y.pre.train)[(pvalue.sig.adj < params$pval.sbc)] 
  
  ### Combined the P-values
  pvalue.combined <- (pvalue.sig.adj) 
  names(pvalue.combined) <- colnames(Y.pre.train)
  ## Sort it
  pvalue.combined.sort <- sort(pvalue.combined)
  ## Only select those genes which are loosely in the signature
  pvalue.combined.adj <- pvalue.combined.sort[names(pvalue.combined.sort) %in% signature.loose]
  
  
  ### Take the top 50 genes ####
  signature.sbc <- names(pvalue.combined.adj[1:50])

The SBC signature on the dataset looks like this

  to_plot<-data.frame(Genes=signature.sbc)
   to_plot%>%
     kable(caption = "SBC Signature") %>%
     kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = F, position = "center")%>%
     scroll_box(width = "500px", height = "200px")

SBC Signature
Genes
ANKRD26
THNSL1
NET1
REM1
HEMK1
DHRS2
FKBP6
PCBP3
NSBP1
CBLN1
SUPV3L1
BDH1
PHF11
EIF4EBP2
DYNLT3
TAF5
HIST3H2A
SNX10
ATF3
ZNF37A
RGS16
TRIM48
AKAP12
EPHA7
ECHDC2
ZNF205
RGR
MDFI
CLTCL1
DNM2
TRIM14
RAC3
BCAT1
H1F0
GFRA1
GRIN2C
ARG2
DCLRE1B
RAGE
KLF13
FOXO4
MMP13
GPR176
RUSC2
ADARB2
ATN1
LBP
HAPLN1
HIST1H4E
PLAT

Survival curves for both training and testing set

surv.fit <- survfit(Surv(total_time,append(Clinical_TrainingSet$vital_status,Clinical_TestSet$vital_status))~ classes)
p3 <- ggsurv(surv.fit, plot.cens=FALSE,main = " DPMM \n Kaplan Meier curves for training and validation set")+ ggplot2::guides(linetype = FALSE)
p3

png(file="HSIC-Lasso-Regression_files/KMC_Train&Test.pdf")
p3
dev.off()
## quartz_off_screen 
##                 2

PCA of the signature genes


Y <- Y.pre.train[,signature.sbc]
Y.new <- Y.pre.test[,signature.sbc]
Y<-scale(Y,scale=TRUE,center=TRUE)
Y.new<-scale(Y.new,scale=TRUE,center=TRUE)

mRNA.signature<-rbind(Y,Y.new)
mRNA.signature.pca<-as.data.frame(prcomp(mRNA.signature,scale=TRUE, center= TRUE)$x[,1:2])
mRNA.signature.pca$Classes<-classes
p6 <- ggplot(mRNA.signature.pca, aes(x=PC1, y=PC2, colour=Classes)) + ggtitle(" PCA of the training and validation set \n SBC Signature mRNAArray Data of GBM") + geom_point(shape=19) + labs(y = "PC1", x = "PC2", colour = "Classes") 
p6

png(file="HSIC-Lasso-Regression_files/PCASignature_Train&Test.pdf")
p6
dev.off()
## quartz_off_screen 
##                 2

Training

Preparation of the data needed for the training of the model

########################################################################
###### We prepare the Data Structures Needed for the Running of the SBC ####
#############################################################################

######## Verhaak Signature ###############
Verhaak_gene_signature <-read_csv("/Volumes/GoogleDrive/My Drive/Documents/Life Science Informatics Master/Master thesis/Data/Verhaak_Data/Verhaak_GS_according_to_Training_Set_Split1.csv", 
                                  col_names = FALSE, col_types = cols(X1 = col_skip()))
signature.vk<-Verhaak_gene_signature$X2
######## Verhaak Labels #################
labels.vk<-Clinical_TrainingSet$Subtype

#######Getting signature matrix on training and testing######
Y <- Y.pre.train[,signature.sbc]
c.true <- as.factor(labels.vk)
c.true<-unclass(c.true)

Y.new <- Y.pre.test[,signature.sbc]

Training results

##############################################################################
############ This file takes in training data for Glioblastoma #################
################################################################################

###### Scaling the data #########
Y <- scale(Y, center = TRUE, scale = TRUE)

######
N <- nrow(Y)
D <- ncol(Y)
smod <-  Surv(exp(time), censoring)

##### Initial number of clusters
k =4
F =k


##################### STATE OF THE ART TECHNIQUES #################################
##################### BASIC METHODS + SOME ADVANCED METHODS ########################

source('Comparisonx.R')
Comparisonx()

source('ComparisionFLX.R')
ComparisionFLX()


############################# PARAMETERS for GIBB's SAMPLING ####
iter = params$iter
iter.burnin = params$iter.burnin
iter.thin  = params$iter.thin
Nps = as.integer(iter/ iter.thin)


######################### Initialize the Parameters ##############################
source('initialize.R')
initialize()

###################### Start with a good configuration ###########################
#source('startSBC.R')
#startSBC()


########### Train the Model #########################################
source('burninDPMM.R')
burninDPMM()

source('gibbsDPMM.R')
gibbsDPMM()

########## Analyze the fit ##########################################
### Good feature selection from heatmap plus cindex plus randindex
source('MCMCanalyze.R')
MCMCanalyze()

print(paste0("Time (hours) necessary for burnin :",as.character(round(sum(burnintime)/3600),digits=5)))
## [1] "Time (hours) necessary for burnin :1"
print(paste0("Time (hours) necessary for Gibb's sampling :",as.character(round(sum(gibbstime/3600),digits=5))))
## [1] "Time (hours) necessary for Gibb's sampling :0.69124"

Plots of results from training


###############################
### Some plots and analysis ####
#################################
#### Generating some plots and results ON Training data ###
logrank <- survdiff(smod ~ c.final)
pval<-1 - pchisq(unlist(logrank)$chisq,df =3)
surv.fit <- survfit(smod ~ c.final)
p4 <- ggsurv(surv.fit,surv.col=c("#b16ed3","#d37f6e","#90d36e","#6ec2d3"),plot.cens=FALSE,main = " DPMM \n Kaplan Meier Estimators \n Training Set")+ ggplot2::guides(linetype = FALSE)+labs(subtitle = paste("P-value:",toString(round(pval,digits = 10))))
p4

png(file="HSIC-Lasso-Regression_files/KMC_Training.pdf")
p4
dev.off()
## quartz_off_screen 
##                 2

############ Generating some Plots ##########################
Y<-Y.pre.train[,signature.sbc]
pc <- prcomp(Y)
pc.pred <- predict(pc,newdata = Y)
p5 <- ggplot(as.data.frame(pc.pred), aes(x=pc.pred[,1], y= pc.pred[,2], colour= as.factor(c.final))) + ggtitle("PCA after SBC Clustering \n Training Set") + geom_point(shape=19) + labs(y = "PC1", x = "PC2", colour = "Classes")+scale_color_manual(values=c("#b16ed3","#d37f6e","#90d36e","#6ec2d3","#C70039"))
p5

png(file="HSIC-Lasso-Regression_files/PCA_Train.pdf")
p5
dev.off()
## quartz_off_screen 
##                 2

to_plot<-as.data.frame(likli.burnin)
p7<-qplot(seq_along(to_plot$likli.burnin), to_plot$likli.burnin)
p7

png(file="HSIC-Lasso-Regression_files/likburn_Train.pdf")
p7
dev.off()
## quartz_off_screen 
##                 2

Testing

#Defining again Y and Y.new
Y <- Y.pre.train[,signature.sbc]
Y.new <- Y.pre.test[,signature.sbc]

Y<-scale(Y, scale=TRUE,center= TRUE)
Y.new<-scale(Y.new,scale=TRUE,center=TRUE)

#PCA
Y.big.unscaled<-rbind(Y,Y.new)
pc.unscaled <- prcomp(Y.big.unscaled,center=TRUE,scale = TRUE)
labels<-c.sbc
for (i in 1:dim(Y.new)[1]){labels<-append(labels,5)}
p8 <- ggplot(as.data.frame(pc.unscaled$x), aes(x=pc.unscaled$x[,1], y= pc.unscaled$x[,2], colour= as.factor(labels))) + ggtitle("PCA \n Training set with SBC labels and unlabeled Validation set") + geom_point(shape=19) + labs(y = "PC1", x = "PC2", colour = "Classes")+scale_color_manual(values=c("#b16ed3","#d37f6e","#90d36e","#6ec2d3","#BDBDBF"))
p8

######## Predict on New Data Set  BASED ON JUST THE MOLECULAR DATA #####################################
smod.new <- Surv(time.new, censoring.new)
source('predictCLASS.R')
#What does this function do?: Outputs a matrix that has dimension number of test samples * number of MCMC samples
#The output matrix is c.matrix.new
predictCLASS(Y.new)
## Check the predicted Rand Index

#### Choose that configuration which has the highest difference in survival curves ####################
lr <- c(0)
for (j in 1:Nps){
  lr[j] <-  1 - pchisq(unlist(survdiff(smod.new ~ c.matrix.new[,j]))$chisq,df=length(table(c.matrix.new[,j]))-1)
}

significant<-c()
for (i in 1:Nps){
  if(lr[i]<0.05 & length(table(c.matrix.new[,i]))==4){
    significant<-append(significant,i)
  }
}

if (length(significant)>0){
  index<-significant[which.min(lr[significant])]
  c.sbc.new <- c.matrix.new[,index]

}else{
 print("There was no MCMC iteration that found 4 clusters and was statistically significant (p_value=0.05)")
 c.sbc.new <- c.matrix.new[,which.min(lr)]
}

###############################################################################
################################# Other ways of obtaining predictions #########
## If we fit cluster-specific models ###########################################

pre.sbc <- c(0)
for (q in unique(c.sbc.new)){
  ind <- which(c.sbc == q)
  ind.new <- which(c.sbc.new == q)

  time.tmp <- time[ind]
  time.tmp.new<- time.new[ind.new]

  Y.tmp <- Y[ind,]
  Y.tmp.new <- Y.new[ind.new,]

  reg <- cv.glmnet(x = Y.tmp, y = time.tmp, family = "gaussian")
  
  if (class(Y.tmp.new)=="numeric"){ 
    pre.sbc[ind.new] <- predict(object = reg, newx = t(matrix(Y.tmp.new)), s = "lambda.min")
  }else{
    pre.sbc[ind.new] <- predict(object = reg, newx = Y.tmp.new, s = "lambda.min")
  }
}
predCIndex.sbc.aft  <<- as.numeric(concordance(smod.new ~ exp(pre.sbc))[1])



#### Use the adhoc-prediction algorithm ####################
source('predictADHOCverhaak.R')

source('predictTIME.R')
predictchineseAFTtime(Y.new)

Plot results from testing

logrank.new <- survdiff(smod.new ~ c.sbc.new)
#df= Degrees of freedom should be number of clusters-1
pval.new<-1 - pchisq(unlist(logrank.new)$chisq,df =length(logrank.new$n)-1)
surv.fit <- survfit(smod.new ~ c.sbc.new)
p5.new <- ggsurv(surv.fit, surv.col=c("#b16ed3","#d37f6e","#90d36e","#6ec2d3"),plot.cens=FALSE,main = " DPMM \n Kaplan Meier Estimators \n Validation Set")+ ggplot2::guides(linetype = FALSE)+labs(subtitle = paste("P-value:",toString(round(pval.new,digits = 5))))

par(mfrow=c(2,1))
p4

p5.new

png(file="HSIC-Lasso-Regression_files/KMC_Test.pdf")
p5.new
dev.off()
## quartz_off_screen 
##                 2

pc <- prcomp(Y.new,center=TRUE,scale = TRUE)
pc.pred <- predict(pc,newdata = Y.new)
p4.new <- ggplot(as.data.frame(pc.pred), aes(x=pc.pred[,1], y= pc.pred[,2], colour= as.factor(c.sbc.new))) + ggtitle(" PCA after SBC Clustering \n Validation set") + geom_point(shape=19) + labs(y = "PC1", x = "PC2", colour = "Classes") +scale_color_manual(values=c("#b16ed3","#d37f6e","#90d36e","#6ec2d3"))
p4.new

png(file="HSIC-Lasso-Regression_files/PCA_Test.pdf")
p4.new
dev.off()
## quartz_off_screen 
##                 2

pdf(file="HSIC-Lasso-Regression_files/PosteriorProbability.pdf")
heatmapdata <- as.data.frame(final.betahat)
heatmap.2(t(as.matrix(heatmapdata)),dendrogram="none", margins=c(6,10), main = "Posterior probability \n for Selection \n ", cexCol = 0.85, cexRow = 0.7, Rowv = FALSE)
dev.off()
## quartz_off_screen 
##                 2

table(c.sbc) %>%
  kable(caption = "Clusters for the training set") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = T, position = "center")

Clusters for the training set
c.sbc	Freq
1	21
2	14
3	10
4	115

table(c.sbc.new) %>%
  kable(caption = "Clusters for the testing set") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = T, position = "center")

Clusters for the testing set
c.sbc.new	Freq
1	32
2	10
3	5
4	214

Methods<-c("Flexmix","K-Means + Penalized AFT","K-Means + Penalized Cox","AFT","Cox","SBC")
recovCIndex<-round(c(recovCIndex.flx,recovCIndex.km.paft,recovCIndex.km.pcox,recovCIndex.na.aft,recovCIndex.na.cox,max(recovCIndex.sbc)),digits = 5)
predCIndex<-c(predCIndex.flx,"NA",predCIndex.kk.pcox,predCIndex.na.aft,predCIndex.na.cox,max(predCIndex.sbc))
sbc_and_comparisons<-as.data.frame(Methods)
sbc_and_comparisons$recovCIndex<-recovCIndex
sbc_and_comparisons$predCIndex<-predCIndex
sbc_and_comparisons %>%
  kable(caption = "SBC vs other methods") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = T, position = "center")

SBC vs other methods
Methods	recovCIndex	predCIndex
Flexmix	0.65569	0.481911338010064
K-Means + Penalized AFT	0.74191	NA
K-Means + Penalized Cox	0.76896	0.506041796745877
AFT	0.72525	0.514048554743453
Cox	0.73271	0.515811510632828
SBC	0.69753	0.564898813677599

SBC on the GBM dataset from the whole Firebrowse Database(TCGA).

HSIC-Lasso-Regression

Camila Duitama (Based on Ashar Ahmad’s code for the SBC model)

11th July 2019