HSIC-Lasso feature selection on the mRNA microarray data using censored survival data
The libraries needed to run the code must be loaded first.
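A minimal sketch of the library calls the chunks below rely on is given here; the exact set and load order used in the original analysis may differ.
## Libraries assumed by the code in this section (a sketch; the original may load more)
library(readr)       # read_csv
library(dplyr)       # the %>% pipe
library(knitr)       # kable
library(kableExtra)  # kable_styling, scroll_box
library(ggplot2)     # PCA and diagnostic plots
library(GGally)      # ggsurv Kaplan-Meier plots
library(survival)    # Surv, coxph, survfit, survdiff, concordance
library(glmnet)      # cv.glmnet
library(gplots)      # heatmap.2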
The data is split into a training set and a test set.
######## Input ###############
Clinical_TrainingSet <- read_csv("/Volumes/GoogleDrive/My Drive/Documents/Life Science Informatics Master/Master thesis/Data/Second_data_split/TrainingSet/Clinical_TrainingSet.csv")
mRNAArray_TrainingSet <- read_csv("/Volumes/GoogleDrive/My Drive/Documents/Life Science Informatics Master/Master thesis/Data/Second_data_split/TrainingSet/mRNAArray_TrainingSet.csv")
Clinical_TestSet <- read_csv("/Volumes/GoogleDrive/My Drive/Documents/Life Science Informatics Master/Master thesis/Data/Second_data_split/TestSet/Clinical_TestSet.csv")
mRNAArray_TestSet <- read_csv("/Volumes/GoogleDrive/My Drive/Documents/Life Science Informatics Master/Master thesis/Data/Second_data_split/TestSet/mRNAArray_TestSet.csv")
# For censored patients (no recorded days_to_death), use the days to last
# follow-up as the observed (censored) survival time.
indexes.train<-which(is.na(Clinical_TrainingSet$days_to_death), arr.ind=TRUE)
indexes.test<-which(is.na(Clinical_TestSet$days_to_death), arr.ind=TRUE)
training_time<-Clinical_TrainingSet$days_to_death
test_time<-Clinical_TestSet$days_to_death
for (i in indexes.train){
  training_time[i]<-Clinical_TrainingSet$days_to_last_followup[i]
}
for (i in indexes.test){
  test_time[i]<-Clinical_TestSet$days_to_last_followup[i]
}
total_time<-append(training_time,test_time)
n<-length(Clinical_TrainingSet$X1)
n.new<-length(Clinical_TestSet$X1)
#### Overall Survival Time Vector (N*1)
time<-total_time[1:n]
time<-log(time)
########### Event or Not Vector (N * 1) ####
censoring<-Clinical_TrainingSet$vital_status
######### mRNA (or miRNA expression values) (N*D) ###
mRNAArray_TrainingSet$X1<-NULL
Y.pre.train<-mRNAArray_TrainingSet
#### Overall Survival Time Vector (N*1)
time.new<-total_time[-(1:n)]
time.new<-log(time.new)
########### Event or Not Vector (N * 1) ####
censoring.new<-Clinical_TestSet$vital_status
######### mRNA (or miRNA expression values) (N*D) ###
mRNAArray_TestSet$X1<-NULL
Y.pre.test<-mRNAArray_TestSet
print(paste0("Number of patients in training set: ",as.character(n)))
## [1] "Number of patients in training set: 160"
print(paste0("Number of patients in Validation set: ",as.character(n.new)))
## [1] "Number of patients in Validation set: 261"
HSIC<- read_csv("/Volumes/GoogleDrive/My Drive/Documents/GitHub/MasterThesis/hsic_regression_verhaak_labels.csv",
col_types = cols(X1 = col_skip()))
HSIC<-HSIC$`0`
Y.pre.train<-Y.pre.train[,HSIC]
Y.pre.test<-Y.pre.test[,HSIC]
table(censoring) %>%
kable(caption = "Censoring frequency for the training set") %>%
kable_styling(bootstrap_options = c("striped", "hover","responsive"),full_width = T, position = "center")
censoring | Freq |
---|---|
0 | 6 |
1 | 154 |
table(censoring.new) %>%
kable(caption = "Censoring frequency for the testing set") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = T, position = "center")
censoring.new | Freq |
---|---|
0 | 48 |
1 | 213 |
Y.pre.train[1:5,1:5] %>%
kable(caption = "mRNA MicroArray data for the Training set " ) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = T, position = "center")
DHRS2 | RGS16 | SCGB1A1 | DYNLT3 | H1F0 |
---|---|---|---|---|
-0.5017290 | -0.6184856 | 3.3178606 | 0.5223439 | -1.3289853 |
-0.0513965 | -1.6363537 | -0.1685734 | -0.0605536 | 1.0742208 |
1.7270245 | 0.3571045 | -0.0483781 | 0.7516696 | 1.2437621 |
0.1040074 | -1.7638347 | 5.7551348 | -0.8997272 | 0.5488968 |
0.1947272 | 0.0395339 | -0.2403982 | -0.9498692 | -3.3091073 |
Y.pre.test[1:5,1:5] %>%
kable(caption = "mRNA MicroArray data for the Testing set " ) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = F, position = "center")
DHRS2 | RGS16 | SCGB1A1 | DYNLT3 | H1F0 |
---|---|---|---|---|
1.4784974 | 1.4062304 | -0.0729953 | 0.9441281 | 2.0828987 |
-0.5685385 | 0.4179473 | -0.4277503 | -0.0472059 | 0.5573085 |
-0.1304195 | 0.0586602 | -0.0007057 | 0.8285561 | -0.0179193 |
-0.0522900 | 1.5930622 | -0.1923557 | 1.5438299 | 1.1767911 |
-0.2443608 | 0.9798317 | 0.1303436 | -0.9291382 | -0.2394585 |
Y.pre.train<-scale(Y.pre.train,scale=TRUE,center=TRUE)
Y.pre.test<-scale(Y.pre.test,scale=TRUE,center=TRUE)
totalmRNA<-rbind(Y.pre.train,Y.pre.test)
mRNA.pca<-as.data.frame(prcomp(totalmRNA,scale=TRUE, center= TRUE)$x[,1:2])
# Label each sample by the set it belongs to (Training vs. Validation)
classes<-c(rep("Training", nrow(Clinical_TrainingSet)),
           rep("Validation", nrow(Clinical_TestSet)))
mRNA.pca$Classes<-classes
p1 <- ggplot(mRNA.pca, aes(x=PC1, y=PC2, colour=Classes)) + ggtitle("PCA of the training and validation set \n GBM-TCGA Data set") + geom_point(shape=19) + labs(x = "PC1", y = "PC2", colour = "Classes")
p1
png(file="HSIC-Lasso-Regression_files/PCA_Train&Test.pdf")
p1
dev.off()
## quartz_off_screen
## 2
This chunk calculates the SBC gene signature. It is based on univariate Cox testing of each gene's association with survival.
######## Prefiltering of the Genes ############################### ###########################
######## Using Univariate Cox's Model for Ranking of the Survival Power of the Genes ########
surv.obj <- Surv(time,censoring)
###### Univariate Cox model per gene; keep the likelihood-ratio test p-value
calcCox = function(x){
  fit <- summary(coxph(surv.obj ~ ., data = as.data.frame(x)))
  return(fit$logtest[["pvalue"]])
}
pvalue.sig <- apply(Y.pre.train,2,calcCox)
###### Adjusting p-values for Multiple Test Correction
pvalue.sig.adj <- p.adjust(pvalue.sig, method = "fdr")
#### As the number of selected features is quite variable, first choose a very loose cut-off
signature.loose <- colnames(Y.pre.train)[(pvalue.sig.adj < params$pval.sbc)]
### Combine the adjusted p-values
pvalue.combined <- (pvalue.sig.adj)
names(pvalue.combined) <- colnames(Y.pre.train)
## Sort it
pvalue.combined.sort <- sort(pvalue.combined)
## Only select those genes which are loosely in the signature
pvalue.combined.adj <- pvalue.combined.sort[names(pvalue.combined.sort) %in% signature.loose]
### Take the top 50 genes ####
signature.sbc <- names(pvalue.combined.adj[1:50])
The SBC signature on this dataset looks as follows:
to_plot<-data.frame(Genes=signature.sbc)
to_plot%>%
kable(caption = "SBC Signature") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = F, position = "center")%>%
scroll_box(width = "500px", height = "200px")
Genes |
---|
ANKRD26 |
THNSL1 |
NET1 |
REM1 |
HEMK1 |
DHRS2 |
FKBP6 |
PCBP3 |
NSBP1 |
CBLN1 |
SUPV3L1 |
BDH1 |
PHF11 |
EIF4EBP2 |
DYNLT3 |
TAF5 |
HIST3H2A |
SNX10 |
ATF3 |
ZNF37A |
RGS16 |
TRIM48 |
AKAP12 |
EPHA7 |
ECHDC2 |
ZNF205 |
RGR |
MDFI |
CLTCL1 |
DNM2 |
TRIM14 |
RAC3 |
BCAT1 |
H1F0 |
GFRA1 |
GRIN2C |
ARG2 |
DCLRE1B |
RAGE |
KLF13 |
FOXO4 |
MMP13 |
GPR176 |
RUSC2 |
ADARB2 |
ATN1 |
LBP |
HAPLN1 |
HIST1H4E |
PLAT |
surv.fit <- survfit(Surv(total_time,append(Clinical_TrainingSet$vital_status,Clinical_TestSet$vital_status))~ classes)
p3 <- ggsurv(surv.fit, plot.cens=FALSE, main = "Kaplan Meier curves for training and validation set")+ ggplot2::guides(linetype = FALSE)
p3
png(file="HSIC-Lasso-Regression_files/KMC_Train&Test.pdf")
p3
dev.off()
## quartz_off_screen
## 2
Y <- Y.pre.train[,signature.sbc]
Y.new <- Y.pre.test[,signature.sbc]
Y<-scale(Y,scale=TRUE,center=TRUE)
Y.new<-scale(Y.new,scale=TRUE,center=TRUE)
mRNA.signature<-rbind(Y,Y.new)
mRNA.signature.pca<-as.data.frame(prcomp(mRNA.signature,scale=TRUE, center= TRUE)$x[,1:2])
mRNA.signature.pca$Classes<-classes
p6 <- ggplot(mRNA.signature.pca, aes(x=PC1, y=PC2, colour=Classes)) + ggtitle("PCA of the training and validation set \n SBC Signature mRNAArray Data of GBM") + geom_point(shape=19) + labs(x = "PC1", y = "PC2", colour = "Classes")
p6
png(file="HSIC-Lasso-Regression_files/PCASignature_Train&Test.pdf")
p6
dev.off()
## quartz_off_screen
## 2
########################################################################
###### We prepare the Data Structures Needed for the Running of the SBC ####
#############################################################################
######## Verhaak Signature ###############
Verhaak_gene_signature <-read_csv("/Volumes/GoogleDrive/My Drive/Documents/Life Science Informatics Master/Master thesis/Data/Verhaak_Data/Verhaak_GS_according_to_Training_Set_Split1.csv",
col_names = FALSE, col_types = cols(X1 = col_skip()))
signature.vk<-Verhaak_gene_signature$X2
######## Verhaak Labels #################
labels.vk<-Clinical_TrainingSet$Subtype
#######Getting signature matrix on training and testing######
Y <- Y.pre.train[,signature.sbc]
c.true <- as.factor(labels.vk)
c.true<-unclass(c.true)
Y.new <- Y.pre.test[,signature.sbc]
##############################################################################
############ This file takes in training data for Glioblastoma #################
################################################################################
###### Scaling the data #########
Y <- scale(Y, center = TRUE, scale = TRUE)
######
N <- nrow(Y)
D <- ncol(Y)
smod <- Surv(exp(time), censoring)
##### Initial number of clusters (F is presumably the name expected by the sourced SBC scripts;
##### note that it masks R's built-in F/FALSE shorthand)
k = 4
F = k
##################### STATE OF THE ART TECHNIQUES #################################
##################### BASIC METHODS + SOME ADVANCED METHODS ########################
source('Comparisonx.R')
Comparisonx()
source('ComparisionFLX.R')
ComparisionFLX()
############################# PARAMETERS FOR GIBBS SAMPLING ####
iter = params$iter
iter.burnin = params$iter.burnin
iter.thin = params$iter.thin
Nps = as.integer(iter/ iter.thin)
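The cut-off used earlier (params$pval.sbc) and the sampler settings above are read from the R Markdown params list; a commented stand-in is sketched below with placeholder values only, not the settings used for the reported results.
## Hedged stand-in for the Rmd params list; the values are illustrative placeholders
# params <- list(pval.sbc    = 0.1,  # loose FDR cut-off for the univariate Cox filter
#                iter        = 100,  # Gibbs sampling iterations
#                iter.burnin = 200,  # burn-in iterations
#                iter.thin   = 2)    # thinning interval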
######################### Initialize the Parameters ##############################
source('initialize.R')
initialize()
###################### Start with a good configuration ###########################
#source('startSBC.R')
#startSBC()
########### Train the Model #########################################
source('burninDPMM.R')
burninDPMM()
source('gibbsDPMM.R')
gibbsDPMM()
########## Analyze the fit ##########################################
### Good feature selection from heatmap plus cindex plus randindex
source('MCMCanalyze.R')
MCMCanalyze()
print(paste0("Time (hours) necessary for burnin :",as.character(round(sum(burnintime)/3600),digits=5)))
## [1] "Time (hours) necessary for burnin :1"
print(paste0("Time (hours) necessary for Gibb's sampling :",as.character(round(sum(gibbstime/3600),digits=5))))
## [1] "Time (hours) necessary for Gibb's sampling :0.69124"
###############################
### Some plots and analysis ####
#################################
#### Generating some plots and results ON Training data ###
logrank <- survdiff(smod ~ c.final)
# Degrees of freedom = number of clusters - 1
pval<-1 - pchisq(logrank$chisq, df = length(logrank$n) - 1)
surv.fit <- survfit(smod ~ c.final)
p4 <- ggsurv(surv.fit,surv.col=c("#b16ed3","#d37f6e","#90d36e","#6ec2d3"),plot.cens=FALSE,main = " DPMM \n Kaplan Meier Estimators \n Training Set")+ ggplot2::guides(linetype = FALSE)+labs(subtitle = paste("P-value:",toString(round(pval,digits = 10))))
p4
png(file="HSIC-Lasso-Regression_files/KMC_Training.pdf")
p4
dev.off()
## quartz_off_screen
## 2
############ Generating some Plots ##########################
Y<-Y.pre.train[,signature.sbc]
pc <- prcomp(Y)
pc.pred <- predict(pc,newdata = Y)
p5 <- ggplot(as.data.frame(pc.pred), aes(x=PC1, y=PC2, colour= as.factor(c.final))) + ggtitle("PCA after SBC Clustering \n Training Set") + geom_point(shape=19) + labs(x = "PC1", y = "PC2", colour = "Classes")+scale_color_manual(values=c("#b16ed3","#d37f6e","#90d36e","#6ec2d3","#C70039"))
p5
png(file="HSIC-Lasso-Regression_files/PCA_Train.pdf")
p5
dev.off()
## quartz_off_screen
## 2
# Trace of the model likelihood over the burn-in iterations
to_plot<-as.data.frame(likli.burnin)
p7<-qplot(seq_along(to_plot$likli.burnin), to_plot$likli.burnin, xlab = "Burn-in iteration", ylab = "Likelihood")
p7
png(file="HSIC-Lasso-Regression_files/likburn_Train.pdf")
p7
dev.off()
## quartz_off_screen
## 2
#Defining again Y and Y.new
Y <- Y.pre.train[,signature.sbc]
Y.new <- Y.pre.test[,signature.sbc]
Y<-scale(Y, scale=TRUE,center= TRUE)
Y.new<-scale(Y.new,scale=TRUE,center=TRUE)
#PCA on the combined (scaled) training and validation signature matrices
Y.big<-rbind(Y,Y.new)
pc.big <- prcomp(Y.big,center=TRUE,scale = TRUE)
# Training samples keep their SBC cluster label; validation samples get a placeholder class (5)
labels<-c(c.sbc, rep(5, nrow(Y.new)))
p8 <- ggplot(as.data.frame(pc.big$x), aes(x=PC1, y=PC2, colour= as.factor(labels))) + ggtitle("PCA \n Training set with SBC labels and unlabeled Validation set") + geom_point(shape=19) + labs(x = "PC1", y = "PC2", colour = "Classes")+scale_color_manual(values=c("#b16ed3","#d37f6e","#90d36e","#6ec2d3","#BDBDBF"))
p8
######## Predict on New Data Set BASED ON JUST THE MOLECULAR DATA #####################################
smod.new <- Surv(time.new, censoring.new)
source('predictCLASS.R')
# predictCLASS outputs c.matrix.new: a matrix with one row per test sample and one column
# per retained MCMC sample, giving the predicted cluster of each test sample at each iteration.
predictCLASS(Y.new)
## Check the predicted Rand index (see the sketch after the cluster-selection step below)
#### Choose that configuration which has the highest difference in survival curves ####################
lr <- c(0)
for (j in 1:Nps){
  lr[j] <- 1 - pchisq(unlist(survdiff(smod.new ~ c.matrix.new[,j]))$chisq,df=length(table(c.matrix.new[,j]))-1)
}
significant<-c()
for (i in 1:Nps){
  if(lr[i]<0.05 & length(table(c.matrix.new[,i]))==4){
    significant<-append(significant,i)
  }
}
if (length(significant)>0){
  index<-significant[which.min(lr[significant])]
  c.sbc.new <- c.matrix.new[,index]
}else{
  print("There was no MCMC iteration that found 4 clusters and was statistically significant (p_value=0.05)")
  c.sbc.new <- c.matrix.new[,which.min(lr)]
}
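The Rand index check mentioned above is not coded in this section; a minimal sketch is given below, assuming the validation clinical table also carries a Verhaak Subtype column and that the mclust package is available (neither is part of the original chunks).
## Hedged sketch: agreement of the selected configuration with the validation-set
## Verhaak subtypes (assumes Clinical_TestSet$Subtype exists and mclust is installed)
library(mclust)
labels.vk.new <- unclass(as.factor(Clinical_TestSet$Subtype))
print(adjustedRandIndex(c.sbc.new, labels.vk.new))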
###############################################################################
################################# Other ways of obtaining predictions #########
## If we fit cluster-specific models ###########################################
pre.sbc <- c(0)
# For every cluster found in the validation set, fit a cluster-specific lasso
# regression (cv.glmnet) of log survival time on the signature genes using the
# training samples of that cluster, then predict the validation samples.
for (q in unique(c.sbc.new)){
  ind <- which(c.sbc == q)
  ind.new <- which(c.sbc.new == q)
  time.tmp <- time[ind]
  time.tmp.new <- time.new[ind.new]
  Y.tmp <- Y[ind,]
  Y.tmp.new <- Y.new[ind.new,]
  reg <- cv.glmnet(x = Y.tmp, y = time.tmp, family = "gaussian")
  if (is.null(dim(Y.tmp.new))){
    # A single validation sample drops to a vector; reshape it to a 1 x D matrix
    pre.sbc[ind.new] <- predict(object = reg, newx = t(matrix(Y.tmp.new)), s = "lambda.min")
  }else{
    pre.sbc[ind.new] <- predict(object = reg, newx = Y.tmp.new, s = "lambda.min")
  }
}
predCIndex.sbc.aft <<- as.numeric(concordance(smod.new ~ exp(pre.sbc))[1])
#### Use the adhoc-prediction algorithm ####################
source('predictADHOCverhaak.R')
source('predictTIME.R')
predictchineseAFTtime(Y.new)
logrank.new <- survdiff(smod.new ~ c.sbc.new)
# Degrees of freedom = number of clusters - 1
pval.new<-1 - pchisq(unlist(logrank.new)$chisq,df =length(logrank.new$n)-1)
surv.fit <- survfit(smod.new ~ c.sbc.new)
p5.new <- ggsurv(surv.fit, surv.col=c("#b16ed3","#d37f6e","#90d36e","#6ec2d3"),plot.cens=FALSE,main = " DPMM \n Kaplan Meier Estimators \n Validation Set")+ ggplot2::guides(linetype = FALSE)+labs(subtitle = paste("P-value:",toString(round(pval.new,digits = 5))))
# Show the training and validation Kaplan-Meier plots (par(mfrow) has no effect on ggplot objects)
p4
p5.new
png(file="HSIC-Lasso-Regression_files/KMC_Test.pdf")
p5.new
dev.off()
## quartz_off_screen
## 2
pc <- prcomp(Y.new,center=TRUE,scale = TRUE)
pc.pred <- predict(pc,newdata = Y.new)
p4.new <- ggplot(as.data.frame(pc.pred), aes(x=PC1, y=PC2, colour= as.factor(c.sbc.new))) + ggtitle("PCA after SBC Clustering \n Validation set") + geom_point(shape=19) + labs(x = "PC1", y = "PC2", colour = "Classes") +scale_color_manual(values=c("#b16ed3","#d37f6e","#90d36e","#6ec2d3"))
p4.new
png(file="HSIC-Lasso-Regression_files/PCA_Test.pdf")
p4.new
dev.off()
## quartz_off_screen
## 2
pdf(file="HSIC-Lasso-Regression_files/PosteriorProbability.pdf")
heatmapdata <- as.data.frame(final.betahat)
heatmap.2(t(as.matrix(heatmapdata)),dendrogram="none", margins=c(6,10), main = "Posterior probability \n for Selection \n ", cexCol = 0.85, cexRow = 0.7, Rowv = FALSE)
dev.off()
## quartz_off_screen
## 2
table(c.sbc) %>%
kable(caption = "Clusters for the training set") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = T, position = "center")
c.sbc | Freq |
---|---|
1 | 21 |
2 | 14 |
3 | 10 |
4 | 115 |
table(c.sbc.new) %>%
kable(caption = "Clusters for the testing set") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = T, position = "center")
c.sbc.new | Freq |
---|---|
1 | 32 |
2 | 10 |
3 | 5 |
4 | 214 |
Methods<-c("Flexmix","K-Means + Penalized AFT","K-Means + Penalized Cox","AFT","Cox","SBC")
recovCIndex<-round(c(recovCIndex.flx,recovCIndex.km.paft,recovCIndex.km.pcox,recovCIndex.na.aft,recovCIndex.na.cox,max(recovCIndex.sbc)),digits = 5)
predCIndex<-c(predCIndex.flx,"NA",predCIndex.kk.pcox,predCIndex.na.aft,predCIndex.na.cox,max(predCIndex.sbc))
sbc_and_comparisons<-as.data.frame(Methods)
sbc_and_comparisons$recovCIndex<-recovCIndex
sbc_and_comparisons$predCIndex<-predCIndex
sbc_and_comparisons %>%
kable(caption = "SBC vs other methods") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = T, position = "center")
Methods | recovCIndex | predCIndex |
---|---|---|
Flexmix | 0.65569 | 0.481911338010064 |
K-Means + Penalized AFT | 0.74191 | NA |
K-Means + Penalized Cox | 0.76896 | 0.506041796745877 |
AFT | 0.72525 | 0.514048554743453 |
Cox | 0.73271 | 0.515811510632828 |
SBC | 0.69753 | 0.564898813677599 |