The Oncogenic Gene Set was downloaded from MSigDB. A penalized AFT model was fit on the training data, and the linear predictors were then used as input features for the SBC model. This model (which was fit on the training data set) was then used to obtain linear predictor on the test set, which would be the test input features for the SBC.
Libraries needed to run the code must be loaded first
Data is separated into training and testing
######## Input ###############
Clinical_TrainingSet <- read_csv("/Volumes/GoogleDrive/My Drive/Documents/Life Science Informatics Master/Master thesis/Data/Second_data_split/TrainingSet/Clinical_TrainingSet.csv")
mRNAArray_TrainingSet <- read_csv("/Volumes/GoogleDrive/My Drive/Documents/Life Science Informatics Master/Master thesis/Data/Second_data_split/TrainingSet/mRNAArray_TrainingSet.csv")
Clinical_TestSet <- read_csv("/Volumes/GoogleDrive/My Drive/Documents/Life Science Informatics Master/Master thesis/Data/Second_data_split/TestSet/Clinical_TestSet.csv")
mRNAArray_TestSet <- read_csv("/Volumes/GoogleDrive/My Drive/Documents/Life Science Informatics Master/Master thesis/Data/Second_data_split/TestSet/mRNAArray_TestSet.csv")
indexes.train<-which(is.na(Clinical_TrainingSet$days_to_death), arr.ind=TRUE)
indexes.test<-which(is.na(Clinical_TestSet$days_to_death), arr.ind=TRUE)
training_time<-Clinical_TrainingSet$days_to_death
test_time<-Clinical_TestSet$days_to_death
for (i in indexes.train){
training_time[i]<-Clinical_TrainingSet$days_to_last_followup[i]
}
for (i in indexes.test){
test_time[i]<-Clinical_TestSet$days_to_last_followup[i]
}
total_time<-append(training_time,test_time)
n<-length(Clinical_TrainingSet$X1)
n.new<-length(Clinical_TestSet$X1)
#### Overall Survival Time Vector (N*1)
time<-total_time[1:n]
time<-log(time)
########### Event or Not Vector (N * 1) ####
censoring<-Clinical_TrainingSet$vital_status
######### mRNA (or miRNA expression values) (N*D) ###
mRNAArray_TrainingSet$X1<-NULL
Y.pre.train<-mRNAArray_TrainingSet
#### Overall Survival Time Vector (N*1)
time.new<-total_time[-(1:n)]
time.new<-log(time.new)
########### Event or Not Vector (N * 1) ####
censoring.new<-Clinical_TestSet$vital_status
######### mRNA (or miRNA expression values) (N*D) ###
mRNAArray_TestSet$X1<-NULL
Y.pre.test<-mRNAArray_TestSet
print(paste0("Number of patients in training set: ",as.character(n)))
## [1] "Number of patients in training set: 160"
print(paste0("Number of patients in test set: ",as.character(n.new)))
## [1] "Number of patients in test set: 261"
gs<-list()
reg.paft<-list()
linear.aft<-list()
lost<-list()
db_genesets<-msigdbr(species="Homo sapiens",category = "C6")
smod <- Surv(exp(time), censoring)
for (i in unique(db_genesets$gs_name)){
gs[i]<-list(unique(filter(db_genesets,db_genesets$gs_name==i)$gene_symbol))
#Select all columns that aren't in the original dataframe Y.pre.train
tokeep <- gs[[i]][(gs[[i]] %in% colnames(Y.pre.train))]
#list with the percentages of ids lost
lost[[i]]<-(1-(length(tokeep)/length(gs[[i]])))*100
df<-Y.pre.train[,tokeep]
reg.paft[[i]] <- glmnet(x = as.matrix(df), y = time, family = "gaussian")
linear.aft[[i]] <- predict(object = reg.paft[[i]], newx = as.matrix(df), s = min(reg.paft[[i]]$lambda))
}
toplot<-as.data.frame(x=unlist(lost))
colnames(toplot)<-"Percentage"
ggplot(toplot, aes(x=" ", y=Percentage)) +
geom_boxplot(fill="slateblue", alpha=0.2) + labs(title = "Percentage of lost ids per gene set", subtitle =" Training Set")
Y.pre.train<-do.call(cbind,linear.aft)
colnames(Y.pre.train)<-names(linear.aft)
toplot.new<-as.data.frame(x=unlist(lost.new))
colnames(toplot.new)<-"Percentage"
ggplot(toplot, aes(x=" ", y=Percentage)) +
geom_boxplot(fill="slateblue", alpha=0.2) + labs(title = "Percentage of lost ids per gene set", subtitle =" Test Set")
Y.pre.test<-do.call(cbind,linear.aft.new)
colnames(Y.pre.test)<-names(linear.aft.new)
table(censoring) %>%
kable(caption = "Censoring frequency for the training set") %>%
kable_styling(bootstrap_options = c("striped", "hover","responsive"),full_width = T, position = "center")
censoring | Freq |
---|---|
0 | 6 |
1 | 154 |
table(censoring.new) %>%
kable(caption = "Censoring frequency for the testing set") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = T, position = "center")
censoring.new | Freq |
---|---|
0 | 48 |
1 | 213 |
Y.pre.train[1:5,1:5] %>%
kable(caption = "mRNA MicroArray data for the Training set " ) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = T, position = "center")
AKT_UP_MTOR_DN.V1_DN | AKT_UP_MTOR_DN.V1_UP | AKT_UP.V1_DN | AKT_UP.V1_UP | ALK_DN.V1_DN |
---|---|---|---|---|
5.134619 | 5.010416 | 4.860404 | 5.323247 | 5.104891 |
6.574591 | 6.908797 | 6.681042 | 6.347407 | 6.625620 |
5.710860 | 5.806477 | 5.780028 | 5.817500 | 5.749574 |
7.202871 | 6.244260 | 7.058306 | 6.572306 | 6.849285 |
6.331762 | 6.523214 | 6.422343 | 6.244082 | 6.449727 |
Y.pre.test[1:5,1:5] %>%
kable(caption = "mRNA MicroArray data for the Testing set " ) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = F, position = "center")
AKT_UP_MTOR_DN.V1_DN | AKT_UP_MTOR_DN.V1_UP | AKT_UP.V1_DN | AKT_UP.V1_UP | ALK_DN.V1_DN |
---|---|---|---|---|
6.876991 | 3.890184 | 2.847315 | 6.403560 | 7.5767734 |
4.518973 | 4.091609 | 5.642734 | 4.585454 | -0.3478366 |
10.225450 | 7.979294 | 5.732973 | 3.070294 | 4.0343758 |
6.799209 | 2.669068 | 3.329664 | 6.904838 | 2.5633505 |
6.341744 | 6.750768 | 7.398075 | 6.491614 | 1.7388585 |
Y.pre.train<-do.call(cbind,linear.aft)
colnames(Y.pre.train)<-names(linear.aft)
Y.pre.test<-do.call(cbind,linear.aft.new)
colnames(Y.pre.test)<-names(linear.aft.new)
Y.pre.train<-scale(Y.pre.train,scale=TRUE,center=TRUE)
Y.pre.test<-scale(Y.pre.test,scale=TRUE,center=TRUE)
totalmRNA<-rbind(Y.pre.train,Y.pre.test)
mRNA.pca<-as.data.frame(prcomp(totalmRNA,scale=TRUE, center= TRUE)$x[,1:2])
classes<-c()
for (i in 1:length(Clinical_TrainingSet$days_to_death)){
classes<-append(classes,"Training")
}
for (i in 1:length(Clinical_TestSet$days_to_death)){
classes<-append(classes,"Validation")
}
mRNA.pca$Classes<-classes
p1 <- ggplot(mRNA.pca, aes(x=PC1, y=PC2, colour=Classes)) + ggtitle(" PCA of the training and validation set \n GBM-TCGA Data set") + geom_point(shape=19) + labs(y = "PC1", x = "PC2", colour = "Classes")
p1
pdf(file="OncogenicGeneSets+PAFT_files/PCA_Train&Test.pdf")
p1
dev.off()
## quartz_off_screen
## 2
This chunk calculates SBC gene signature It’s based on the idea of Univariate testing of Survival Data features
######## Prefiltering of the Genes ############################### ###########################
######## Using Univariate Cox's Model for Ranking of the Survival Power of the Genes ########
surv.obj <- Surv(time,censoring)
coeff.sig <- c(0)
pvalue.sig <- c(0)
calcCox = function(x){
q1 <- unlist(summary(coxph(surv.obj ~ ., data = as.data.frame(x))))
return(q1$logtest.pvalue)
}
pvalue.sig <- apply(Y.pre.train,2,calcCox)
###### Adjusting p-values for Multiple Test Correction
pvalue.sig.adj <- p.adjust(pvalue.sig, method = "fdr")
#### As the number of features are quite variable choose first a very loose cut-off
signature.loose <- colnames(Y.pre.train)[(pvalue.sig.adj < params$pval.sbc)]
### Combined the P-values
pvalue.combined <- (pvalue.sig.adj)
names(pvalue.combined) <- colnames(Y.pre.train)
## Sort it
pvalue.combined.sort <- sort(pvalue.combined)
## Only select those genes which are loosely in the signature
pvalue.combined.adj <- pvalue.combined.sort[names(pvalue.combined.sort) %in% signature.loose]
### Take the top 50 genes ####
signature.sbc <- names(pvalue.combined.adj[1:50])
The SBC signature on the dataset looks like this
to_plot<-data.frame(Genes=signature.sbc)
to_plot%>%
kable(caption = "SBC Signature") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = F, position = "center")%>%
scroll_box(width = "500px", height = "200px")
Genes |
---|
CAMP_UP.V1_DN |
DCA_UP.V1_UP |
NFE2L2.V2 |
KRAS.600_UP.V1_DN |
KRAS.600.LUNG.BREAST_UP.V1_DN |
KRAS.600.LUNG.BREAST_UP.V1_UP |
E2F1_UP.V1_DN |
TBK1.DF_UP |
KRAS.600_UP.V1_UP |
TBK1.DF_DN |
IL21_UP.V1_UP |
ESC_J1_UP_LATE.V1_UP |
CYCLIN_D1_KE_.V1_DN |
STK33_UP |
IL21_UP.V1_DN |
ATF2_S_UP.V1_UP |
RAF_UP.V1_UP |
STK33_DN |
IL15_UP.V1_DN |
LTE2_UP.V1_DN |
STK33_NOMO_UP |
PRC1_BMI_UP.V1_DN |
LEF1_UP.V1_UP |
GCNP_SHH_UP_LATE.V1_UP |
E2F1_UP.V1_UP |
MTOR_UP.N4.V1_UP |
DCA_UP.V1_DN |
RAF_UP.V1_DN |
VEGF_A_UP.V1_DN |
TGFB_UP.V1_DN |
MTOR_UP.N4.V1_DN |
LTE2_UP.V1_UP |
IL15_UP.V1_UP |
EGFR_UP.V1_DN |
PRC1_BMI_UP.V1_UP |
PRC2_SUZ12_UP.V1_DN |
CYCLIN_D1_UP.V1_DN |
CYCLIN_D1_UP.V1_UP |
AKT_UP.V1_DN |
ESC_J1_UP_LATE.V1_DN |
PRC2_EED_UP.V1_DN |
RAPA_EARLY_UP.V1_DN |
TGFB_UP.V1_UP |
MEK_UP.V1_UP |
STK33_SKM_UP |
NOTCH_DN.V1_DN |
SNF5_DN.V1_UP |
ESC_V6.5_UP_EARLY.V1_DN |
AKT_UP_MTOR_DN.V1_DN |
MEK_UP.V1_DN |
surv.fit <- survfit(Surv(total_time,append(Clinical_TrainingSet$vital_status,Clinical_TestSet$vital_status))~ classes)
p3 <- ggsurv(surv.fit, plot.cens=FALSE,main = " DPMM \n Kaplan Meier curves for training and validation set")+ ggplot2::guides(linetype = FALSE)
p3
pdf(file="OncogenicGeneSets+PAFT_files/KMC_Train&Test.pdf")
p3
dev.off()
## quartz_off_screen
## 2
Y <- Y.pre.train[,signature.sbc]
Y.new <- Y.pre.test[,signature.sbc]
Y<-scale(Y,scale=TRUE,center=TRUE)
Y.new<-scale(Y.new,scale=TRUE,center=TRUE)
mRNA.signature<-rbind(Y,Y.new)
mRNA.signature.pca<-as.data.frame(prcomp(mRNA.signature,scale=TRUE, center= TRUE)$x[,1:2])
mRNA.signature.pca$Classes<-classes
p6 <- ggplot(mRNA.signature.pca, aes(x=PC1, y=PC2, colour=Classes)) + ggtitle(" PCA of the training and validation set \n SBC Signature mRNAArray Data of GBM") + geom_point(shape=19) + labs(y = "PC1", x = "PC2", colour = "Classes")
p6
pdf(file="OncogenicGeneSets+PAFT_files/PCA_Signature.pdf")
p6
dev.off()
## quartz_off_screen
## 2
########################################################################
###### We prepare the Data Structures Needed for the Running of the SBC ####
#############################################################################
######## Verhaak Signature ###############
Verhaak_gene_signature <-read_csv("/Volumes/GoogleDrive/My Drive/Documents/Life Science Informatics Master/Master thesis/Data/Verhaak_Data/Verhaak_GS_according_to_Training_Set_Split1.csv",
col_names = FALSE, col_types = cols(X1 = col_skip()))
signature.vk<-Verhaak_gene_signature$X2
######## Verhaak Labels #################
labels.vk<-Clinical_TrainingSet$Subtype
#######Getting signature matrix on training and testing######
Y <- Y.pre.train[,signature.sbc]
c.true <- as.factor(labels.vk)
c.true<-unclass(c.true)
Y.new <- Y.pre.test[,signature.sbc]
##############################################################################
############ This file takes in training data for Glioblastoma #################
################################################################################
###### Scaling the data #########
Y <- scale(Y, center = TRUE, scale = TRUE)
######
N <- nrow(Y)
D <- ncol(Y)
smod <- Surv(exp(time), censoring)
##### Initial number of clusters
k =4
F =k
##################### STATE OF THE ART TECHNIQUES #################################
##################### BASIC METHODS + SOME ADVANCED METHODS ########################
source('Comparisonx.R')
Comparisonx()
source('ComparisionFLX.R')
ComparisionFLX()
############################# PARAMETERS for GIBB's SAMPLING ####
iter = params$iter
iter.burnin = params$iter.burnin
iter.thin = params$iter.thin
Nps = as.integer(iter/ iter.thin)
######################### Initialize the Parameters ##############################
source('initialize.R')
initialize()
###################### Start with a good configuration ###########################
#source('startSBC.R')
#startSBC()
########### Train the Model #########################################
source('burninDPMM.R')
burninDPMM()
source('gibbsDPMM.R')
gibbsDPMM()
########## Analyze the fit ##########################################
### Good feature selection from heatmap plus cindex plus randindex
source('MCMCanalyze.R')
MCMCanalyze()
print(paste0("Time (hours) necessary for burnin :",as.character(round(sum(burnintime)/3600),digits=5)))
## [1] "Time (hours) necessary for burnin :1"
print(paste0("Time (hours) necessary for Gibb's sampling :",as.character(round(sum(gibbstime/3600),digits=5))))
## [1] "Time (hours) necessary for Gibb's sampling :0.64175"
###############################
### Some plots and analysis ####
#################################
#### Generating some plots and results ON Training data ###
logrank <- survdiff(smod ~ c.final)
pval<-1 - pchisq(unlist(logrank)$chisq,df =3)
surv.fit <- survfit(smod ~ c.final)
p4 <- ggsurv(surv.fit,surv.col=c("#b16ed3","#d37f6e","#90d36e","#6ec2d3"),plot.cens=FALSE,main = " DPMM \n Kaplan Meier Estimators \n Training Set")+ ggplot2::guides(linetype = FALSE)+labs(subtitle = paste("P-value:",toString(round(pval,digits = 10))))
p4
pdf(file="OncogenicGeneSets+PAFT_files/KMC_Training.pdf")
p4
dev.off()
## quartz_off_screen
## 2
############ Generating some Plots ##########################
Y<-Y.pre.train[,signature.sbc]
pc <- prcomp(Y)
pc.pred <- predict(pc,newdata = Y)
p5 <- ggplot(as.data.frame(pc.pred), aes(x=pc.pred[,1], y= pc.pred[,2], colour= as.factor(c.final))) + ggtitle("PCA after SBC Clustering \n Training Set") + geom_point(shape=19) + labs(y = "PC1", x = "PC2", colour = "Classes")+scale_color_manual(values=c("#b16ed3","#d37f6e","#90d36e","#6ec2d3","#C70039"))
p5
pdf(file="OncogenicGeneSets+PAFT_files/PCA_Training.pdf")
p5
dev.off()
## quartz_off_screen
## 2
to_plot<-as.data.frame(likli.burnin)
p7<-qplot(seq_along(to_plot$likli.burnin), to_plot$likli.burnin)
p7
pdf(file="OncogenicGeneSets+PAFT_files/LikBurn.pdf")
p7
dev.off()
## quartz_off_screen
## 2
#Defining again Y and Y.new
Y <- Y.pre.train[,signature.sbc]
Y.new <- Y.pre.test[,signature.sbc]
Y<-scale(Y, scale=TRUE,center= TRUE)
Y.new<-scale(Y.new,scale=TRUE,center=TRUE)
#PCA
Y.big.unscaled<-rbind(Y,Y.new)
pc.unscaled <- prcomp(Y.big.unscaled,center=TRUE,scale = TRUE)
labels<-c.sbc
for (i in 1:dim(Y.new)[1]){labels<-append(labels,5)}
p8 <- ggplot(as.data.frame(pc.unscaled$x), aes(x=pc.unscaled$x[,1], y= pc.unscaled$x[,2], colour= as.factor(labels))) + ggtitle("PCA \n Training set with SBC labels and unlabeled test set") + geom_point(shape=19) + labs(y = "PC1", x = "PC2", colour = "Classes")+scale_color_manual(values=c("#b16ed3","#d37f6e","#90d36e","#6ec2d3","#BDBDBF"))
p8
######## Predict on New Data Set BASED ON JUST THE MOLECULAR DATA #####################################
smod.new <- Surv(time.new, censoring.new)
source('predictCLASS.R')
#What does this function do?: Outputs a matrix that has dimension number of test samples * number of MCMC samples
#The output matrix is c.matrix.new
predictCLASS(Y.new)
## Check the predicted Rand Index
#### Choose that configuration which has the highest difference in survival curves ####################
lr <- c(0)
for (j in 1:Nps){
lr[j] <- 1 - pchisq(unlist(survdiff(smod.new ~ c.matrix.new[,j]))$chisq,df=length(table(c.matrix.new[,j]))-1)
}
significant<-c()
for (i in 1:Nps){
if(lr[i]<0.05 & length(table(c.matrix.new[,i]))==4){
significant<-append(significant,i)
}
}
if (length(significant)>0){
index<-significant[which.min(lr[significant])]
c.sbc.new <- c.matrix.new[,index]
}else{
print("There was no MCMC iteration that found 4 clusters and was statistically significant (p_value=0.05)")
c.sbc.new <- c.matrix.new[,which.min(lr)]
}
###############################################################################
################################# Other ways of obtaining predictions #########
## If we fit cluster-specific models ###########################################
pre.sbc <- c(0)
for (q in unique(c.sbc.new)){
ind <- which(c.sbc == q)
ind.new <- which(c.sbc.new == q)
time.tmp <- time[ind]
time.tmp.new<- time.new[ind.new]
Y.tmp <- Y[ind,]
Y.tmp.new <- Y.new[ind.new,]
reg <- cv.glmnet(x = Y.tmp, y = time.tmp, family = "gaussian")
if (class(Y.tmp.new)=="numeric"){
pre.sbc[ind.new] <- predict(object = reg, newx = t(matrix(Y.tmp.new)), s = "lambda.min")
}else{
pre.sbc[ind.new] <- predict(object = reg, newx = Y.tmp.new, s = "lambda.min")
}
}
predCIndex.sbc.aft <<- as.numeric(concordance(smod.new ~ exp(pre.sbc))[1])
#### Use the adhoc-prediction algorithm ####################
source('predictADHOCverhaak.R')
source('predictTIME.R')
predictchineseAFTtime(Y.new)
logrank.new <- survdiff(smod.new ~ c.sbc.new)
#df= Degrees of freedom should be number of clusters-1
pval.new<-1 - pchisq(unlist(logrank.new)$chisq,df =length(logrank.new$n)-1)
surv.fit <- survfit(smod.new ~ c.sbc.new)
p5.new <- ggsurv(surv.fit, surv.col=c("#b16ed3","#d37f6e","#90d36e","#6ec2d3"),plot.cens=FALSE,main = " DPMM \n Kaplan Meier Estimators \n Validation Set")+ ggplot2::guides(linetype = FALSE)+labs(subtitle = paste("P-value:",toString(round(pval.new,digits = 5))))
par(mfrow=c(2,1))
p4
p5.new
pdf(file="OncogenicGeneSets+PAFT_files/KMC_Test.pdf")
p5.new
dev.off()
## quartz_off_screen
## 2
pc <- prcomp(Y.new,center=TRUE,scale = TRUE)
pc.pred <- predict(pc,newdata = Y.new)
p4.new <- ggplot(as.data.frame(pc.pred), aes(x=pc.pred[,1], y= pc.pred[,2], colour= as.factor(c.sbc.new))) + ggtitle(" PCA after SBC Clustering \n Validation set") + geom_point(shape=19) + labs(y = "PC1", x = "PC2", colour = "Classes") +scale_color_manual(values=c("#b16ed3","#d37f6e","#90d36e","#6ec2d3"))
p4.new
pdf(file="OncogenicGeneSets+PAFT_files/PCA_Test.pdf")
p4.new
dev.off()
## quartz_off_screen
## 2
pdf(file="OncogenicGeneSets+PAFT_files/PosteriorProbability.pdf")
heatmapdata <- as.data.frame(final.betahat)
heatmap.2(t(as.matrix(heatmapdata)),dendrogram="none", margins=c(6,10), main = "Posterior probability \n for Selection \n ", cexCol = 0.85, cexRow = 0.7, Rowv = FALSE)
dev.off()
## quartz_off_screen
## 2
table(c.sbc) %>%
kable(caption = "Clusters for the training set") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = T, position = "center")
c.sbc | Freq |
---|---|
1 | 105 |
2 | 3 |
3 | 28 |
4 | 24 |
table(c.sbc.new) %>%
kable(caption = "Clusters for the testing set") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = T, position = "center")
c.sbc.new | Freq |
---|---|
1 | 157 |
2 | 2 |
3 | 59 |
4 | 43 |
Methods<-c("Flexmix","K-Means + Penalized AFT","K-Means + Penalized Cox","AFT","Cox","SBC")
recovCIndex<-round(c(recovCIndex.flx,recovCIndex.km.paft,recovCIndex.km.pcox,recovCIndex.na.aft,recovCIndex.na.cox,max(recovCIndex.sbc)),digits = 5)
predCIndex<-c(predCIndex.flx,"NA",predCIndex.kk.pcox,predCIndex.na.aft,predCIndex.na.cox,max(predCIndex.sbc))
sbc_and_comparisons<-as.data.frame(Methods)
sbc_and_comparisons$recovCIndex<-recovCIndex
sbc_and_comparisons$predCIndex<-predCIndex
sbc_and_comparisons %>%
kable(caption = "SBC vs other methods") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),full_width = T, position = "center")
Methods | recovCIndex | predCIndex |
---|---|---|
Flexmix | 0.98398 | 0.877180739706909 |
K-Means + Penalized AFT | 0.98612 | NA |
K-Means + Penalized Cox | 0.93345 | 0.882910346347376 |
AFT | 0.98374 | 0.873985382157417 |
Cox | 0.98414 | 0.874462849377456 |
SBC | 0.95888 | 0.528317478973078 |