########################################################################
########################################################################
#The mutation significance cutoff (MSC) project.
#R program for ROC curve comparisons between fixed and MSC-based CADD
#predictions.
#Have in the working directory the input files listed below, containing
#predicted values for the FP and TP sets (controls and cases; values of
#0 and 1 are used in this study to reflect a binary prediction).
#For questions please contact Yuval Itan: yitan@rockefeller.edu
########################################################################
########################################################################

library(pROC)

aa11<-read.table("TP_CADD10.txt") #True positive predictions with a fixed CADD=10 cutoff
CADD_10_cases<-matrix(as.numeric(aa11[,1]),nrow=nrow(aa11))
aa12<-read.table("FP_CADD10.txt") #False positive predictions with a fixed CADD=10 cutoff
CADD_10_controls<-matrix(as.numeric(aa12[,1]),nrow=nrow(aa12))

aa21<-read.table("TP_CADD15.txt") #True positive predictions with a fixed CADD=15 cutoff
CADD_15_cases<-matrix(as.numeric(aa21[,1]),nrow=nrow(aa21))
aa22<-read.table("FP_CADD15.txt") #False positive predictions with a fixed CADD=15 cutoff
CADD_15_controls<-matrix(as.numeric(aa22[,1]),nrow=nrow(aa22))

aa31<-read.table("TP_CADD20.txt") #True positive predictions with a fixed CADD=20 cutoff
CADD_20_cases<-matrix(as.numeric(aa31[,1]),nrow=nrow(aa31))
aa32<-read.table("FP_CADD20.txt") #False positive predictions with a fixed CADD=20 cutoff
CADD_20_controls<-matrix(as.numeric(aa32[,1]),nrow=nrow(aa32))

aa41<-read.table("TP_MSC-CADD90.txt") #True positive predictions with a CADD-based MSC generated with a 90% CI
CADD_MSC_90_cases<-matrix(as.numeric(aa41[,1]),nrow=nrow(aa41))
aa42<-read.table("FP_MSC-CADD90.txt") #False positive predictions with a CADD-based MSC generated with a 90% CI
CADD_MSC_90_controls<-matrix(as.numeric(aa42[,1]),nrow=nrow(aa42))

aa51<-read.table("TP_MSC-CADD95.txt") #True positive predictions with a CADD-based MSC generated with a 95% CI
CADD_MSC_95_cases<-matrix(as.numeric(aa51[,1]),nrow=nrow(aa51))
aa52<-read.table("FP_MSC-CADD95.txt") #False positive predictions with a CADD-based MSC generated with a 95% CI
CADD_MSC_95_controls<-matrix(as.numeric(aa52[,1]),nrow=nrow(aa52))

aa61<-read.table("TP_MSC-CADD99.txt") #True positive predictions with a CADD-based MSC generated with a 99% CI
CADD_MSC_99_cases<-matrix(as.numeric(aa61[,1]),nrow=nrow(aa61))
aa62<-read.table("FP_MSC-CADD99.txt") #False positive predictions with a CADD-based MSC generated with a 99% CI
CADD_MSC_99_controls<-matrix(as.numeric(aa62[,1]),nrow=nrow(aa62))
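
###Optional sanity check (an added sketch, not part of the original pipeline):
###each input file is expected to hold a single column of binary 0/1
###predictions, so every matrix built above should contain only those two
###values. Remove '#' to activate.
#for (m in list(CADD_10_cases, CADD_10_controls, CADD_15_cases, CADD_15_controls,
#               CADD_20_cases, CADD_20_controls, CADD_MSC_90_cases, CADD_MSC_90_controls,
#               CADD_MSC_95_cases, CADD_MSC_95_controls, CADD_MSC_99_cases, CADD_MSC_99_controls)) {
#  stopifnot(all(m %in% c(0, 1)))
#}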

pdf("ROC_CADD.pdf", width = 8, height = 8) #Figure file

#For each of the six prediction sets below: generate the ROC curve from the
#FP (controls) and TP (cases) predictions (percent=TRUE reports sensitivities,
#specificities and AUCs in percent), smooth it (needed because the predictions
#are binary; can potentially be skipped), plot it, and print the area under
#the curve (AUC).
CADD_MSC_90_roc = roc(controls=CADD_MSC_90_controls, cases=CADD_MSC_90_cases, percent=TRUE) #MSC-CADD, 90% CI
test4 = smooth(CADD_MSC_90_roc, method = "fitdistr")
rocobj4 <- plot(test4, col="darkgreen") #First curve opens the plot
auc(test4)

CADD_MSC_95_roc = roc(controls=CADD_MSC_95_controls, cases=CADD_MSC_95_cases, percent=TRUE) #MSC-CADD, 95% CI
test5 = smooth(CADD_MSC_95_roc, method = "fitdistr")
rocobj5 <- lines(test5, col="grey50") #Subsequent curves are added with lines()
auc(test5)

CADD_MSC_99_roc = roc(controls=CADD_MSC_99_controls, cases=CADD_MSC_99_cases, percent=TRUE) #MSC-CADD, 99% CI
test6 = smooth(CADD_MSC_99_roc, method = "fitdistr")
rocobj6 <- lines(test6, col="dodgerblue2")
auc(test6)

CADD_20_roc = roc(controls=CADD_20_controls, cases=CADD_20_cases, percent=TRUE) #Fixed CADD=20 cutoff
test3 = smooth(CADD_20_roc, method = "fitdistr")
rocobj3 <- lines(test3, col="tan2")
auc(test3)

CADD_15_roc = roc(controls=CADD_15_controls, cases=CADD_15_cases, percent=TRUE) #Fixed CADD=15 cutoff
test2 = smooth(CADD_15_roc, method = "fitdistr")
rocobj2 <- lines(test2, col="tomato4")
auc(test2)

CADD_10_roc = roc(controls=CADD_10_controls, cases=CADD_10_cases, percent=TRUE) #Fixed CADD=10 cutoff
test1 = smooth(CADD_10_roc, method = "fitdistr")
rocobj1 <- lines(test1, col="red")
auc(test1)

###Bootstrapping simulations for p-values between pairs of curves; remove '#' to activate.
#testobj <- roc.test(test1, test2, method="bootstrap", boot.n=1000)
#testobj$p.value
#testobj <- roc.test(test1, test3, method="bootstrap", boot.n=1000)
#testobj$p.value
#testobj <- roc.test(test1, test4, method="bootstrap", boot.n=1000)
#testobj$p.value
#testobj <- roc.test(test1, test5, method="bootstrap", boot.n=1000)
#testobj$p.value
#testobj <- roc.test(test1, test6, method="bootstrap", boot.n=1000)
#testobj$p.value

legend("bottomright", legend=c("MSC-CADD 90% CI", "MSC-CADD 95% CI", "MSC-CADD 99% CI", "CADD=20", "CADD=15", "CADD=10"),
       col=c("darkgreen", "grey50", "dodgerblue2", "tan2", "tomato4", "red"), lwd=3, cex=1.5) #Plotting legend

dev.off()
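
###Optional AUC confidence intervals (an added sketch, not part of the original
###pipeline): pROC's ci.auc() computes a DeLong 95% CI around the AUC of an
###unsmoothed ROC curve, which can accompany the AUC values printed above.
###Remove '#' to activate.
#ci.auc(CADD_MSC_90_roc)
#ci.auc(CADD_MSC_95_roc)
#ci.auc(CADD_MSC_99_roc)
#ci.auc(CADD_20_roc)
#ci.auc(CADD_15_roc)
#ci.auc(CADD_10_roc)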