######################################################################## ######################################################################## #The mutation significance cutoff (MSC) project. #A python program for generating a 99% CADD confidence interval per gene #following a normal distribution. #For questions please contact Yuval Itan: yitan@rockefeller.edu ######################################################################## ######################################################################## ###################################################### #################Importing libraries################## ###################################################### import time import sys import os import csv from scipy import stats import numpy import math #################################################################### ################Parameters, structures and functions################ #################################################################### start_time = time.time() file2 = open("CI99_CADD_normal.txt", 'w') #output file name CADD_dic = {} #dictionary of CADD scores per gene count_dic = {} #dictionary of number of mutations in a gene allele_dic = {} #dictionary of number of alleles in a gene convert_dic = {} #dictionary of raw-Phred CADD scores convertion file1 = open("HGMD_CADD_full.txt", 'r') #Input file, tab separated. For each line 1st column for gene name and 2nd for CADD score, and potentially 3rd for number of alleles. temp_count = 0 print "\nExtracting gene scores\n" for line1 in file1: #Going through gene lines line11 = line1.strip() line9 = line11.split("\t") temp1 = line9[0].split('(') temp2 = temp1[0].split(',') temp3 = temp2[0].split(';') gene = temp3[0] #Final gene name CADD = float(line9[1]) #CADD score for mutation N_alleles = 1 #N_alleles = int(line9[2]) #Remove '#' to take into account number of alleles ###Constructing dictionaries### if (gene not in CADD_dic): temp_list = [] temp_list.append(CADD) temp_list = temp_list * N_alleles CADD_dic[gene] = temp_list count_dic[gene] = 1 allele_dic[gene] = N_alleles else: temp_list = [] temp_list.append(CADD) temp_list = temp_list * N_alleles CADD_dic[gene].extend(temp_list) count_dic[gene] = count_dic[gene] + 1 allele_dic[gene] = allele_dic[gene] + N_alleles temp_count = temp_count + 1 file1.close() ###Converting CADD raw to Phred scores### file1 = open("conversion_table_range_v1.3.txt", 'r') for line1 in file1: #go through all patients files line11 = line1.strip() raw,phred = line11.split("\t") phred = ("%.3f" % round(float(phred),3)) convert_dic[raw] = phred def convert(raw): raw = float(("%.3f" % round(float(raw),3))) if (raw < -7.53): raw = -7.53 if (raw > 98): raw = 98 for item in convert_dic: min_raw, max_raw = item.split(":") min_raw = float(min_raw) max_raw = float(max_raw) if (raw >= min_raw and raw < max_raw): phred = ("%.3f" % round(float(convert_dic[item]),3)) return phred return "NA" ############################################################### ################Generating confidence intervals################ ############################################################### counter = 0 header = "Gene\tN_mutations\tMean\tMedian\t99CI_lower\t99CI_upper\tMin\tMax\n" file2.write(header) for gene in CADD_dic: #Applying full mutational information for all genes with variants in patients try: if (count_dic[gene] > 1): #Estimate only for more than one mutation per gene counter = counter + 1 print counter N_array = len(CADD_dic[gene]) N_alleles = allele_dic[gene] median = numpy.median(CADD_dic[gene]) mean = numpy.mean(CADD_dic[gene]) shape = stats.norm.fit(CADD_dic[gene]) #Parameters fitting for normal distribution loc2 = shape[0] std2 = shape[1] if (N_array < 10): #Generating normal distribution CI for a small sample interval99_down = stats.norm.interval(0.99,loc=loc2,scale=std2/math.sqrt(N_array))[0] interval99_up = stats.norm.interval(0.99,loc=loc2,scale=std2/math.sqrt(N_array))[1] else: #Generating normal distribution CI for a regular sample interval99_down = stats.norm.interval(alpha=0.99,loc=loc2,scale=std2)[0] interval99_up = stats.norm.interval(alpha=0.99,loc=loc2,scale=std2)[1] if (interval99_down < -1.91): interval99_down = -1.91 min_CADD = convert(min(CADD_dic[gene])) #Converting CADD raw to Phred max_CADD = convert(max(CADD_dic[gene])) #Converting CADD raw to Phred interval99_down = convert(interval99_down) #Converting CADD raw to Phred interval99_up = convert(interval99_up) #Converting CADD raw to Phred mean = convert(mean) #Converting CADD raw to Phred median = convert(median) #Converting CADD raw to Phred to_write = gene + "\t" + str(count_dic[gene]) + "\t" + str(mean) + "\t" + str(median) + "\t" + str(interval99_down) + "\t" + str(interval99_up) + "\t" + str(min_CADD) + "\t" + str(max_CADD) + "\n" file2.write(to_write) #Writing results to output file except: 0 file2.close()