###################################################### #################Importing libraries################## ###################################################### import time import sys import os import csv from scipy import stats import numpy import math ###################################################### #############Parameters and structures################ ###################################################### start_time = time.time() file2 = open("GDI_cutoffs_output.txt", 'w') CADD_dic = {} count_dic = {} file1 = open("PID_genes_AR_GDI.txt", 'r') controls_temp_gene_dic = {} #storing gene damage for a single patient temp_count = 0 #print "\nExtracting from controls\n" for line1 in file1: #go through all patients files line11 = line1.strip() gene = 'AA' CADD = float(line11) N_alleles = 1 if (gene not in CADD_dic): temp_list = [] temp_list.append(CADD) temp_list = temp_list * N_alleles CADD_dic[gene] = temp_list count_dic[gene] = 1 else: temp_list = [] temp_list.append(CADD) temp_list = temp_list * N_alleles CADD_dic[gene].extend(temp_list) count_dic[gene] = count_dic[gene] + 1 temp_count = temp_count + 1 file1.close() ###################################################### #################Getting patients data################ ###################################################### header = "N_values\tMean\tMedian\t95CI_lower\t95CI_upper\tMin\tMax\n" file2.write(header) for gene in CADD_dic: #Applying full mutational information for all genes with variants in patients try: N_array = len(CADD_dic[gene]) n, min_max, mean, var, skew, kurt = stats.describe(CADD_dic[gene]) median = numpy.median(CADD_dic[gene]) std=math.sqrt(var) if (N_array < 10 and N_array > 1): interval95_down = stats.norm.interval(0.95,loc=mean,scale=std/math.sqrt(N_array))[0] interval95_up = stats.norm.interval(0.95,loc=mean,scale=std/math.sqrt(N_array))[1] else: interval95_down = stats.norm.interval(0.95,loc=mean,scale=std)[0] interval95_up = stats.norm.interval(0.95,loc=mean,scale=std)[1] min_CADD = min(CADD_dic[gene]) max_CADD = max(CADD_dic[gene]) if (N_array == 1): mean = float(CADD_dic[gene][0]) median = float(CADD_dic[gene][0]) interval95_down = mean min_CADD = interval95_down interval95_up = mean max_CADD = interval95_up except: #print CADD_dic[gene] mean = float(CADD_dic[gene][0]) median = float(CADD_dic[gene][0]) interval95_down = mean interval95_up = mean min_CADD = interval95_down max_CADD = interval95_up print "Proposed GDI cutoff by upper boundary of 95% CI:\t" + str(interval95_up) #print interval95_down lst_sorted = sorted(CADD_dic[gene]) temp_index = int(round(float(len(CADD_dic[gene]))/100.0)) #print lst_sorted[temp_index] to_write = str(count_dic[gene]) + "\t" + str(mean) + "\t" + str(median) + "\t" + str(interval95_down) + "\t" + str(interval95_up) + "\t" + str(min_CADD) + "\t" + str(max_CADD) + "\n" file2.write(to_write) file2.close()