comparison CloudMap_InSilico.py @ 0:dff952ff16d9 draft

Uploaded
author gregory-minevich
date Thu, 01 Nov 2012 19:25:47 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dff952ff16d9
1 #!/usr/bin/python
2
3 import sys
4 import optparse
5 import csv
6 import operator
7 import pprint
8
9 def main():
10 parser = optparse.OptionParser()
11 parser.add_option("-i", dest = "input_files", action = "callback", callback = split_input_files)
12 parser.add_option("-n", dest = "sample_names", action = "callback", callback = split_sample_names)
13 parser.add_option("-s", dest = "summary_output_file", action = "store", type = "string")
14 parser.add_option("-o", dest = "data_output_file", action = "store", type = "string")
15 (options, args) = parser.parse_args()
16
17 #get the counts
18 gene_count = {}
19 for input_file in options.input_files:
20 gene_count = summarize_counts(input_file = input_file, gene_count = gene_count)
21
22 #delete the single counts
23 for k, v in gene_count.items():
24 if v == 1:
25 del gene_count[k]
26
27 # we do two passes because we don't know the order yet and don't want to hold onto everything in memory
28
29 file_sample_dictionary = create_file_sample_dictionary(input_files = options.input_files, sample_names = options.sample_names)
30
31 #get snpeff lines
32 gene_data = {}
33 all_headers = []
34 for input_file in options.input_files:
35 headers = []
36 headers, gene_data = grab_snpeff_data(input_file = input_file, gene_data = gene_data, gene_count = gene_count, file_sample_dictionary = file_sample_dictionary)
37
38 if len(all_headers) == 0 :
39 all_headers.extend(headers)
40
41 sorted_gene_count_list = sorted(gene_count.iteritems(), key = operator.itemgetter(1))
42 sorted_gene_count_list.reverse()
43
44 print_summary(gene_count_list = sorted_gene_count_list, summary_output_file = options.summary_output_file)
45 print_sorted_snpeff_data(gene_data = gene_data, gene_count_list = sorted_gene_count_list, data_output_file = options.data_output_file, all_headers = all_headers)
46
47 def create_file_sample_dictionary(input_files = None, sample_names = None):
48 file_sample_dictionary = {}
49
50 for i in range(0, len(input_files)):
51 if i < len(sample_names):
52 file_sample_dictionary[input_files[i]] = sample_names[i]
53 else:
54 file_sample_dictionary[input_files[i]] = ""
55
56 return file_sample_dictionary
57
58 def split_input_files(option, opt_str, value, parser):
59 input_files = []
60 collect = True
61 for i in range(0, len(parser.rargs)):
62 if "-s" == parser.rargs[i] or "-o" == parser.rargs[i] or "-n" == parser.rargs[i]:
63 collect = False
64 elif collect == True:
65 input_files.append(parser.rargs[i])
66
67 setattr(parser.values, option.dest, input_files)
68
69 def split_sample_names(option, opt_str, value, parser):
70 input_files = []
71 collect = True
72 for i in range(0, len(parser.rargs)):
73 if "-s" == parser.rargs[i] or "-o" == parser.rargs[i] or "-i" == parser.rargs[i]:
74 collect = False
75 elif collect == True:
76 input_files.append(parser.rargs[i])
77
78 setattr(parser.values, option.dest, input_files)
79
80
81 def print_summary(gene_count_list = None, summary_output_file = ""):
82 f = open(summary_output_file, 'w')
83 f.write('\n'.join('%s\t%s' % g for g in gene_count_list))
84 f.close()
85
86 def print_sorted_snpeff_data(gene_data = {}, gene_count_list = None, data_output_file = "", file_sample_dictionary = None, all_headers = None):
87 f = open(data_output_file, 'w')
88
89 for header in all_headers:
90 f.write('Sample Name')
91 f.write('\t')
92 f.write(('\t').join(header))
93 f.write('\n')
94
95 for i in range(0, len(gene_count_list)):
96 gene_id = gene_count_list[i][0]
97 f.write(gene_data[gene_id] + "\n")
98
99 f.close()
100
101 def grab_snpeff_data(input_file = "", gene_data = {}, gene_count = {}, file_sample_dictionary = None):
102 i_file = open(input_file, 'rU')
103 reader = csv.reader(i_file, delimiter = '\t')
104 headers = skip_and_collect_headers(reader = reader, i_file = i_file)
105
106 sample_name = file_sample_dictionary[input_file]
107
108 for row in reader:
109 gene_id = row[9].upper()
110 if gene_id in gene_count:
111 if gene_id in gene_data:
112 gene_data[gene_id] = gene_data[gene_id] + '\n' + sample_name + '\t' + '\t'.join(row).rstrip()
113 else:
114 gene_data[gene_id] = sample_name + '\t' + '\t'.join(row).rstrip()
115
116 return headers, gene_data
117
118 def summarize_counts(input_file= "", gene_count = {}):
119 i_file = open(input_file, 'rU')
120 reader = csv.reader(i_file, delimiter = '\t')
121 skip_and_collect_headers(reader = reader, i_file = i_file)
122
123 counted_genes = []
124
125 for row in reader:
126 gene_id = row[9].upper()
127 if gene_id not in counted_genes:
128 if gene_id in gene_count and gene_id not in counted_genes:
129 gene_count[gene_id] += 1
130 else:
131 gene_count[gene_id] = 1
132
133 counted_genes.append(gene_id)
134
135 return gene_count
136
137 def skip_and_collect_headers(reader = None, i_file = None):
138 headers = []
139
140 # count headers
141 comment = 0
142 while True:
143 row = reader.next()
144 if len(row) == 0:
145 comment = comment + 1
146 elif row[0].startswith('#'):
147 comment = comment + 1
148 if comment == 3:
149 headers.append(row)
150 else:
151 break
152
153 # skip headers
154 i_file.seek(0)
155 for i in range(0, comment):
156 reader.next()
157
158 return headers
159
160 if __name__ == "__main__":
161 main()