annotate CloudMap_InSilico.py @ 0:dff952ff16d9 draft

Uploaded
author gregory-minevich
date Thu, 01 Nov 2012 19:25:47 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
1 #!/usr/bin/python
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
2
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
3 import sys
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
4 import optparse
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
5 import csv
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
6 import operator
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
7 import pprint
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
8
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
9 def main():
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
10 parser = optparse.OptionParser()
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
11 parser.add_option("-i", dest = "input_files", action = "callback", callback = split_input_files)
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
12 parser.add_option("-n", dest = "sample_names", action = "callback", callback = split_sample_names)
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
13 parser.add_option("-s", dest = "summary_output_file", action = "store", type = "string")
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
14 parser.add_option("-o", dest = "data_output_file", action = "store", type = "string")
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
15 (options, args) = parser.parse_args()
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
16
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
17 #get the counts
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
18 gene_count = {}
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
19 for input_file in options.input_files:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
20 gene_count = summarize_counts(input_file = input_file, gene_count = gene_count)
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
21
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
22 #delete the single counts
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
23 for k, v in gene_count.items():
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
24 if v == 1:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
25 del gene_count[k]
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
26
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
27 # we do two passes because we don't know the order yet and don't want to hold onto everything in memory
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
28
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
29 file_sample_dictionary = create_file_sample_dictionary(input_files = options.input_files, sample_names = options.sample_names)
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
30
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
31 #get snpeff lines
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
32 gene_data = {}
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
33 all_headers = []
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
34 for input_file in options.input_files:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
35 headers = []
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
36 headers, gene_data = grab_snpeff_data(input_file = input_file, gene_data = gene_data, gene_count = gene_count, file_sample_dictionary = file_sample_dictionary)
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
37
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
38 if len(all_headers) == 0 :
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
39 all_headers.extend(headers)
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
40
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
41 sorted_gene_count_list = sorted(gene_count.iteritems(), key = operator.itemgetter(1))
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
42 sorted_gene_count_list.reverse()
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
43
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
44 print_summary(gene_count_list = sorted_gene_count_list, summary_output_file = options.summary_output_file)
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
45 print_sorted_snpeff_data(gene_data = gene_data, gene_count_list = sorted_gene_count_list, data_output_file = options.data_output_file, all_headers = all_headers)
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
46
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
47 def create_file_sample_dictionary(input_files = None, sample_names = None):
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
48 file_sample_dictionary = {}
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
49
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
50 for i in range(0, len(input_files)):
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
51 if i < len(sample_names):
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
52 file_sample_dictionary[input_files[i]] = sample_names[i]
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
53 else:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
54 file_sample_dictionary[input_files[i]] = ""
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
55
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
56 return file_sample_dictionary
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
57
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
58 def split_input_files(option, opt_str, value, parser):
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
59 input_files = []
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
60 collect = True
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
61 for i in range(0, len(parser.rargs)):
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
62 if "-s" == parser.rargs[i] or "-o" == parser.rargs[i] or "-n" == parser.rargs[i]:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
63 collect = False
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
64 elif collect == True:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
65 input_files.append(parser.rargs[i])
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
66
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
67 setattr(parser.values, option.dest, input_files)
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
68
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
69 def split_sample_names(option, opt_str, value, parser):
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
70 input_files = []
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
71 collect = True
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
72 for i in range(0, len(parser.rargs)):
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
73 if "-s" == parser.rargs[i] or "-o" == parser.rargs[i] or "-i" == parser.rargs[i]:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
74 collect = False
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
75 elif collect == True:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
76 input_files.append(parser.rargs[i])
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
77
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
78 setattr(parser.values, option.dest, input_files)
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
79
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
80
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
81 def print_summary(gene_count_list = None, summary_output_file = ""):
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
82 f = open(summary_output_file, 'w')
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
83 f.write('\n'.join('%s\t%s' % g for g in gene_count_list))
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
84 f.close()
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
85
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
86 def print_sorted_snpeff_data(gene_data = {}, gene_count_list = None, data_output_file = "", file_sample_dictionary = None, all_headers = None):
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
87 f = open(data_output_file, 'w')
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
88
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
89 for header in all_headers:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
90 f.write('Sample Name')
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
91 f.write('\t')
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
92 f.write(('\t').join(header))
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
93 f.write('\n')
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
94
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
95 for i in range(0, len(gene_count_list)):
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
96 gene_id = gene_count_list[i][0]
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
97 f.write(gene_data[gene_id] + "\n")
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
98
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
99 f.close()
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
100
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
101 def grab_snpeff_data(input_file = "", gene_data = {}, gene_count = {}, file_sample_dictionary = None):
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
102 i_file = open(input_file, 'rU')
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
103 reader = csv.reader(i_file, delimiter = '\t')
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
104 headers = skip_and_collect_headers(reader = reader, i_file = i_file)
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
105
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
106 sample_name = file_sample_dictionary[input_file]
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
107
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
108 for row in reader:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
109 gene_id = row[9].upper()
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
110 if gene_id in gene_count:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
111 if gene_id in gene_data:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
112 gene_data[gene_id] = gene_data[gene_id] + '\n' + sample_name + '\t' + '\t'.join(row).rstrip()
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
113 else:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
114 gene_data[gene_id] = sample_name + '\t' + '\t'.join(row).rstrip()
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
115
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
116 return headers, gene_data
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
117
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
118 def summarize_counts(input_file= "", gene_count = {}):
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
119 i_file = open(input_file, 'rU')
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
120 reader = csv.reader(i_file, delimiter = '\t')
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
121 skip_and_collect_headers(reader = reader, i_file = i_file)
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
122
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
123 counted_genes = []
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
124
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
125 for row in reader:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
126 gene_id = row[9].upper()
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
127 if gene_id not in counted_genes:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
128 if gene_id in gene_count and gene_id not in counted_genes:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
129 gene_count[gene_id] += 1
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
130 else:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
131 gene_count[gene_id] = 1
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
132
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
133 counted_genes.append(gene_id)
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
134
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
135 return gene_count
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
136
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
137 def skip_and_collect_headers(reader = None, i_file = None):
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
138 headers = []
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
139
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
140 # count headers
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
141 comment = 0
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
142 while True:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
143 row = reader.next()
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
144 if len(row) == 0:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
145 comment = comment + 1
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
146 elif row[0].startswith('#'):
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
147 comment = comment + 1
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
148 if comment == 3:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
149 headers.append(row)
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
150 else:
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
151 break
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
152
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
153 # skip headers
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
154 i_file.seek(0)
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
155 for i in range(0, comment):
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
156 reader.next()
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
157
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
158 return headers
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
159
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
160 if __name__ == "__main__":
dff952ff16d9 Uploaded
gregory-minevich
parents:
diff changeset
161 main()