Mercurial > repos > gregory-minevich > cloudmap_in_silico_complementation
comparison CloudMap_InSilico.py @ 0:dff952ff16d9 draft
Uploaded
author | gregory-minevich |
---|---|
date | Thu, 01 Nov 2012 19:25:47 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dff952ff16d9 |
---|---|
1 #!/usr/bin/python | |
2 | |
3 import sys | |
4 import optparse | |
5 import csv | |
6 import operator | |
7 import pprint | |
8 | |
9 def main(): | |
10 parser = optparse.OptionParser() | |
11 parser.add_option("-i", dest = "input_files", action = "callback", callback = split_input_files) | |
12 parser.add_option("-n", dest = "sample_names", action = "callback", callback = split_sample_names) | |
13 parser.add_option("-s", dest = "summary_output_file", action = "store", type = "string") | |
14 parser.add_option("-o", dest = "data_output_file", action = "store", type = "string") | |
15 (options, args) = parser.parse_args() | |
16 | |
17 #get the counts | |
18 gene_count = {} | |
19 for input_file in options.input_files: | |
20 gene_count = summarize_counts(input_file = input_file, gene_count = gene_count) | |
21 | |
22 #delete the single counts | |
23 for k, v in gene_count.items(): | |
24 if v == 1: | |
25 del gene_count[k] | |
26 | |
27 # we do two passes because we don't know the order yet and don't want to hold onto everything in memory | |
28 | |
29 file_sample_dictionary = create_file_sample_dictionary(input_files = options.input_files, sample_names = options.sample_names) | |
30 | |
31 #get snpeff lines | |
32 gene_data = {} | |
33 all_headers = [] | |
34 for input_file in options.input_files: | |
35 headers = [] | |
36 headers, gene_data = grab_snpeff_data(input_file = input_file, gene_data = gene_data, gene_count = gene_count, file_sample_dictionary = file_sample_dictionary) | |
37 | |
38 if len(all_headers) == 0 : | |
39 all_headers.extend(headers) | |
40 | |
41 sorted_gene_count_list = sorted(gene_count.iteritems(), key = operator.itemgetter(1)) | |
42 sorted_gene_count_list.reverse() | |
43 | |
44 print_summary(gene_count_list = sorted_gene_count_list, summary_output_file = options.summary_output_file) | |
45 print_sorted_snpeff_data(gene_data = gene_data, gene_count_list = sorted_gene_count_list, data_output_file = options.data_output_file, all_headers = all_headers) | |
46 | |
47 def create_file_sample_dictionary(input_files = None, sample_names = None): | |
48 file_sample_dictionary = {} | |
49 | |
50 for i in range(0, len(input_files)): | |
51 if i < len(sample_names): | |
52 file_sample_dictionary[input_files[i]] = sample_names[i] | |
53 else: | |
54 file_sample_dictionary[input_files[i]] = "" | |
55 | |
56 return file_sample_dictionary | |
57 | |
58 def split_input_files(option, opt_str, value, parser): | |
59 input_files = [] | |
60 collect = True | |
61 for i in range(0, len(parser.rargs)): | |
62 if "-s" == parser.rargs[i] or "-o" == parser.rargs[i] or "-n" == parser.rargs[i]: | |
63 collect = False | |
64 elif collect == True: | |
65 input_files.append(parser.rargs[i]) | |
66 | |
67 setattr(parser.values, option.dest, input_files) | |
68 | |
69 def split_sample_names(option, opt_str, value, parser): | |
70 input_files = [] | |
71 collect = True | |
72 for i in range(0, len(parser.rargs)): | |
73 if "-s" == parser.rargs[i] or "-o" == parser.rargs[i] or "-i" == parser.rargs[i]: | |
74 collect = False | |
75 elif collect == True: | |
76 input_files.append(parser.rargs[i]) | |
77 | |
78 setattr(parser.values, option.dest, input_files) | |
79 | |
80 | |
81 def print_summary(gene_count_list = None, summary_output_file = ""): | |
82 f = open(summary_output_file, 'w') | |
83 f.write('\n'.join('%s\t%s' % g for g in gene_count_list)) | |
84 f.close() | |
85 | |
86 def print_sorted_snpeff_data(gene_data = {}, gene_count_list = None, data_output_file = "", file_sample_dictionary = None, all_headers = None): | |
87 f = open(data_output_file, 'w') | |
88 | |
89 for header in all_headers: | |
90 f.write('Sample Name') | |
91 f.write('\t') | |
92 f.write(('\t').join(header)) | |
93 f.write('\n') | |
94 | |
95 for i in range(0, len(gene_count_list)): | |
96 gene_id = gene_count_list[i][0] | |
97 f.write(gene_data[gene_id] + "\n") | |
98 | |
99 f.close() | |
100 | |
101 def grab_snpeff_data(input_file = "", gene_data = {}, gene_count = {}, file_sample_dictionary = None): | |
102 i_file = open(input_file, 'rU') | |
103 reader = csv.reader(i_file, delimiter = '\t') | |
104 headers = skip_and_collect_headers(reader = reader, i_file = i_file) | |
105 | |
106 sample_name = file_sample_dictionary[input_file] | |
107 | |
108 for row in reader: | |
109 gene_id = row[9].upper() | |
110 if gene_id in gene_count: | |
111 if gene_id in gene_data: | |
112 gene_data[gene_id] = gene_data[gene_id] + '\n' + sample_name + '\t' + '\t'.join(row).rstrip() | |
113 else: | |
114 gene_data[gene_id] = sample_name + '\t' + '\t'.join(row).rstrip() | |
115 | |
116 return headers, gene_data | |
117 | |
118 def summarize_counts(input_file= "", gene_count = {}): | |
119 i_file = open(input_file, 'rU') | |
120 reader = csv.reader(i_file, delimiter = '\t') | |
121 skip_and_collect_headers(reader = reader, i_file = i_file) | |
122 | |
123 counted_genes = [] | |
124 | |
125 for row in reader: | |
126 gene_id = row[9].upper() | |
127 if gene_id not in counted_genes: | |
128 if gene_id in gene_count and gene_id not in counted_genes: | |
129 gene_count[gene_id] += 1 | |
130 else: | |
131 gene_count[gene_id] = 1 | |
132 | |
133 counted_genes.append(gene_id) | |
134 | |
135 return gene_count | |
136 | |
137 def skip_and_collect_headers(reader = None, i_file = None): | |
138 headers = [] | |
139 | |
140 # count headers | |
141 comment = 0 | |
142 while True: | |
143 row = reader.next() | |
144 if len(row) == 0: | |
145 comment = comment + 1 | |
146 elif row[0].startswith('#'): | |
147 comment = comment + 1 | |
148 if comment == 3: | |
149 headers.append(row) | |
150 else: | |
151 break | |
152 | |
153 # skip headers | |
154 i_file.seek(0) | |
155 for i in range(0, comment): | |
156 reader.next() | |
157 | |
158 return headers | |
159 | |
160 if __name__ == "__main__": | |
161 main() |