0
|
1 #!/usr/bin/python
|
|
2
|
|
3 import sys
|
|
4 import optparse
|
|
5 import csv
|
|
6 import operator
|
|
7 import pprint
|
|
8
|
|
9 def main():
|
|
10 parser = optparse.OptionParser()
|
|
11 parser.add_option("-i", dest = "input_files", action = "callback", callback = split_input_files)
|
|
12 parser.add_option("-n", dest = "sample_names", action = "callback", callback = split_sample_names)
|
|
13 parser.add_option("-s", dest = "summary_output_file", action = "store", type = "string")
|
|
14 parser.add_option("-o", dest = "data_output_file", action = "store", type = "string")
|
|
15 (options, args) = parser.parse_args()
|
|
16
|
|
17 #get the counts
|
|
18 gene_count = {}
|
|
19 for input_file in options.input_files:
|
|
20 gene_count = summarize_counts(input_file = input_file, gene_count = gene_count)
|
|
21
|
|
22 #delete the single counts
|
|
23 for k, v in gene_count.items():
|
|
24 if v == 1:
|
|
25 del gene_count[k]
|
|
26
|
|
27 # we do two passes because we don't know the order yet and don't want to hold onto everything in memory
|
|
28
|
|
29 file_sample_dictionary = create_file_sample_dictionary(input_files = options.input_files, sample_names = options.sample_names)
|
|
30
|
|
31 #get snpeff lines
|
|
32 gene_data = {}
|
|
33 all_headers = []
|
|
34 for input_file in options.input_files:
|
|
35 headers = []
|
|
36 headers, gene_data = grab_snpeff_data(input_file = input_file, gene_data = gene_data, gene_count = gene_count, file_sample_dictionary = file_sample_dictionary)
|
|
37
|
|
38 if len(all_headers) == 0 :
|
|
39 all_headers.extend(headers)
|
|
40
|
|
41 sorted_gene_count_list = sorted(gene_count.iteritems(), key = operator.itemgetter(1))
|
|
42 sorted_gene_count_list.reverse()
|
|
43
|
|
44 print_summary(gene_count_list = sorted_gene_count_list, summary_output_file = options.summary_output_file)
|
|
45 print_sorted_snpeff_data(gene_data = gene_data, gene_count_list = sorted_gene_count_list, data_output_file = options.data_output_file, all_headers = all_headers)
|
|
46
|
|
47 def create_file_sample_dictionary(input_files = None, sample_names = None):
|
|
48 file_sample_dictionary = {}
|
|
49
|
|
50 for i in range(0, len(input_files)):
|
|
51 if i < len(sample_names):
|
|
52 file_sample_dictionary[input_files[i]] = sample_names[i]
|
|
53 else:
|
|
54 file_sample_dictionary[input_files[i]] = ""
|
|
55
|
|
56 return file_sample_dictionary
|
|
57
|
|
58 def split_input_files(option, opt_str, value, parser):
|
|
59 input_files = []
|
|
60 collect = True
|
|
61 for i in range(0, len(parser.rargs)):
|
|
62 if "-s" == parser.rargs[i] or "-o" == parser.rargs[i] or "-n" == parser.rargs[i]:
|
|
63 collect = False
|
|
64 elif collect == True:
|
|
65 input_files.append(parser.rargs[i])
|
|
66
|
|
67 setattr(parser.values, option.dest, input_files)
|
|
68
|
|
69 def split_sample_names(option, opt_str, value, parser):
|
|
70 input_files = []
|
|
71 collect = True
|
|
72 for i in range(0, len(parser.rargs)):
|
|
73 if "-s" == parser.rargs[i] or "-o" == parser.rargs[i] or "-i" == parser.rargs[i]:
|
|
74 collect = False
|
|
75 elif collect == True:
|
|
76 input_files.append(parser.rargs[i])
|
|
77
|
|
78 setattr(parser.values, option.dest, input_files)
|
|
79
|
|
80
|
|
81 def print_summary(gene_count_list = None, summary_output_file = ""):
|
|
82 f = open(summary_output_file, 'w')
|
|
83 f.write('\n'.join('%s\t%s' % g for g in gene_count_list))
|
|
84 f.close()
|
|
85
|
|
86 def print_sorted_snpeff_data(gene_data = {}, gene_count_list = None, data_output_file = "", file_sample_dictionary = None, all_headers = None):
|
|
87 f = open(data_output_file, 'w')
|
|
88
|
|
89 for header in all_headers:
|
|
90 f.write('Sample Name')
|
|
91 f.write('\t')
|
|
92 f.write(('\t').join(header))
|
|
93 f.write('\n')
|
|
94
|
|
95 for i in range(0, len(gene_count_list)):
|
|
96 gene_id = gene_count_list[i][0]
|
|
97 f.write(gene_data[gene_id] + "\n")
|
|
98
|
|
99 f.close()
|
|
100
|
|
101 def grab_snpeff_data(input_file = "", gene_data = {}, gene_count = {}, file_sample_dictionary = None):
|
|
102 i_file = open(input_file, 'rU')
|
|
103 reader = csv.reader(i_file, delimiter = '\t')
|
|
104 headers = skip_and_collect_headers(reader = reader, i_file = i_file)
|
|
105
|
|
106 sample_name = file_sample_dictionary[input_file]
|
|
107
|
|
108 for row in reader:
|
|
109 gene_id = row[9].upper()
|
|
110 if gene_id in gene_count:
|
|
111 if gene_id in gene_data:
|
|
112 gene_data[gene_id] = gene_data[gene_id] + '\n' + sample_name + '\t' + '\t'.join(row).rstrip()
|
|
113 else:
|
|
114 gene_data[gene_id] = sample_name + '\t' + '\t'.join(row).rstrip()
|
|
115
|
|
116 return headers, gene_data
|
|
117
|
|
118 def summarize_counts(input_file= "", gene_count = {}):
|
|
119 i_file = open(input_file, 'rU')
|
|
120 reader = csv.reader(i_file, delimiter = '\t')
|
|
121 skip_and_collect_headers(reader = reader, i_file = i_file)
|
|
122
|
|
123 counted_genes = []
|
|
124
|
|
125 for row in reader:
|
|
126 gene_id = row[9].upper()
|
|
127 if gene_id not in counted_genes:
|
|
128 if gene_id in gene_count and gene_id not in counted_genes:
|
|
129 gene_count[gene_id] += 1
|
|
130 else:
|
|
131 gene_count[gene_id] = 1
|
|
132
|
|
133 counted_genes.append(gene_id)
|
|
134
|
|
135 return gene_count
|
|
136
|
|
137 def skip_and_collect_headers(reader = None, i_file = None):
|
|
138 headers = []
|
|
139
|
|
140 # count headers
|
|
141 comment = 0
|
|
142 while True:
|
|
143 row = reader.next()
|
|
144 if len(row) == 0:
|
|
145 comment = comment + 1
|
|
146 elif row[0].startswith('#'):
|
|
147 comment = comment + 1
|
|
148 if comment == 3:
|
|
149 headers.append(row)
|
|
150 else:
|
|
151 break
|
|
152
|
|
153 # skip headers
|
|
154 i_file.seek(0)
|
|
155 for i in range(0, comment):
|
|
156 reader.next()
|
|
157
|
|
158 return headers
|
|
159
|
|
160 if __name__ == "__main__":
|
|
161 main()
|