Mercurial > repos > gregory-minevich > cloudmap_in_silico_complementation
comparison CloudMap_InSilico.py @ 0:dff952ff16d9 draft
Uploaded
| author | gregory-minevich |
|---|---|
| date | Thu, 01 Nov 2012 19:25:47 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dff952ff16d9 |
|---|---|
| 1 #!/usr/bin/python | |
| 2 | |
| 3 import sys | |
| 4 import optparse | |
| 5 import csv | |
| 6 import operator | |
| 7 import pprint | |
| 8 | |
| 9 def main(): | |
| 10 parser = optparse.OptionParser() | |
| 11 parser.add_option("-i", dest = "input_files", action = "callback", callback = split_input_files) | |
| 12 parser.add_option("-n", dest = "sample_names", action = "callback", callback = split_sample_names) | |
| 13 parser.add_option("-s", dest = "summary_output_file", action = "store", type = "string") | |
| 14 parser.add_option("-o", dest = "data_output_file", action = "store", type = "string") | |
| 15 (options, args) = parser.parse_args() | |
| 16 | |
| 17 #get the counts | |
| 18 gene_count = {} | |
| 19 for input_file in options.input_files: | |
| 20 gene_count = summarize_counts(input_file = input_file, gene_count = gene_count) | |
| 21 | |
| 22 #delete the single counts | |
| 23 for k, v in gene_count.items(): | |
| 24 if v == 1: | |
| 25 del gene_count[k] | |
| 26 | |
| 27 # we do two passes because we don't know the order yet and don't want to hold onto everything in memory | |
| 28 | |
| 29 file_sample_dictionary = create_file_sample_dictionary(input_files = options.input_files, sample_names = options.sample_names) | |
| 30 | |
| 31 #get snpeff lines | |
| 32 gene_data = {} | |
| 33 all_headers = [] | |
| 34 for input_file in options.input_files: | |
| 35 headers = [] | |
| 36 headers, gene_data = grab_snpeff_data(input_file = input_file, gene_data = gene_data, gene_count = gene_count, file_sample_dictionary = file_sample_dictionary) | |
| 37 | |
| 38 if len(all_headers) == 0 : | |
| 39 all_headers.extend(headers) | |
| 40 | |
| 41 sorted_gene_count_list = sorted(gene_count.iteritems(), key = operator.itemgetter(1)) | |
| 42 sorted_gene_count_list.reverse() | |
| 43 | |
| 44 print_summary(gene_count_list = sorted_gene_count_list, summary_output_file = options.summary_output_file) | |
| 45 print_sorted_snpeff_data(gene_data = gene_data, gene_count_list = sorted_gene_count_list, data_output_file = options.data_output_file, all_headers = all_headers) | |
| 46 | |
| 47 def create_file_sample_dictionary(input_files = None, sample_names = None): | |
| 48 file_sample_dictionary = {} | |
| 49 | |
| 50 for i in range(0, len(input_files)): | |
| 51 if i < len(sample_names): | |
| 52 file_sample_dictionary[input_files[i]] = sample_names[i] | |
| 53 else: | |
| 54 file_sample_dictionary[input_files[i]] = "" | |
| 55 | |
| 56 return file_sample_dictionary | |
| 57 | |
| 58 def split_input_files(option, opt_str, value, parser): | |
| 59 input_files = [] | |
| 60 collect = True | |
| 61 for i in range(0, len(parser.rargs)): | |
| 62 if "-s" == parser.rargs[i] or "-o" == parser.rargs[i] or "-n" == parser.rargs[i]: | |
| 63 collect = False | |
| 64 elif collect == True: | |
| 65 input_files.append(parser.rargs[i]) | |
| 66 | |
| 67 setattr(parser.values, option.dest, input_files) | |
| 68 | |
| 69 def split_sample_names(option, opt_str, value, parser): | |
| 70 input_files = [] | |
| 71 collect = True | |
| 72 for i in range(0, len(parser.rargs)): | |
| 73 if "-s" == parser.rargs[i] or "-o" == parser.rargs[i] or "-i" == parser.rargs[i]: | |
| 74 collect = False | |
| 75 elif collect == True: | |
| 76 input_files.append(parser.rargs[i]) | |
| 77 | |
| 78 setattr(parser.values, option.dest, input_files) | |
| 79 | |
| 80 | |
| 81 def print_summary(gene_count_list = None, summary_output_file = ""): | |
| 82 f = open(summary_output_file, 'w') | |
| 83 f.write('\n'.join('%s\t%s' % g for g in gene_count_list)) | |
| 84 f.close() | |
| 85 | |
| 86 def print_sorted_snpeff_data(gene_data = {}, gene_count_list = None, data_output_file = "", file_sample_dictionary = None, all_headers = None): | |
| 87 f = open(data_output_file, 'w') | |
| 88 | |
| 89 for header in all_headers: | |
| 90 f.write('Sample Name') | |
| 91 f.write('\t') | |
| 92 f.write(('\t').join(header)) | |
| 93 f.write('\n') | |
| 94 | |
| 95 for i in range(0, len(gene_count_list)): | |
| 96 gene_id = gene_count_list[i][0] | |
| 97 f.write(gene_data[gene_id] + "\n") | |
| 98 | |
| 99 f.close() | |
| 100 | |
| 101 def grab_snpeff_data(input_file = "", gene_data = {}, gene_count = {}, file_sample_dictionary = None): | |
| 102 i_file = open(input_file, 'rU') | |
| 103 reader = csv.reader(i_file, delimiter = '\t') | |
| 104 headers = skip_and_collect_headers(reader = reader, i_file = i_file) | |
| 105 | |
| 106 sample_name = file_sample_dictionary[input_file] | |
| 107 | |
| 108 for row in reader: | |
| 109 gene_id = row[9].upper() | |
| 110 if gene_id in gene_count: | |
| 111 if gene_id in gene_data: | |
| 112 gene_data[gene_id] = gene_data[gene_id] + '\n' + sample_name + '\t' + '\t'.join(row).rstrip() | |
| 113 else: | |
| 114 gene_data[gene_id] = sample_name + '\t' + '\t'.join(row).rstrip() | |
| 115 | |
| 116 return headers, gene_data | |
| 117 | |
| 118 def summarize_counts(input_file= "", gene_count = {}): | |
| 119 i_file = open(input_file, 'rU') | |
| 120 reader = csv.reader(i_file, delimiter = '\t') | |
| 121 skip_and_collect_headers(reader = reader, i_file = i_file) | |
| 122 | |
| 123 counted_genes = [] | |
| 124 | |
| 125 for row in reader: | |
| 126 gene_id = row[9].upper() | |
| 127 if gene_id not in counted_genes: | |
| 128 if gene_id in gene_count and gene_id not in counted_genes: | |
| 129 gene_count[gene_id] += 1 | |
| 130 else: | |
| 131 gene_count[gene_id] = 1 | |
| 132 | |
| 133 counted_genes.append(gene_id) | |
| 134 | |
| 135 return gene_count | |
| 136 | |
| 137 def skip_and_collect_headers(reader = None, i_file = None): | |
| 138 headers = [] | |
| 139 | |
| 140 # count headers | |
| 141 comment = 0 | |
| 142 while True: | |
| 143 row = reader.next() | |
| 144 if len(row) == 0: | |
| 145 comment = comment + 1 | |
| 146 elif row[0].startswith('#'): | |
| 147 comment = comment + 1 | |
| 148 if comment == 3: | |
| 149 headers.append(row) | |
| 150 else: | |
| 151 break | |
| 152 | |
| 153 # skip headers | |
| 154 i_file.seek(0) | |
| 155 for i in range(0, comment): | |
| 156 reader.next() | |
| 157 | |
| 158 return headers | |
| 159 | |
| 160 if __name__ == "__main__": | |
| 161 main() |
