Mercurial > repos > gregory-minevich > cloudmap_in_silico_complementation
view CloudMap_InSilico.py @ 1:5389dc5a0be3 draft default tip
Uploaded
author | gregory-minevich |
---|---|
date | Thu, 01 Nov 2012 19:26:05 -0400 |
parents | dff952ff16d9 |
children |
line wrap: on
line source
#!/usr/bin/python import sys import optparse import csv import operator import pprint def main(): parser = optparse.OptionParser() parser.add_option("-i", dest = "input_files", action = "callback", callback = split_input_files) parser.add_option("-n", dest = "sample_names", action = "callback", callback = split_sample_names) parser.add_option("-s", dest = "summary_output_file", action = "store", type = "string") parser.add_option("-o", dest = "data_output_file", action = "store", type = "string") (options, args) = parser.parse_args() #get the counts gene_count = {} for input_file in options.input_files: gene_count = summarize_counts(input_file = input_file, gene_count = gene_count) #delete the single counts for k, v in gene_count.items(): if v == 1: del gene_count[k] # we do two passes because we don't know the order yet and don't want to hold onto everything in memory file_sample_dictionary = create_file_sample_dictionary(input_files = options.input_files, sample_names = options.sample_names) #get snpeff lines gene_data = {} all_headers = [] for input_file in options.input_files: headers = [] headers, gene_data = grab_snpeff_data(input_file = input_file, gene_data = gene_data, gene_count = gene_count, file_sample_dictionary = file_sample_dictionary) if len(all_headers) == 0 : all_headers.extend(headers) sorted_gene_count_list = sorted(gene_count.iteritems(), key = operator.itemgetter(1)) sorted_gene_count_list.reverse() print_summary(gene_count_list = sorted_gene_count_list, summary_output_file = options.summary_output_file) print_sorted_snpeff_data(gene_data = gene_data, gene_count_list = sorted_gene_count_list, data_output_file = options.data_output_file, all_headers = all_headers) def create_file_sample_dictionary(input_files = None, sample_names = None): file_sample_dictionary = {} for i in range(0, len(input_files)): if i < len(sample_names): file_sample_dictionary[input_files[i]] = sample_names[i] else: file_sample_dictionary[input_files[i]] = "" return file_sample_dictionary def split_input_files(option, opt_str, value, parser): input_files = [] collect = True for i in range(0, len(parser.rargs)): if "-s" == parser.rargs[i] or "-o" == parser.rargs[i] or "-n" == parser.rargs[i]: collect = False elif collect == True: input_files.append(parser.rargs[i]) setattr(parser.values, option.dest, input_files) def split_sample_names(option, opt_str, value, parser): input_files = [] collect = True for i in range(0, len(parser.rargs)): if "-s" == parser.rargs[i] or "-o" == parser.rargs[i] or "-i" == parser.rargs[i]: collect = False elif collect == True: input_files.append(parser.rargs[i]) setattr(parser.values, option.dest, input_files) def print_summary(gene_count_list = None, summary_output_file = ""): f = open(summary_output_file, 'w') f.write('\n'.join('%s\t%s' % g for g in gene_count_list)) f.close() def print_sorted_snpeff_data(gene_data = {}, gene_count_list = None, data_output_file = "", file_sample_dictionary = None, all_headers = None): f = open(data_output_file, 'w') for header in all_headers: f.write('Sample Name') f.write('\t') f.write(('\t').join(header)) f.write('\n') for i in range(0, len(gene_count_list)): gene_id = gene_count_list[i][0] f.write(gene_data[gene_id] + "\n") f.close() def grab_snpeff_data(input_file = "", gene_data = {}, gene_count = {}, file_sample_dictionary = None): i_file = open(input_file, 'rU') reader = csv.reader(i_file, delimiter = '\t') headers = skip_and_collect_headers(reader = reader, i_file = i_file) sample_name = file_sample_dictionary[input_file] for row in reader: gene_id = row[9].upper() if gene_id in gene_count: if gene_id in gene_data: gene_data[gene_id] = gene_data[gene_id] + '\n' + sample_name + '\t' + '\t'.join(row).rstrip() else: gene_data[gene_id] = sample_name + '\t' + '\t'.join(row).rstrip() return headers, gene_data def summarize_counts(input_file= "", gene_count = {}): i_file = open(input_file, 'rU') reader = csv.reader(i_file, delimiter = '\t') skip_and_collect_headers(reader = reader, i_file = i_file) counted_genes = [] for row in reader: gene_id = row[9].upper() if gene_id not in counted_genes: if gene_id in gene_count and gene_id not in counted_genes: gene_count[gene_id] += 1 else: gene_count[gene_id] = 1 counted_genes.append(gene_id) return gene_count def skip_and_collect_headers(reader = None, i_file = None): headers = [] # count headers comment = 0 while True: row = reader.next() if len(row) == 0: comment = comment + 1 elif row[0].startswith('#'): comment = comment + 1 if comment == 3: headers.append(row) else: break # skip headers i_file.seek(0) for i in range(0, comment): reader.next() return headers if __name__ == "__main__": main()