Mercurial > repos > bebatut > compare_humann2_output
diff compare_humann2_output.py @ 2:05766022dfc4 draft
"planemo upload for repository https://github.com/asaim/galaxytools/tree/master/tools/compare_humann2_output commit dc55dc3b5275d1d6aac390698c0c6e0ab8fbf2f7"
author | bebatut |
---|---|
date | Mon, 14 Sep 2020 13:50:30 +0000 |
parents | 9959fa526f1a |
children | eaa95ea1195c |
line wrap: on
line diff
--- a/compare_humann2_output.py Wed Apr 20 09:15:27 2016 -0400 +++ b/compare_humann2_output.py Mon Sep 14 13:50:30 2020 +0000 @@ -1,17 +1,15 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import sys -import os import argparse -import re + -def extract_abundances(filepath, nb_charact_to_extract): +def extract_abundances(fp, nb_charact_to_extract): abundances = {} more_abund_charact = [] abund_sum = 0 - with open(filepath, 'r') as abundance_file: - for line in abundance_file.readlines()[1:]: + with open(fp, 'r') as abundance_f: + for line in abundance_f.readlines()[1:]: split_line = line[:-1].split('\t') charact_id = split_line[0] abund = float(split_line[1]) @@ -22,117 +20,134 @@ more_abund_charact.append(charact_id) else: best_pos = None - for i in range(len(more_abund_charact)-1,-1,-1): + for i in range(len(more_abund_charact)-1, -1, -1): if abundances[more_abund_charact[i]] < abund: best_pos = i else: break - if best_pos != None: + if best_pos is not None: tmp_more_abund_charact = more_abund_charact more_abund_charact = tmp_more_abund_charact[:best_pos] more_abund_charact += [charact_id] more_abund_charact += tmp_more_abund_charact[best_pos:-1] return abundances, more_abund_charact + def format_characteristic_name(all_name): if all_name.find(':') != -1: charact_id = all_name.split(':')[0] - charact_name = all_name.split(':')[1][1:] + char_name = all_name.split(':')[1][1:] else: charact_id = all_name - charact_name = '' + char_name = '' - charact_name = charact_name.replace('/',' ') - charact_name = charact_name.replace('-',' ') - charact_name = charact_name.replace("'",'') - if charact_name.find('(') != -1 and charact_name.find(')') != -1: - open_bracket = charact_name.find('(') - close_bracket = charact_name.find(')')+1 - charact_name = charact_name[:open_bracket] + charact_name[close_bracket:] - return charact_id,charact_name + char_name = char_name.replace('/', ' ') + char_name = char_name.replace('-', ' ') + char_name = char_name.replace("'", '') + if char_name.find('(') != -1 and char_name.find(')') != -1: + open_bracket = char_name.find('(') + close_bracket = char_name.find(')')+1 + char_name = char_name[:open_bracket] + char_name[close_bracket:] + return charact_id, char_name -def write_more_abundant_charat(abundances,more_abund_charact, output_filepath): - with open(output_filepath,'w') as output_file: - output_file.write('id\tname\t') - output_file.write('\t'.join(abundances.keys()) + '\n') + +def write_more_abundant_charat(abundances, more_abund_charact, output_fp): + with open(output_fp, 'w') as output_f: + output_f.write('id\tname\t%s\n' % '\t'.join(abundances.keys())) for mac in more_abund_charact: - charact_id,charact_name = format_characteristic_name(mac) - output_file.write(charact_id + '\t' + charact_name) + charact_id, charact_name = format_characteristic_name(mac) + output_f.write('%s\t%s' % (charact_id, charact_name)) for sample in abundances: abund = abundances[sample].get(mac, 0) - output_file.write('\t' + str(abund)) - output_file.write('\n') + output_f.write('\t%s' % (abund)) + output_f.write('\n') + -def extract_similar_characteristics(abundances, sim_output_filepath, - specific_output_files): - sim_characteristics = set(abundances[abundances.keys()[0]].keys()) - for sample in abundances.keys()[1:]: - sim_characteristics.intersection_update(abundances[sample].keys()) - print 'Similar between all samples:', len(sim_characteristics) +def extract_similar_characteristics(abund, sim_output_fp, output_files): + abund_keys = list(abund) + sim_characteristics = set(abund[abund_keys[0]].keys()) + for sample in abund_keys[1:]: + sim_characteristics.intersection_update(abund[sample].keys()) + print('Similar between all samples: %s' % len(sim_characteristics)) - with open(sim_output_filepath, 'w') as sim_output_file: - sim_output_file.write('id\tname\t' + '\t'.join(abundances.keys()) + '\n') + with open(sim_output_fp, 'w') as sim_output_f: + sim_output_f.write('id\tname\t%s\n' % '\t'.join(abund_keys)) for charact in list(sim_characteristics): - charact_id,charact_name = format_characteristic_name(charact) - sim_output_file.write(charact_id + '\t' + charact_name) - for sample in abundances.keys(): - sim_output_file.write('\t' + str(abundances[sample][charact])) - sim_output_file.write('\n') + charact_id, charact_name = format_characteristic_name(charact) + sim_output_f.write('%s\t%s' % (charact_id, charact_name)) + for sample in abund_keys: + sim_output_f.write('\t%s' % abund[sample][charact]) + sim_output_f.write('\n') - print 'Specific to samples:' - diff_characteristics = {} - for i in range(len(abundances.keys())): - sample = abundances.keys()[i] - print ' ', sample, "" - print ' All:', len(abundances[sample].keys()) - diff_characteristics[sample] = set(abundances[sample].keys()) - diff_characteristics[sample].difference_update(sim_characteristics) - print ' Number of specific characteristics:', - print len(diff_characteristics[sample]) - print ' Percentage of specific characteristics:', - print 100*len(diff_characteristics[sample])/(1.*len(abundances[sample].keys())) + print('Specific to samples:') + diff_char = {} + for i in range(len(abund_keys)): + sample = abund_keys[i] + print(' %s' % sample ) + print(' All: %s' % len(abund[sample].keys())) + diff_char[sample] = set(abund[sample].keys()) + diff_char[sample].difference_update(sim_characteristics) + perc = 100*len(diff_char[sample])/(1.*len(abund[sample].keys())) + print(' Number of specific characteristics: %s' % len(diff_char[sample])) + print(' Percentage of specific characteristics: %s' % perc) relative_abundance = 0 - with open(specific_output_files[i], 'w') as output_file: - output_file.write('id\tname\tabundances\n') - for charact in list(diff_characteristics[sample]): - charact_id,charact_name = format_characteristic_name(charact) - output_file.write(charact_id + '\t' + charact_name + '\t') - output_file.write(str(abundances[sample][charact]) + '\n') - relative_abundance += abundances[sample][charact] - print ' Relative abundance of specific characteristics(%):', relative_abundance + with open(output_files[i], 'w') as output_f: + output_f.write('id\tname\tabundances\n') + for charact in list(diff_char[sample]): + charact_id, charact_name = format_characteristic_name(charact) + output_f.write('%s\t%s' % (charact_id, charact_name)) + output_f.write('%s\n' % abund[sample][charact]) + relative_abundance += abund[sample][charact] + print(' Relative abundance of specific characteristics: %s' % relative_abundance) return sim_characteristics + def compare_humann2_output(args): - abundances = {} + abund = {} more_abund_charact = [] for i in range(len(args.sample_name)): - abundances[args.sample_name[i]], mac = extract_abundances(args.charact_input_file[i], + abund[args.sample_name[i]], mac = extract_abundances( + args.charact_input_fp[i], args.most_abundant_characteristics_to_extract) more_abund_charact += mac - write_more_abundant_charat(abundances, list(set(more_abund_charact)), - args.more_abundant_output_file) - sim_characteristics = extract_similar_characteristics(abundances, - args.similar_output_file, args.specific_output_file) + write_more_abundant_charat( + abund, + list(set(more_abund_charact)), + args.more_abundant_output_fp) + extract_similar_characteristics( + abund, + args.similar_output_fp, + args.specific_output_fp) + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--sample_name', required=True, action='append') - parser.add_argument('--charact_input_file', required=True, action='append') - parser.add_argument('--most_abundant_characteristics_to_extract', required=True, - type = int) - parser.add_argument('--more_abundant_output_file', required=True) - parser.add_argument('--similar_output_file', required=True) - parser.add_argument('--specific_output_file', required=True,action='append') + parser.add_argument('--charact_input_fp', required=True, action='append') + parser.add_argument( + '--most_abundant_characteristics_to_extract', + required=True, + type=int) + parser.add_argument('--more_abundant_output_fp', required=True) + parser.add_argument('--similar_output_fp', required=True) + parser.add_argument( + '--specific_output_fp', + required=True, + action='append') args = parser.parse_args() - if len(args.sample_name) != len(args.charact_input_file): - raise ValueError("Same number of values (in same order) are expected for --sample_name and --charact_input_file") - if len(args.sample_name) != len(args.specific_output_file): - raise ValueError("Same number of values (in same order) are expected for --sample_name and --specific_output_file") + if len(args.sample_name) != len(args.charact_input_fp): + string = "Same number of values (in same order) are expected for " + string += "--sample_name and --charact_input_fp" + raise ValueError(string) + if len(args.sample_name) != len(args.specific_output_fp): + string = "Same number of values (in same order) are expected for " + string += "--sample_name and --specific_output_fp" + raise ValueError(string) - compare_humann2_output(args) \ No newline at end of file + compare_humann2_output(args)