Mercurial > repos > bebatut > compare_humann2_output
comparison compare_humann2_output.py @ 2:05766022dfc4 draft
"planemo upload for repository https://github.com/asaim/galaxytools/tree/master/tools/compare_humann2_output commit dc55dc3b5275d1d6aac390698c0c6e0ab8fbf2f7"
author | bebatut |
---|---|
date | Mon, 14 Sep 2020 13:50:30 +0000 |
parents | 9959fa526f1a |
children | eaa95ea1195c |
comparison
Legend: equal | deleted | inserted | replaced
1:c1aca37cb1fc | 2:05766022dfc4 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # -*- coding: utf-8 -*- | 2 # -*- coding: utf-8 -*- |
3 | 3 |
4 import sys | |
5 import os | |
6 import argparse | 4 import argparse |
7 import re | |
8 | 5 |
9 def extract_abundances(filepath, nb_charact_to_extract): | 6 |
7 def extract_abundances(fp, nb_charact_to_extract): | |
10 abundances = {} | 8 abundances = {} |
11 more_abund_charact = [] | 9 more_abund_charact = [] |
12 abund_sum = 0 | 10 abund_sum = 0 |
13 with open(filepath, 'r') as abundance_file: | 11 with open(fp, 'r') as abundance_f: |
14 for line in abundance_file.readlines()[1:]: | 12 for line in abundance_f.readlines()[1:]: |
15 split_line = line[:-1].split('\t') | 13 split_line = line[:-1].split('\t') |
16 charact_id = split_line[0] | 14 charact_id = split_line[0] |
17 abund = float(split_line[1]) | 15 abund = float(split_line[1]) |
18 abundances[charact_id] = 100*abund | 16 abundances[charact_id] = 100*abund |
19 abund_sum += abundances[charact_id] | 17 abund_sum += abundances[charact_id] |
20 | 18 |
21 if len(more_abund_charact) < nb_charact_to_extract: | 19 if len(more_abund_charact) < nb_charact_to_extract: |
22 more_abund_charact.append(charact_id) | 20 more_abund_charact.append(charact_id) |
23 else: | 21 else: |
24 best_pos = None | 22 best_pos = None |
25 for i in range(len(more_abund_charact)-1,-1,-1): | 23 for i in range(len(more_abund_charact)-1, -1, -1): |
26 if abundances[more_abund_charact[i]] < abund: | 24 if abundances[more_abund_charact[i]] < abund: |
27 best_pos = i | 25 best_pos = i |
28 else: | 26 else: |
29 break | 27 break |
30 if best_pos != None: | 28 if best_pos is not None: |
31 tmp_more_abund_charact = more_abund_charact | 29 tmp_more_abund_charact = more_abund_charact |
32 more_abund_charact = tmp_more_abund_charact[:best_pos] | 30 more_abund_charact = tmp_more_abund_charact[:best_pos] |
33 more_abund_charact += [charact_id] | 31 more_abund_charact += [charact_id] |
34 more_abund_charact += tmp_more_abund_charact[best_pos:-1] | 32 more_abund_charact += tmp_more_abund_charact[best_pos:-1] |
35 return abundances, more_abund_charact | 33 return abundances, more_abund_charact |
36 | 34 |
35 | |
37 def format_characteristic_name(all_name): | 36 def format_characteristic_name(all_name): |
38 if all_name.find(':') != -1: | 37 if all_name.find(':') != -1: |
39 charact_id = all_name.split(':')[0] | 38 charact_id = all_name.split(':')[0] |
40 charact_name = all_name.split(':')[1][1:] | 39 char_name = all_name.split(':')[1][1:] |
41 else: | 40 else: |
42 charact_id = all_name | 41 charact_id = all_name |
43 charact_name = '' | 42 char_name = '' |
44 | 43 |
45 charact_name = charact_name.replace('/',' ') | 44 char_name = char_name.replace('/', ' ') |
46 charact_name = charact_name.replace('-',' ') | 45 char_name = char_name.replace('-', ' ') |
47 charact_name = charact_name.replace("'",'') | 46 char_name = char_name.replace("'", '') |
48 if charact_name.find('(') != -1 and charact_name.find(')') != -1: | 47 if char_name.find('(') != -1 and char_name.find(')') != -1: |
49 open_bracket = charact_name.find('(') | 48 open_bracket = char_name.find('(') |
50 close_bracket = charact_name.find(')')+1 | 49 close_bracket = char_name.find(')')+1 |
51 charact_name = charact_name[:open_bracket] + charact_name[close_bracket:] | 50 char_name = char_name[:open_bracket] + char_name[close_bracket:] |
52 return charact_id,charact_name | 51 return charact_id, char_name |
53 | 52 |
54 def write_more_abundant_charat(abundances,more_abund_charact, output_filepath): | 53 |
55 with open(output_filepath,'w') as output_file: | 54 def write_more_abundant_charat(abundances, more_abund_charact, output_fp): |
56 output_file.write('id\tname\t') | 55 with open(output_fp, 'w') as output_f: |
57 output_file.write('\t'.join(abundances.keys()) + '\n') | 56 output_f.write('id\tname\t%s\n' % '\t'.join(abundances.keys())) |
58 | 57 |
59 for mac in more_abund_charact: | 58 for mac in more_abund_charact: |
60 charact_id,charact_name = format_characteristic_name(mac) | 59 charact_id, charact_name = format_characteristic_name(mac) |
61 output_file.write(charact_id + '\t' + charact_name) | 60 output_f.write('%s\t%s' % (charact_id, charact_name)) |
62 for sample in abundances: | 61 for sample in abundances: |
63 abund = abundances[sample].get(mac, 0) | 62 abund = abundances[sample].get(mac, 0) |
64 output_file.write('\t' + str(abund)) | 63 output_f.write('\t%s' % (abund)) |
65 output_file.write('\n') | 64 output_f.write('\n') |
66 | 65 |
67 def extract_similar_characteristics(abundances, sim_output_filepath, | |
68 specific_output_files): | |
69 sim_characteristics = set(abundances[abundances.keys()[0]].keys()) | |
70 for sample in abundances.keys()[1:]: | |
71 sim_characteristics.intersection_update(abundances[sample].keys()) | |
72 print 'Similar between all samples:', len(sim_characteristics) | |
73 | 66 |
74 with open(sim_output_filepath, 'w') as sim_output_file: | 67 def extract_similar_characteristics(abund, sim_output_fp, output_files): |
75 sim_output_file.write('id\tname\t' + '\t'.join(abundances.keys()) + '\n') | 68 abund_keys = list(abund) |
69 sim_characteristics = set(abund[abund_keys[0]].keys()) | |
70 for sample in abund_keys[1:]: | |
71 sim_characteristics.intersection_update(abund[sample].keys()) | |
72 print('Similar between all samples: %s' % len(sim_characteristics)) | |
73 | |
74 with open(sim_output_fp, 'w') as sim_output_f: | |
75 sim_output_f.write('id\tname\t%s\n' % '\t'.join(abund_keys)) | |
76 for charact in list(sim_characteristics): | 76 for charact in list(sim_characteristics): |
77 charact_id,charact_name = format_characteristic_name(charact) | 77 charact_id, charact_name = format_characteristic_name(charact) |
78 sim_output_file.write(charact_id + '\t' + charact_name) | 78 sim_output_f.write('%s\t%s' % (charact_id, charact_name)) |
79 for sample in abundances.keys(): | 79 for sample in abund_keys: |
80 sim_output_file.write('\t' + str(abundances[sample][charact])) | 80 sim_output_f.write('\t%s' % abund[sample][charact]) |
81 sim_output_file.write('\n') | 81 sim_output_f.write('\n') |
82 | 82 |
83 print 'Specific to samples:' | 83 print('Specific to samples:') |
84 diff_characteristics = {} | 84 diff_char = {} |
85 for i in range(len(abundances.keys())): | 85 for i in range(len(abund_keys)): |
86 sample = abundances.keys()[i] | 86 sample = abund_keys[i] |
87 print ' ', sample, "" | 87 print(' %s' % sample ) |
88 print ' All:', len(abundances[sample].keys()) | 88 print(' All: %s' % len(abund[sample].keys())) |
89 diff_characteristics[sample] = set(abundances[sample].keys()) | 89 diff_char[sample] = set(abund[sample].keys()) |
90 diff_characteristics[sample].difference_update(sim_characteristics) | 90 diff_char[sample].difference_update(sim_characteristics) |
91 print ' Number of specific characteristics:', | 91 perc = 100*len(diff_char[sample])/(1.*len(abund[sample].keys())) |
92 print len(diff_characteristics[sample]) | 92 print(' Number of specific characteristics: %s' % len(diff_char[sample])) |
93 print ' Percentage of specific characteristics:', | 93 print(' Percentage of specific characteristics: %s' % perc) |
94 print 100*len(diff_characteristics[sample])/(1.*len(abundances[sample].keys())) | |
95 | 94 |
96 relative_abundance = 0 | 95 relative_abundance = 0 |
97 with open(specific_output_files[i], 'w') as output_file: | 96 with open(output_files[i], 'w') as output_f: |
98 output_file.write('id\tname\tabundances\n') | 97 output_f.write('id\tname\tabundances\n') |
99 for charact in list(diff_characteristics[sample]): | 98 for charact in list(diff_char[sample]): |
100 charact_id,charact_name = format_characteristic_name(charact) | 99 charact_id, charact_name = format_characteristic_name(charact) |
101 output_file.write(charact_id + '\t' + charact_name + '\t') | 100 output_f.write('%s\t%s' % (charact_id, charact_name)) |
102 output_file.write(str(abundances[sample][charact]) + '\n') | 101 output_f.write('%s\n' % abund[sample][charact]) |
103 relative_abundance += abundances[sample][charact] | 102 relative_abundance += abund[sample][charact] |
104 print ' Relative abundance of specific characteristics(%):', relative_abundance | 103 print(' Relative abundance of specific characteristics: %s' % relative_abundance) |
105 | 104 |
106 return sim_characteristics | 105 return sim_characteristics |
107 | 106 |
107 | |
108 def compare_humann2_output(args): | 108 def compare_humann2_output(args): |
109 abundances = {} | 109 abund = {} |
110 more_abund_charact = [] | 110 more_abund_charact = [] |
111 | 111 |
112 for i in range(len(args.sample_name)): | 112 for i in range(len(args.sample_name)): |
113 abundances[args.sample_name[i]], mac = extract_abundances(args.charact_input_file[i], | 113 abund[args.sample_name[i]], mac = extract_abundances( |
114 args.charact_input_fp[i], | |
114 args.most_abundant_characteristics_to_extract) | 115 args.most_abundant_characteristics_to_extract) |
115 more_abund_charact += mac | 116 more_abund_charact += mac |
116 | 117 |
117 write_more_abundant_charat(abundances, list(set(more_abund_charact)), | 118 write_more_abundant_charat( |
118 args.more_abundant_output_file) | 119 abund, |
119 sim_characteristics = extract_similar_characteristics(abundances, | 120 list(set(more_abund_charact)), |
120 args.similar_output_file, args.specific_output_file) | 121 args.more_abundant_output_fp) |
122 extract_similar_characteristics( | |
123 abund, | |
124 args.similar_output_fp, | |
125 args.specific_output_fp) | |
126 | |
121 | 127 |
122 if __name__ == '__main__': | 128 if __name__ == '__main__': |
123 parser = argparse.ArgumentParser() | 129 parser = argparse.ArgumentParser() |
124 parser.add_argument('--sample_name', required=True, action='append') | 130 parser.add_argument('--sample_name', required=True, action='append') |
125 parser.add_argument('--charact_input_file', required=True, action='append') | 131 parser.add_argument('--charact_input_fp', required=True, action='append') |
126 parser.add_argument('--most_abundant_characteristics_to_extract', required=True, | 132 parser.add_argument( |
127 type = int) | 133 '--most_abundant_characteristics_to_extract', |
128 parser.add_argument('--more_abundant_output_file', required=True) | 134 required=True, |
129 parser.add_argument('--similar_output_file', required=True) | 135 type=int) |
130 parser.add_argument('--specific_output_file', required=True,action='append') | 136 parser.add_argument('--more_abundant_output_fp', required=True) |
137 parser.add_argument('--similar_output_fp', required=True) | |
138 parser.add_argument( | |
139 '--specific_output_fp', | |
140 required=True, | |
141 action='append') | |
131 args = parser.parse_args() | 142 args = parser.parse_args() |
132 | 143 |
133 if len(args.sample_name) != len(args.charact_input_file): | 144 if len(args.sample_name) != len(args.charact_input_fp): |
134 raise ValueError("Same number of values (in same order) are expected for --sample_name and --charact_input_file") | 145 string = "Same number of values (in same order) are expected for " |
135 if len(args.sample_name) != len(args.specific_output_file): | 146 string += "--sample_name and --charact_input_fp" |
136 raise ValueError("Same number of values (in same order) are expected for --sample_name and --specific_output_file") | 147 raise ValueError(string) |
148 if len(args.sample_name) != len(args.specific_output_fp): | |
149 string = "Same number of values (in same order) are expected for " | |
150 string += "--sample_name and --specific_output_fp" | |
151 raise ValueError(string) | |
137 | 152 |
138 compare_humann2_output(args) | 153 compare_humann2_output(args) |