comparison compare_humann2_output.py @ 3:eaa95ea1195c draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/compare_humann2_output commit eea46077010e699403ce6995d7d4aac77b2e0b43"
author bgruening
date Wed, 19 Oct 2022 14:49:06 +0000
parents 05766022dfc4
children
comparison
equal deleted inserted replaced
2:05766022dfc4 3:eaa95ea1195c
6 6
7 def extract_abundances(fp, nb_charact_to_extract): 7 def extract_abundances(fp, nb_charact_to_extract):
8 abundances = {} 8 abundances = {}
9 more_abund_charact = [] 9 more_abund_charact = []
10 abund_sum = 0 10 abund_sum = 0
11 with open(fp, 'r') as abundance_f: 11 with open(fp, "r") as abundance_f:
12 for line in abundance_f.readlines()[1:]: 12 for line in abundance_f.readlines()[1:]:
13 split_line = line[:-1].split('\t') 13 split_line = line[:-1].split("\t")
14 charact_id = split_line[0] 14 charact_id = split_line[0]
15 abund = float(split_line[1]) 15 abund = float(split_line[1])
16 abundances[charact_id] = 100*abund 16 abundances[charact_id] = 100 * abund
17 abund_sum += abundances[charact_id] 17 abund_sum += abundances[charact_id]
18 18
19 if len(more_abund_charact) < nb_charact_to_extract: 19 if len(more_abund_charact) < nb_charact_to_extract:
20 more_abund_charact.append(charact_id) 20 more_abund_charact.append(charact_id)
21 else: 21 else:
22 best_pos = None 22 best_pos = None
23 for i in range(len(more_abund_charact)-1, -1, -1): 23 for i in range(len(more_abund_charact) - 1, -1, -1):
24 if abundances[more_abund_charact[i]] < abund: 24 if abundances[more_abund_charact[i]] < abund:
25 best_pos = i 25 best_pos = i
26 else: 26 else:
27 break 27 break
28 if best_pos is not None: 28 if best_pos is not None:
32 more_abund_charact += tmp_more_abund_charact[best_pos:-1] 32 more_abund_charact += tmp_more_abund_charact[best_pos:-1]
33 return abundances, more_abund_charact 33 return abundances, more_abund_charact
34 34
35 35
36 def format_characteristic_name(all_name): 36 def format_characteristic_name(all_name):
37 if all_name.find(':') != -1: 37 if all_name.find(":") != -1:
38 charact_id = all_name.split(':')[0] 38 charact_id = all_name.split(":")[0]
39 char_name = all_name.split(':')[1][1:] 39 char_name = all_name.split(":")[1][1:]
40 else: 40 else:
41 charact_id = all_name 41 charact_id = all_name
42 char_name = '' 42 char_name = ""
43 43
44 char_name = char_name.replace('/', ' ') 44 char_name = char_name.replace("/", " ")
45 char_name = char_name.replace('-', ' ') 45 char_name = char_name.replace("-", " ")
46 char_name = char_name.replace("'", '') 46 char_name = char_name.replace("'", "")
47 if char_name.find('(') != -1 and char_name.find(')') != -1: 47 if char_name.find("(") != -1 and char_name.find(")") != -1:
48 open_bracket = char_name.find('(') 48 open_bracket = char_name.find("(")
49 close_bracket = char_name.find(')')+1 49 close_bracket = char_name.find(")") + 1
50 char_name = char_name[:open_bracket] + char_name[close_bracket:] 50 char_name = char_name[:open_bracket] + char_name[close_bracket:]
51 return charact_id, char_name 51 return charact_id, char_name
52 52
53 53
54 def write_more_abundant_charat(abundances, more_abund_charact, output_fp): 54 def write_more_abundant_charat(abundances, more_abund_charact, output_fp):
55 with open(output_fp, 'w') as output_f: 55 with open(output_fp, "w") as output_f:
56 output_f.write('id\tname\t%s\n' % '\t'.join(abundances.keys())) 56 output_f.write("id\tname\t%s\n" % "\t".join(abundances.keys()))
57 57
58 for mac in more_abund_charact: 58 for mac in more_abund_charact:
59 charact_id, charact_name = format_characteristic_name(mac) 59 charact_id, charact_name = format_characteristic_name(mac)
60 output_f.write('%s\t%s' % (charact_id, charact_name)) 60 output_f.write("%s\t%s" % (charact_id, charact_name))
61 for sample in abundances: 61 for sample in abundances:
62 abund = abundances[sample].get(mac, 0) 62 abund = abundances[sample].get(mac, 0)
63 output_f.write('\t%s' % (abund)) 63 output_f.write("\t%s" % (abund))
64 output_f.write('\n') 64 output_f.write("\n")
65 65
66 66
67 def extract_similar_characteristics(abund, sim_output_fp, output_files): 67 def extract_similar_characteristics(abund, sim_output_fp, output_files):
68 abund_keys = list(abund) 68 abund_keys = list(abund)
69 sim_characteristics = set(abund[abund_keys[0]].keys()) 69 sim_characteristics = set(abund[abund_keys[0]].keys())
70 for sample in abund_keys[1:]: 70 for sample in abund_keys[1:]:
71 sim_characteristics.intersection_update(abund[sample].keys()) 71 sim_characteristics.intersection_update(abund[sample].keys())
72 print('Similar between all samples: %s' % len(sim_characteristics)) 72 print("Similar between all samples: %s" % len(sim_characteristics))
73 73
74 with open(sim_output_fp, 'w') as sim_output_f: 74 with open(sim_output_fp, "w") as sim_output_f:
75 sim_output_f.write('id\tname\t%s\n' % '\t'.join(abund_keys)) 75 sim_output_f.write("id\tname\t%s\n" % "\t".join(abund_keys))
76 for charact in list(sim_characteristics): 76 for charact in list(sim_characteristics):
77 charact_id, charact_name = format_characteristic_name(charact) 77 charact_id, charact_name = format_characteristic_name(charact)
78 sim_output_f.write('%s\t%s' % (charact_id, charact_name)) 78 sim_output_f.write("%s\t%s" % (charact_id, charact_name))
79 for sample in abund_keys: 79 for sample in abund_keys:
80 sim_output_f.write('\t%s' % abund[sample][charact]) 80 sim_output_f.write("\t%s" % abund[sample][charact])
81 sim_output_f.write('\n') 81 sim_output_f.write("\n")
82 82
83 print('Specific to samples:') 83 print("Specific to samples:")
84 diff_char = {} 84 diff_char = {}
85 for i in range(len(abund_keys)): 85 for i in range(len(abund_keys)):
86 sample = abund_keys[i] 86 sample = abund_keys[i]
87 print(' %s' % sample ) 87 print(" %s" % sample)
88 print(' All: %s' % len(abund[sample].keys())) 88 print(" All: %s" % len(abund[sample].keys()))
89 diff_char[sample] = set(abund[sample].keys()) 89 diff_char[sample] = set(abund[sample].keys())
90 diff_char[sample].difference_update(sim_characteristics) 90 diff_char[sample].difference_update(sim_characteristics)
91 perc = 100*len(diff_char[sample])/(1.*len(abund[sample].keys())) 91 perc = 100 * len(diff_char[sample]) / (1.0 * len(abund[sample].keys()))
92 print(' Number of specific characteristics: %s' % len(diff_char[sample])) 92 print(" Number of specific characteristics: %s" % len(diff_char[sample]))
93 print(' Percentage of specific characteristics: %s' % perc) 93 print(" Percentage of specific characteristics: %s" % perc)
94 94
95 relative_abundance = 0 95 relative_abundance = 0
96 with open(output_files[i], 'w') as output_f: 96 with open(output_files[i], "w") as output_f:
97 output_f.write('id\tname\tabundances\n') 97 output_f.write("id\tname\tabundances\n")
98 for charact in list(diff_char[sample]): 98 for charact in list(diff_char[sample]):
99 charact_id, charact_name = format_characteristic_name(charact) 99 charact_id, charact_name = format_characteristic_name(charact)
100 output_f.write('%s\t%s' % (charact_id, charact_name)) 100 output_f.write("%s\t%s" % (charact_id, charact_name))
101 output_f.write('%s\n' % abund[sample][charact]) 101 output_f.write("%s\n" % abund[sample][charact])
102 relative_abundance += abund[sample][charact] 102 relative_abundance += abund[sample][charact]
103 print(' Relative abundance of specific characteristics: %s' % relative_abundance) 103 print(
104 " Relative abundance of specific characteristics: %s"
105 % relative_abundance
106 )
104 107
105 return sim_characteristics 108 return sim_characteristics
106 109
107 110
108 def compare_humann2_output(args): 111 def compare_humann2_output(args):
109 abund = {} 112 abund = {}
110 more_abund_charact = [] 113 more_abund_charact = []
111 114
112 for i in range(len(args.sample_name)): 115 for i in range(len(args.sample_name)):
113 abund[args.sample_name[i]], mac = extract_abundances( 116 abund[args.sample_name[i]], mac = extract_abundances(
114 args.charact_input_fp[i], 117 args.charact_input_fp[i], args.most_abundant_characteristics_to_extract
115 args.most_abundant_characteristics_to_extract) 118 )
116 more_abund_charact += mac 119 more_abund_charact += mac
117 120
118 write_more_abundant_charat( 121 write_more_abundant_charat(
119 abund, 122 abund, list(set(more_abund_charact)), args.more_abundant_output_fp
120 list(set(more_abund_charact)), 123 )
121 args.more_abundant_output_fp)
122 extract_similar_characteristics( 124 extract_similar_characteristics(
123 abund, 125 abund, args.similar_output_fp, args.specific_output_fp
124 args.similar_output_fp, 126 )
125 args.specific_output_fp)
126 127
127 128
128 if __name__ == '__main__': 129 if __name__ == "__main__":
129 parser = argparse.ArgumentParser() 130 parser = argparse.ArgumentParser()
130 parser.add_argument('--sample_name', required=True, action='append') 131 parser.add_argument("--sample_name", required=True, action="append")
131 parser.add_argument('--charact_input_fp', required=True, action='append') 132 parser.add_argument("--charact_input_fp", required=True, action="append")
132 parser.add_argument( 133 parser.add_argument(
133 '--most_abundant_characteristics_to_extract', 134 "--most_abundant_characteristics_to_extract", required=True, type=int
134 required=True, 135 )
135 type=int) 136 parser.add_argument("--more_abundant_output_fp", required=True)
136 parser.add_argument('--more_abundant_output_fp', required=True) 137 parser.add_argument("--similar_output_fp", required=True)
137 parser.add_argument('--similar_output_fp', required=True) 138 parser.add_argument("--specific_output_fp", required=True, action="append")
138 parser.add_argument(
139 '--specific_output_fp',
140 required=True,
141 action='append')
142 args = parser.parse_args() 139 args = parser.parse_args()
143 140
144 if len(args.sample_name) != len(args.charact_input_fp): 141 if len(args.sample_name) != len(args.charact_input_fp):
145 string = "Same number of values (in same order) are expected for " 142 string = "Same number of values (in same order) are expected for "
146 string += "--sample_name and --charact_input_fp" 143 string += "--sample_name and --charact_input_fp"