comparison combine_metaphlan2_humann2.py @ 2:fdfb35745104 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/combine_metaphlan2_humann2 commit eea46077010e699403ce6995d7d4aac77b2e0b43"
author bgruening
date Wed, 19 Oct 2022 14:44:00 +0000
parents e25efca0a49c
children
comparison
equal deleted inserted replaced
1:e25efca0a49c 2:fdfb35745104
4 import argparse 4 import argparse
5 5
6 6
7 def extract_clade_abundance(metaphlan2_fp): 7 def extract_clade_abundance(metaphlan2_fp):
8 clade_abund = {} 8 clade_abund = {}
9 with open(metaphlan2_fp, 'r') as metaphlan2_f: 9 with open(metaphlan2_fp, "r") as metaphlan2_f:
10 for line in metaphlan2_f.readlines(): 10 for line in metaphlan2_f.readlines():
11 if line.find('g__') == -1: 11 if line.find("g__") == -1:
12 continue 12 continue
13 13
14 split_line = line[:-1].split('\t') 14 split_line = line[:-1].split("\t")
15 taxo = split_line[0] 15 taxo = split_line[0]
16 abundance = split_line[1] 16 abundance = split_line[1]
17 17
18 genus = taxo[(taxo.find('g__')+3):] 18 genus = taxo[(taxo.find("g__") + 3):]
19 if genus.find('|') != -1: 19 if genus.find("|") != -1:
20 genus = genus[:(genus.find('|'))] 20 genus = genus[: (genus.find("|"))]
21 clade_abund.setdefault(genus, {'abundance': 0, 'species': {}}) 21 clade_abund.setdefault(genus, {"abundance": 0, "species": {}})
22 if taxo.find('t__') != -1: 22 if taxo.find("t__") != -1:
23 continue 23 continue
24 elif taxo.find('s__') != -1: 24 elif taxo.find("s__") != -1:
25 species = taxo[(taxo.find('s__')+3):] 25 species = taxo[(taxo.find("s__") + 3):]
26 clade_abund[genus]['species'].setdefault( 26 clade_abund[genus]["species"].setdefault(species, abundance)
27 species,
28 abundance)
29 else: 27 else:
30 clade_abund[genus]['abundance'] = abundance 28 clade_abund[genus]["abundance"] = abundance
31 return clade_abund 29 return clade_abund
32 30
33 31
34 def compute_overall_abundance(humann2_fp): 32 def compute_overall_abundance(humann2_fp):
35 overall_abundance = 0 33 overall_abundance = 0
36 with open(humann2_fp, 'r') as humann2_f: 34 with open(humann2_fp, "r") as humann2_f:
37 for line in humann2_f.readlines(): 35 for line in humann2_f.readlines():
38 if line.find('|') != -1 or line.startswith('#'): 36 if line.find("|") != -1 or line.startswith("#"):
39 continue 37 continue
40 split_line = line[:-1].split('\t') 38 split_line = line[:-1].split("\t")
41 overall_abundance += float(split_line[1]) 39 overall_abundance += float(split_line[1])
42 return overall_abundance 40 return overall_abundance
43 41
44 42
45 def format_characteristic_name(name): 43 def format_characteristic_name(name):
46 formatted_n = name 44 formatted_n = name
47 formatted_n = formatted_n.replace('/', ' ') 45 formatted_n = formatted_n.replace("/", " ")
48 formatted_n = formatted_n.replace('-', ' ') 46 formatted_n = formatted_n.replace("-", " ")
49 formatted_n = formatted_n.replace("'", '') 47 formatted_n = formatted_n.replace("'", "")
50 if formatted_n.find('(') != -1 and formatted_n.find(')') != -1: 48 if formatted_n.find("(") != -1 and formatted_n.find(")") != -1:
51 open_bracket = formatted_n.find('(') 49 open_bracket = formatted_n.find("(")
52 close_bracket = formatted_n.find(')')+1 50 close_bracket = formatted_n.find(")") + 1
53 formatted_n = formatted_n[:open_bracket] + formatted_n[close_bracket:] 51 formatted_n = formatted_n[:open_bracket] + formatted_n[close_bracket:]
54 return formatted_n 52 return formatted_n
55 53
56 54
57 def combine_metaphlan2_humann2(args): 55 def combine_metaphlan2_humann2(args):
58 clade_abund = extract_clade_abundance(args.metaphlan2_fp) 56 clade_abund = extract_clade_abundance(args.metaphlan2_fp)
59 overall_abund = compute_overall_abundance(args.humann2_fp) 57 overall_abund = compute_overall_abundance(args.humann2_fp)
60 58
61 with open(args.output_fp, 'w') as output_f: 59 with open(args.output_fp, "w") as output_f:
62 s = 'genus\tgenus_abundance\tspecies\tspecies_abundance\t' 60 s = "genus\tgenus_abundance\tspecies\tspecies_abundance\t"
63 s = '%s\t%s_id\t%s_name\t%s_abundance\n' % (s, args.type, args.type, args.type) 61 s = "%s\t%s_id\t%s_name\t%s_abundance\n" % (s, args.type, args.type, args.type)
64 output_f.write(s) 62 output_f.write(s)
65 with open(args.humann2_fp, 'r') as humann2_f: 63 with open(args.humann2_fp, "r") as humann2_f:
66 for line in humann2_f.readlines(): 64 for line in humann2_f.readlines():
67 if line.find('|') == -1: 65 if line.find("|") == -1:
68 continue 66 continue
69 67
70 split_line = line[:-1].split('\t') 68 split_line = line[:-1].split("\t")
71 abundance = 100*float(split_line[1])/overall_abund 69 abundance = 100 * float(split_line[1]) / overall_abund
72 annotation = split_line[0].split('|') 70 annotation = split_line[0].split("|")
73 charact = annotation[0].split(':') 71 charact = annotation[0].split(":")
74 charact_id = charact[0] 72 charact_id = charact[0]
75 char_name = '' 73 char_name = ""
76 if len(charact) > 1: 74 if len(charact) > 1:
77 char_name = format_characteristic_name(charact[-1]) 75 char_name = format_characteristic_name(charact[-1])
78 taxo = annotation[1].split('.') 76 taxo = annotation[1].split(".")
79 77
80 if taxo[0] == 'unclassified': 78 if taxo[0] == "unclassified":
81 continue 79 continue
82 genus = taxo[0][3:] 80 genus = taxo[0][3:]
83 species = taxo[1][3:] 81 species = taxo[1][3:]
84 82
85 if genus not in clade_abund: 83 if genus not in clade_abund:
86 print("no %s found in %s" % (genus, args.metaphlan2_fp)) 84 print("no %s found in %s" % (genus, args.metaphlan2_fp))
87 continue 85 continue
88 if species not in clade_abund[genus]['species']: 86 if species not in clade_abund[genus]["species"]:
89 print("no %s found in %s for % s" % (species, args.metaphlan2_fp, genus)) 87 print(
88 "no %s found in %s for % s"
89 % (species, args.metaphlan2_fp, genus)
90 )
90 continue 91 continue
91 92
92 s = "%s\t%s\t" % (genus, clade_abund[genus]['abundance']) 93 s = "%s\t%s\t" % (genus, clade_abund[genus]["abundance"])
93 s += "%s\t%s\t" % (species, clade_abund[genus]['species'][species]) 94 s += "%s\t%s\t" % (species, clade_abund[genus]["species"][species])
94 s += "%s\t%s\t%s\n" % (charact_id, char_name, abundance) 95 s += "%s\t%s\t%s\n" % (charact_id, char_name, abundance)
95 output_f.write(s) 96 output_f.write(s)
96 97
97 98
98 if __name__ == '__main__': 99 if __name__ == "__main__":
99 parser = argparse.ArgumentParser() 100 parser = argparse.ArgumentParser()
100 parser.add_argument('--humann2_fp', required=True) 101 parser.add_argument("--humann2_fp", required=True)
101 parser.add_argument('--metaphlan2_fp', required=True) 102 parser.add_argument("--metaphlan2_fp", required=True)
102 parser.add_argument('--output_fp', required=True) 103 parser.add_argument("--output_fp", required=True)
103 parser.add_argument( 104 parser.add_argument("--type", required=True, choices=["gene_families", "pathways"])
104 '--type',
105 required=True,
106 choices=['gene_families', 'pathways'])
107 args = parser.parse_args() 105 args = parser.parse_args()
108 106
109 combine_metaphlan2_humann2(args) 107 combine_metaphlan2_humann2(args)