Mercurial > repos > bebatut > combine_metaphlan2_humann2
comparison combine_metaphlan2_humann2.py @ 2:fdfb35745104 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/combine_metaphlan2_humann2 commit eea46077010e699403ce6995d7d4aac77b2e0b43"
author | bgruening |
---|---|
date | Wed, 19 Oct 2022 14:44:00 +0000 |
parents | e25efca0a49c |
children |
comparison
equal
deleted
inserted
replaced
1:e25efca0a49c | 2:fdfb35745104 |
---|---|
4 import argparse | 4 import argparse |
5 | 5 |
6 | 6 |
7 def extract_clade_abundance(metaphlan2_fp): | 7 def extract_clade_abundance(metaphlan2_fp): |
8 clade_abund = {} | 8 clade_abund = {} |
9 with open(metaphlan2_fp, 'r') as metaphlan2_f: | 9 with open(metaphlan2_fp, "r") as metaphlan2_f: |
10 for line in metaphlan2_f.readlines(): | 10 for line in metaphlan2_f.readlines(): |
11 if line.find('g__') == -1: | 11 if line.find("g__") == -1: |
12 continue | 12 continue |
13 | 13 |
14 split_line = line[:-1].split('\t') | 14 split_line = line[:-1].split("\t") |
15 taxo = split_line[0] | 15 taxo = split_line[0] |
16 abundance = split_line[1] | 16 abundance = split_line[1] |
17 | 17 |
18 genus = taxo[(taxo.find('g__')+3):] | 18 genus = taxo[(taxo.find("g__") + 3):] |
19 if genus.find('|') != -1: | 19 if genus.find("|") != -1: |
20 genus = genus[:(genus.find('|'))] | 20 genus = genus[: (genus.find("|"))] |
21 clade_abund.setdefault(genus, {'abundance': 0, 'species': {}}) | 21 clade_abund.setdefault(genus, {"abundance": 0, "species": {}}) |
22 if taxo.find('t__') != -1: | 22 if taxo.find("t__") != -1: |
23 continue | 23 continue |
24 elif taxo.find('s__') != -1: | 24 elif taxo.find("s__") != -1: |
25 species = taxo[(taxo.find('s__')+3):] | 25 species = taxo[(taxo.find("s__") + 3):] |
26 clade_abund[genus]['species'].setdefault( | 26 clade_abund[genus]["species"].setdefault(species, abundance) |
27 species, | |
28 abundance) | |
29 else: | 27 else: |
30 clade_abund[genus]['abundance'] = abundance | 28 clade_abund[genus]["abundance"] = abundance |
31 return clade_abund | 29 return clade_abund |
32 | 30 |
33 | 31 |
34 def compute_overall_abundance(humann2_fp): | 32 def compute_overall_abundance(humann2_fp): |
35 overall_abundance = 0 | 33 overall_abundance = 0 |
36 with open(humann2_fp, 'r') as humann2_f: | 34 with open(humann2_fp, "r") as humann2_f: |
37 for line in humann2_f.readlines(): | 35 for line in humann2_f.readlines(): |
38 if line.find('|') != -1 or line.startswith('#'): | 36 if line.find("|") != -1 or line.startswith("#"): |
39 continue | 37 continue |
40 split_line = line[:-1].split('\t') | 38 split_line = line[:-1].split("\t") |
41 overall_abundance += float(split_line[1]) | 39 overall_abundance += float(split_line[1]) |
42 return overall_abundance | 40 return overall_abundance |
43 | 41 |
44 | 42 |
45 def format_characteristic_name(name): | 43 def format_characteristic_name(name): |
46 formatted_n = name | 44 formatted_n = name |
47 formatted_n = formatted_n.replace('/', ' ') | 45 formatted_n = formatted_n.replace("/", " ") |
48 formatted_n = formatted_n.replace('-', ' ') | 46 formatted_n = formatted_n.replace("-", " ") |
49 formatted_n = formatted_n.replace("'", '') | 47 formatted_n = formatted_n.replace("'", "") |
50 if formatted_n.find('(') != -1 and formatted_n.find(')') != -1: | 48 if formatted_n.find("(") != -1 and formatted_n.find(")") != -1: |
51 open_bracket = formatted_n.find('(') | 49 open_bracket = formatted_n.find("(") |
52 close_bracket = formatted_n.find(')')+1 | 50 close_bracket = formatted_n.find(")") + 1 |
53 formatted_n = formatted_n[:open_bracket] + formatted_n[close_bracket:] | 51 formatted_n = formatted_n[:open_bracket] + formatted_n[close_bracket:] |
54 return formatted_n | 52 return formatted_n |
55 | 53 |
56 | 54 |
57 def combine_metaphlan2_humann2(args): | 55 def combine_metaphlan2_humann2(args): |
58 clade_abund = extract_clade_abundance(args.metaphlan2_fp) | 56 clade_abund = extract_clade_abundance(args.metaphlan2_fp) |
59 overall_abund = compute_overall_abundance(args.humann2_fp) | 57 overall_abund = compute_overall_abundance(args.humann2_fp) |
60 | 58 |
61 with open(args.output_fp, 'w') as output_f: | 59 with open(args.output_fp, "w") as output_f: |
62 s = 'genus\tgenus_abundance\tspecies\tspecies_abundance\t' | 60 s = "genus\tgenus_abundance\tspecies\tspecies_abundance\t" |
63 s = '%s\t%s_id\t%s_name\t%s_abundance\n' % (s, args.type, args.type, args.type) | 61 s = "%s\t%s_id\t%s_name\t%s_abundance\n" % (s, args.type, args.type, args.type) |
64 output_f.write(s) | 62 output_f.write(s) |
65 with open(args.humann2_fp, 'r') as humann2_f: | 63 with open(args.humann2_fp, "r") as humann2_f: |
66 for line in humann2_f.readlines(): | 64 for line in humann2_f.readlines(): |
67 if line.find('|') == -1: | 65 if line.find("|") == -1: |
68 continue | 66 continue |
69 | 67 |
70 split_line = line[:-1].split('\t') | 68 split_line = line[:-1].split("\t") |
71 abundance = 100*float(split_line[1])/overall_abund | 69 abundance = 100 * float(split_line[1]) / overall_abund |
72 annotation = split_line[0].split('|') | 70 annotation = split_line[0].split("|") |
73 charact = annotation[0].split(':') | 71 charact = annotation[0].split(":") |
74 charact_id = charact[0] | 72 charact_id = charact[0] |
75 char_name = '' | 73 char_name = "" |
76 if len(charact) > 1: | 74 if len(charact) > 1: |
77 char_name = format_characteristic_name(charact[-1]) | 75 char_name = format_characteristic_name(charact[-1]) |
78 taxo = annotation[1].split('.') | 76 taxo = annotation[1].split(".") |
79 | 77 |
80 if taxo[0] == 'unclassified': | 78 if taxo[0] == "unclassified": |
81 continue | 79 continue |
82 genus = taxo[0][3:] | 80 genus = taxo[0][3:] |
83 species = taxo[1][3:] | 81 species = taxo[1][3:] |
84 | 82 |
85 if genus not in clade_abund: | 83 if genus not in clade_abund: |
86 print("no %s found in %s" % (genus, args.metaphlan2_fp)) | 84 print("no %s found in %s" % (genus, args.metaphlan2_fp)) |
87 continue | 85 continue |
88 if species not in clade_abund[genus]['species']: | 86 if species not in clade_abund[genus]["species"]: |
89 print("no %s found in %s for % s" % (species, args.metaphlan2_fp, genus)) | 87 print( |
88 "no %s found in %s for % s" | |
89 % (species, args.metaphlan2_fp, genus) | |
90 ) | |
90 continue | 91 continue |
91 | 92 |
92 s = "%s\t%s\t" % (genus, clade_abund[genus]['abundance']) | 93 s = "%s\t%s\t" % (genus, clade_abund[genus]["abundance"]) |
93 s += "%s\t%s\t" % (species, clade_abund[genus]['species'][species]) | 94 s += "%s\t%s\t" % (species, clade_abund[genus]["species"][species]) |
94 s += "%s\t%s\t%s\n" % (charact_id, char_name, abundance) | 95 s += "%s\t%s\t%s\n" % (charact_id, char_name, abundance) |
95 output_f.write(s) | 96 output_f.write(s) |
96 | 97 |
97 | 98 |
98 if __name__ == '__main__': | 99 if __name__ == "__main__": |
99 parser = argparse.ArgumentParser() | 100 parser = argparse.ArgumentParser() |
100 parser.add_argument('--humann2_fp', required=True) | 101 parser.add_argument("--humann2_fp", required=True) |
101 parser.add_argument('--metaphlan2_fp', required=True) | 102 parser.add_argument("--metaphlan2_fp", required=True) |
102 parser.add_argument('--output_fp', required=True) | 103 parser.add_argument("--output_fp", required=True) |
103 parser.add_argument( | 104 parser.add_argument("--type", required=True, choices=["gene_families", "pathways"]) |
104 '--type', | |
105 required=True, | |
106 choices=['gene_families', 'pathways']) | |
107 args = parser.parse_args() | 105 args = parser.parse_args() |
108 | 106 |
109 combine_metaphlan2_humann2(args) | 107 combine_metaphlan2_humann2(args) |