Mercurial > repos > bebatut > combine_metaphlan2_humann2
comparison combine_metaphlan2_humann2.py @ 2:fdfb35745104 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/combine_metaphlan2_humann2 commit eea46077010e699403ce6995d7d4aac77b2e0b43"
| author | bgruening |
|---|---|
| date | Wed, 19 Oct 2022 14:44:00 +0000 |
| parents | e25efca0a49c |
| children |
comparison
equal
deleted
inserted
replaced
| 1:e25efca0a49c | 2:fdfb35745104 |
|---|---|
| 4 import argparse | 4 import argparse |
| 5 | 5 |
| 6 | 6 |
| 7 def extract_clade_abundance(metaphlan2_fp): | 7 def extract_clade_abundance(metaphlan2_fp): |
| 8 clade_abund = {} | 8 clade_abund = {} |
| 9 with open(metaphlan2_fp, 'r') as metaphlan2_f: | 9 with open(metaphlan2_fp, "r") as metaphlan2_f: |
| 10 for line in metaphlan2_f.readlines(): | 10 for line in metaphlan2_f.readlines(): |
| 11 if line.find('g__') == -1: | 11 if line.find("g__") == -1: |
| 12 continue | 12 continue |
| 13 | 13 |
| 14 split_line = line[:-1].split('\t') | 14 split_line = line[:-1].split("\t") |
| 15 taxo = split_line[0] | 15 taxo = split_line[0] |
| 16 abundance = split_line[1] | 16 abundance = split_line[1] |
| 17 | 17 |
| 18 genus = taxo[(taxo.find('g__')+3):] | 18 genus = taxo[(taxo.find("g__") + 3):] |
| 19 if genus.find('|') != -1: | 19 if genus.find("|") != -1: |
| 20 genus = genus[:(genus.find('|'))] | 20 genus = genus[: (genus.find("|"))] |
| 21 clade_abund.setdefault(genus, {'abundance': 0, 'species': {}}) | 21 clade_abund.setdefault(genus, {"abundance": 0, "species": {}}) |
| 22 if taxo.find('t__') != -1: | 22 if taxo.find("t__") != -1: |
| 23 continue | 23 continue |
| 24 elif taxo.find('s__') != -1: | 24 elif taxo.find("s__") != -1: |
| 25 species = taxo[(taxo.find('s__')+3):] | 25 species = taxo[(taxo.find("s__") + 3):] |
| 26 clade_abund[genus]['species'].setdefault( | 26 clade_abund[genus]["species"].setdefault(species, abundance) |
| 27 species, | |
| 28 abundance) | |
| 29 else: | 27 else: |
| 30 clade_abund[genus]['abundance'] = abundance | 28 clade_abund[genus]["abundance"] = abundance |
| 31 return clade_abund | 29 return clade_abund |
| 32 | 30 |
| 33 | 31 |
| 34 def compute_overall_abundance(humann2_fp): | 32 def compute_overall_abundance(humann2_fp): |
| 35 overall_abundance = 0 | 33 overall_abundance = 0 |
| 36 with open(humann2_fp, 'r') as humann2_f: | 34 with open(humann2_fp, "r") as humann2_f: |
| 37 for line in humann2_f.readlines(): | 35 for line in humann2_f.readlines(): |
| 38 if line.find('|') != -1 or line.startswith('#'): | 36 if line.find("|") != -1 or line.startswith("#"): |
| 39 continue | 37 continue |
| 40 split_line = line[:-1].split('\t') | 38 split_line = line[:-1].split("\t") |
| 41 overall_abundance += float(split_line[1]) | 39 overall_abundance += float(split_line[1]) |
| 42 return overall_abundance | 40 return overall_abundance |
| 43 | 41 |
| 44 | 42 |
| 45 def format_characteristic_name(name): | 43 def format_characteristic_name(name): |
| 46 formatted_n = name | 44 formatted_n = name |
| 47 formatted_n = formatted_n.replace('/', ' ') | 45 formatted_n = formatted_n.replace("/", " ") |
| 48 formatted_n = formatted_n.replace('-', ' ') | 46 formatted_n = formatted_n.replace("-", " ") |
| 49 formatted_n = formatted_n.replace("'", '') | 47 formatted_n = formatted_n.replace("'", "") |
| 50 if formatted_n.find('(') != -1 and formatted_n.find(')') != -1: | 48 if formatted_n.find("(") != -1 and formatted_n.find(")") != -1: |
| 51 open_bracket = formatted_n.find('(') | 49 open_bracket = formatted_n.find("(") |
| 52 close_bracket = formatted_n.find(')')+1 | 50 close_bracket = formatted_n.find(")") + 1 |
| 53 formatted_n = formatted_n[:open_bracket] + formatted_n[close_bracket:] | 51 formatted_n = formatted_n[:open_bracket] + formatted_n[close_bracket:] |
| 54 return formatted_n | 52 return formatted_n |
| 55 | 53 |
| 56 | 54 |
| 57 def combine_metaphlan2_humann2(args): | 55 def combine_metaphlan2_humann2(args): |
| 58 clade_abund = extract_clade_abundance(args.metaphlan2_fp) | 56 clade_abund = extract_clade_abundance(args.metaphlan2_fp) |
| 59 overall_abund = compute_overall_abundance(args.humann2_fp) | 57 overall_abund = compute_overall_abundance(args.humann2_fp) |
| 60 | 58 |
| 61 with open(args.output_fp, 'w') as output_f: | 59 with open(args.output_fp, "w") as output_f: |
| 62 s = 'genus\tgenus_abundance\tspecies\tspecies_abundance\t' | 60 s = "genus\tgenus_abundance\tspecies\tspecies_abundance\t" |
| 63 s = '%s\t%s_id\t%s_name\t%s_abundance\n' % (s, args.type, args.type, args.type) | 61 s = "%s\t%s_id\t%s_name\t%s_abundance\n" % (s, args.type, args.type, args.type) |
| 64 output_f.write(s) | 62 output_f.write(s) |
| 65 with open(args.humann2_fp, 'r') as humann2_f: | 63 with open(args.humann2_fp, "r") as humann2_f: |
| 66 for line in humann2_f.readlines(): | 64 for line in humann2_f.readlines(): |
| 67 if line.find('|') == -1: | 65 if line.find("|") == -1: |
| 68 continue | 66 continue |
| 69 | 67 |
| 70 split_line = line[:-1].split('\t') | 68 split_line = line[:-1].split("\t") |
| 71 abundance = 100*float(split_line[1])/overall_abund | 69 abundance = 100 * float(split_line[1]) / overall_abund |
| 72 annotation = split_line[0].split('|') | 70 annotation = split_line[0].split("|") |
| 73 charact = annotation[0].split(':') | 71 charact = annotation[0].split(":") |
| 74 charact_id = charact[0] | 72 charact_id = charact[0] |
| 75 char_name = '' | 73 char_name = "" |
| 76 if len(charact) > 1: | 74 if len(charact) > 1: |
| 77 char_name = format_characteristic_name(charact[-1]) | 75 char_name = format_characteristic_name(charact[-1]) |
| 78 taxo = annotation[1].split('.') | 76 taxo = annotation[1].split(".") |
| 79 | 77 |
| 80 if taxo[0] == 'unclassified': | 78 if taxo[0] == "unclassified": |
| 81 continue | 79 continue |
| 82 genus = taxo[0][3:] | 80 genus = taxo[0][3:] |
| 83 species = taxo[1][3:] | 81 species = taxo[1][3:] |
| 84 | 82 |
| 85 if genus not in clade_abund: | 83 if genus not in clade_abund: |
| 86 print("no %s found in %s" % (genus, args.metaphlan2_fp)) | 84 print("no %s found in %s" % (genus, args.metaphlan2_fp)) |
| 87 continue | 85 continue |
| 88 if species not in clade_abund[genus]['species']: | 86 if species not in clade_abund[genus]["species"]: |
| 89 print("no %s found in %s for % s" % (species, args.metaphlan2_fp, genus)) | 87 print( |
| 88 "no %s found in %s for % s" | |
| 89 % (species, args.metaphlan2_fp, genus) | |
| 90 ) | |
| 90 continue | 91 continue |
| 91 | 92 |
| 92 s = "%s\t%s\t" % (genus, clade_abund[genus]['abundance']) | 93 s = "%s\t%s\t" % (genus, clade_abund[genus]["abundance"]) |
| 93 s += "%s\t%s\t" % (species, clade_abund[genus]['species'][species]) | 94 s += "%s\t%s\t" % (species, clade_abund[genus]["species"][species]) |
| 94 s += "%s\t%s\t%s\n" % (charact_id, char_name, abundance) | 95 s += "%s\t%s\t%s\n" % (charact_id, char_name, abundance) |
| 95 output_f.write(s) | 96 output_f.write(s) |
| 96 | 97 |
| 97 | 98 |
| 98 if __name__ == '__main__': | 99 if __name__ == "__main__": |
| 99 parser = argparse.ArgumentParser() | 100 parser = argparse.ArgumentParser() |
| 100 parser.add_argument('--humann2_fp', required=True) | 101 parser.add_argument("--humann2_fp", required=True) |
| 101 parser.add_argument('--metaphlan2_fp', required=True) | 102 parser.add_argument("--metaphlan2_fp", required=True) |
| 102 parser.add_argument('--output_fp', required=True) | 103 parser.add_argument("--output_fp", required=True) |
| 103 parser.add_argument( | 104 parser.add_argument("--type", required=True, choices=["gene_families", "pathways"]) |
| 104 '--type', | |
| 105 required=True, | |
| 106 choices=['gene_families', 'pathways']) | |
| 107 args = parser.parse_args() | 105 args = parser.parse_args() |
| 108 | 106 |
| 109 combine_metaphlan2_humann2(args) | 107 combine_metaphlan2_humann2(args) |
