Mercurial > repos > thanhlv > merge_metaphlan_tables
comparison formatoutput.py @ 0:7a090a1bd6b5 draft default tip
planemo upload for repository https://github.com/quadram-institute-bioscience/galaxy-tools/tree/master/tools/metaphlan/
| author | thanhlv |
|---|---|
| date | Mon, 13 Feb 2023 11:37:33 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:7a090a1bd6b5 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # -*- coding: utf-8 -*- | |
| 3 | |
| 4 import argparse | |
| 5 import re | |
| 6 from pathlib import Path | |
| 7 | |
| 8 taxo_level = { | |
| 9 'k': 'kingdom', | |
| 10 'p': 'phylum', | |
| 11 'c': 'class', | |
| 12 'o': 'order', | |
| 13 'f': 'family', | |
| 14 'g': 'genus', | |
| 15 's': 'species', | |
| 16 't': 'strains'} | |
| 17 | |
| 18 | |
| 19 def split_levels(metaphlan_output_fp, out_dp, legacy_output): | |
| 20 ''' | |
| 21 Split default MetaPhlAn into a report for each taxonomic level | |
| 22 | |
| 23 :param metaphlan_output_fp: Path default MetaPhlAn output | |
| 24 :param out_dp: Path to output directory | |
| 25 :param legacy_output: Boolean for legacy output | |
| 26 ''' | |
| 27 # prepare output files | |
| 28 abund_f = { | |
| 29 'k': open(out_dp / Path('kingdom'), 'w'), | |
| 30 'p': open(out_dp / Path('phylum'), 'w'), | |
| 31 'c': open(out_dp / Path('class'), 'w'), | |
| 32 'o': open(out_dp / Path('order'), 'w'), | |
| 33 'f': open(out_dp / Path('family'), 'w'), | |
| 34 'g': open(out_dp / Path('genus'), 'w'), | |
| 35 's': open(out_dp / Path('species'), 'w'), | |
| 36 't': open(out_dp / Path('strains'), 'w') | |
| 37 } | |
| 38 for level in abund_f: | |
| 39 abund_f[level].write("%s\t" % taxo_level[level]) | |
| 40 if not legacy_output: | |
| 41 abund_f[level].write("%s_id\t" % taxo_level[level]) | |
| 42 abund_f[level].write("abundance\n") | |
| 43 | |
| 44 levels_number = len(taxo_level) | |
| 45 | |
| 46 with open(metaphlan_output_fp, 'r') as metaphlan_output_f: | |
| 47 with open(out_dp / Path('all'), 'w') as all_level_f: | |
| 48 # write header in all leve file | |
| 49 for level in ['k', 'p', 'c', 'o', 'f', 'g', 's', 't']: | |
| 50 all_level_f.write("%s\t" % taxo_level[level]) | |
| 51 if not legacy_output: | |
| 52 all_level_f.write("%s_id\t" % taxo_level[level]) | |
| 53 all_level_f.write("abundance\n") | |
| 54 | |
| 55 # parse metaphlan file | |
| 56 for line in metaphlan_output_f.readlines(): | |
| 57 # skip headers | |
| 58 if line.startswith("#"): | |
| 59 continue | |
| 60 | |
| 61 # skip UNKNOWN (v3) or UNCLASSIFIED (v4) lines in predicted taxon relative abundances | |
| 62 if "UNKNOWN" in line or 'UNCLASSIFIED' in line: | |
| 63 continue | |
| 64 | |
| 65 # spit lines | |
| 66 split_line = line[:-1].split('\t') | |
| 67 taxo_n = split_line[0].split('|') | |
| 68 if legacy_output: | |
| 69 abundance = split_line[1] | |
| 70 else: | |
| 71 taxo_id = split_line[1].split('|') | |
| 72 abundance = split_line[2] | |
| 73 | |
| 74 # get taxon name and ids | |
| 75 for i in range(len(taxo_n)): | |
| 76 taxo = taxo_n[i].split('__')[1] | |
| 77 taxo = taxo.replace("_", " ") | |
| 78 all_level_f.write("%s\t" % taxo) | |
| 79 if not legacy_output: | |
| 80 all_level_f.write("%s\t" % taxo_id[i]) | |
| 81 | |
| 82 # if not all taxon levels | |
| 83 for i in range(len(taxo_n), levels_number): | |
| 84 all_level_f.write('\t') | |
| 85 | |
| 86 all_level_f.write("%s\n" % abundance) | |
| 87 | |
| 88 # write | |
| 89 last_taxo_level = taxo_n[-1].split('__') | |
| 90 taxo = last_taxo_level[1].replace("_", " ") | |
| 91 level = last_taxo_level[0] | |
| 92 abund_f[level].write("%s\t" % taxo) | |
| 93 if not legacy_output: | |
| 94 abund_f[level].write("%s\t" % taxo_id[-1]) | |
| 95 abund_f[level].write("%s\n" % abundance) | |
| 96 | |
| 97 # close files | |
| 98 for taxo_level_f in abund_f: | |
| 99 abund_f[taxo_level_f].close() | |
| 100 | |
| 101 | |
| 102 def format_for_krona(metaphlan_output_fp, krona_out_fp): | |
| 103 ''' | |
| 104 Split default MetaPhlAn into a report for each taxonomic levKRONAel | |
| 105 | |
| 106 :param metaphlan_output_fp: Path default MetaPhlAn output | |
| 107 :param krona_out: Path to output file for Krona | |
| 108 ''' | |
| 109 re_replace = re.compile(r"\w__") | |
| 110 re_bar = re.compile(r"\|") | |
| 111 re_underscore = re.compile(r"_") | |
| 112 | |
| 113 with open(metaphlan_output_fp, 'r') as metaphlan_output_f: | |
| 114 with open(krona_out_fp, 'w') as krona_out_f: | |
| 115 for line in metaphlan_output_f.readlines(): | |
| 116 if "s__" in line: | |
| 117 x = line.rstrip().split('\t') | |
| 118 lineage = re.sub(re_bar, '', x[0]) | |
| 119 lineage = re.sub(re_replace, '\t', lineage) | |
| 120 lineage = re.sub(re_underscore, ' ', lineage) | |
| 121 krona_out_f.write("%s\t%s\n" % (x[-1], lineage)) | |
| 122 | |
| 123 | |
| 124 if __name__ == '__main__': | |
| 125 parser = argparse.ArgumentParser(description='Format MetaPhlAn output') | |
| 126 subparsers = parser.add_subparsers(dest='function') | |
| 127 # split_levels | |
| 128 split_levels_parser = subparsers.add_parser('split_levels', help='Split default MetaPhlAn into a report for each taxonomic level') | |
| 129 split_levels_parser.add_argument('--metaphlan_output', help="Path to default MetaPhlAn output") | |
| 130 split_levels_parser.add_argument('--outdir', help="Path to output directory") | |
| 131 split_levels_parser.add_argument('--legacy-output', dest='legacy_output', action='store_true', help="Old MetaPhlAn2 two columns output") | |
| 132 split_levels_parser.set_defaults(legacy_output=False) | |
| 133 # format_for_krona | |
| 134 format_for_krona_parser = subparsers.add_parser('format_for_krona', help='Split default MetaPhlAn into a report for each taxonomic level') | |
| 135 format_for_krona_parser.add_argument('--metaphlan_output', help="Path to default MetaPhlAn output") | |
| 136 format_for_krona_parser.add_argument('--krona_output', help="Path to Krona output directory") | |
| 137 | |
| 138 args = parser.parse_args() | |
| 139 | |
| 140 if args.function == 'split_levels': | |
| 141 split_levels( | |
| 142 Path(args.metaphlan_output), | |
| 143 Path(args.outdir), | |
| 144 args.legacy_output) | |
| 145 elif args.function == 'format_for_krona': | |
| 146 format_for_krona( | |
| 147 Path(args.metaphlan_output), | |
| 148 Path(args.krona_output)) |
