Mercurial > repos > bebatut > combine_metaphlan2_humann2
comparison combine_metaphlan2_humann2.py @ 0:31394a0c0242 draft
planemo upload for repository https://github.com/asaim/galaxytools/tree/master/tools/combine_metaphlan2_humann2 commit e6bee6545960c2a1ae3ca3031ec74d7c26d0b0ce-dirty
author | bebatut |
---|---|
date | Fri, 15 Apr 2016 09:15:21 -0400 |
parents | |
children | e25efca0a49c |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:31394a0c0242 |
---|---|
1 #!/usr/bin/env python | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 import sys | |
5 import os | |
6 import argparse | |
7 import re | |
8 | |
9 def extract_clade_abundance(metaphlan2_filepath): | |
10 clade_abundance = {} | |
11 with open(metaphlan2_filepath, 'r') as metaphlan2_file: | |
12 for line in metaphlan2_file.readlines(): | |
13 if line.find('g__') == -1: | |
14 continue | |
15 | |
16 split_line = line[:-1].split('\t') | |
17 taxo = split_line[0] | |
18 abundance = split_line[1] | |
19 | |
20 genus = taxo[(taxo.find('g__')+3):] | |
21 if genus.find('|') != -1: | |
22 genus = genus[:(genus.find('|'))] | |
23 clade_abundance.setdefault(genus, {'abundance':0, 'species':{}}) | |
24 if taxo.find('t__') != -1: | |
25 continue | |
26 elif taxo.find('s__') != -1: | |
27 species = taxo[(taxo.find('s__')+3):] | |
28 clade_abundance[genus]['species'].setdefault(species, abundance) | |
29 else: | |
30 clade_abundance[genus]['abundance'] = abundance | |
31 return clade_abundance | |
32 | |
33 def compute_overall_abundance(humann2_file): | |
34 overall_abundance = 0 | |
35 with open(args.humann2_file, 'r') as humann2_file: | |
36 for line in humann2_file.readlines(): | |
37 if line.find('|') != -1 or line.startswith('#'): | |
38 continue | |
39 split_line = line[:-1].split('\t') | |
40 overall_abundance += float(split_line[1]) | |
41 return overall_abundance | |
42 | |
43 def format_characteristic_name(name): | |
44 formatted_name = name | |
45 formatted_name = formatted_name.replace('/',' ') | |
46 formatted_name = formatted_name.replace('-',' ') | |
47 formatted_name = formatted_name.replace("'",'') | |
48 if formatted_name.find('(') != -1 and formatted_name.find(')') != -1: | |
49 open_bracket = formatted_name.find('(') | |
50 close_bracket = formatted_name.find(')')+1 | |
51 formatted_name = formatted_name[:open_bracket] + formatted_name[close_bracket:] | |
52 return formatted_name | |
53 | |
54 def combine_metaphlan2_humann2(args): | |
55 clade_abundance = extract_clade_abundance(args.metaphlan2_file) | |
56 overall_abundance = compute_overall_abundance(args.humann2_file) | |
57 | |
58 with open(args.output_file, 'w') as output_file: | |
59 output_file.write('genus\t') | |
60 output_file.write('genus_abundance\t') | |
61 output_file.write('species\t') | |
62 output_file.write('species_abundance\t') | |
63 output_file.write(args.type + '_id\t') | |
64 output_file.write(args.type + '_name\t') | |
65 output_file.write(args.type + '_abundance\n') | |
66 with open(args.humann2_file, 'r') as humann2_file: | |
67 for line in humann2_file.readlines(): | |
68 if line.find('|') == -1: | |
69 continue | |
70 | |
71 split_line = line[:-1].split('\t') | |
72 abundance = 100*float(split_line[1])/overall_abundance | |
73 annotation = split_line[0].split('|') | |
74 characteristic = annotation[0].split(':') | |
75 characteristic_id = characteristic[0] | |
76 characteristic_name = '' | |
77 if len(characteristic) > 1: | |
78 characteristic_name = format_characteristic_name(characteristic[-1]) | |
79 taxo = annotation[1].split('.') | |
80 | |
81 if taxo[0] == 'unclassified': | |
82 continue | |
83 genus = taxo[0][3:] | |
84 species = taxo[1][3:] | |
85 | |
86 if not clade_abundance.has_key(genus): | |
87 print "no", genus, "found in", args.metaphlan2_file | |
88 continue | |
89 if not clade_abundance[genus]['species'].has_key(species): | |
90 print "no", species, "found in", args.metaphlan2_file, | |
91 print "for", genus | |
92 continue | |
93 output_file.write(genus + '\t') | |
94 output_file.write(clade_abundance[genus]['abundance'] + '\t') | |
95 output_file.write(species + '\t') | |
96 output_file.write(clade_abundance[genus]['species'][species] + '\t') | |
97 output_file.write(characteristic_id + '\t') | |
98 output_file.write(characteristic_name + '\t') | |
99 output_file.write(str(abundance) + '\n') | |
100 | |
101 if __name__ == '__main__': | |
102 parser = argparse.ArgumentParser() | |
103 parser.add_argument('--humann2_file', required=True) | |
104 parser.add_argument('--metaphlan2_file', required=True) | |
105 parser.add_argument('--output_file', required=True) | |
106 parser.add_argument('--type', required=True, | |
107 choices = ['gene_families','pathways']) | |
108 args = parser.parse_args() | |
109 | |
110 combine_metaphlan2_humann2(args) |