comparison combine_metaphlan2_humann2.py @ 0:31394a0c0242 draft

planemo upload for repository https://github.com/asaim/galaxytools/tree/master/tools/combine_metaphlan2_humann2 commit e6bee6545960c2a1ae3ca3031ec74d7c26d0b0ce-dirty
author bebatut
date Fri, 15 Apr 2016 09:15:21 -0400
parents
children e25efca0a49c
comparison
equal deleted inserted replaced
-1:000000000000 0:31394a0c0242
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 import os
6 import argparse
7 import re
8
9 def extract_clade_abundance(metaphlan2_filepath):
10 clade_abundance = {}
11 with open(metaphlan2_filepath, 'r') as metaphlan2_file:
12 for line in metaphlan2_file.readlines():
13 if line.find('g__') == -1:
14 continue
15
16 split_line = line[:-1].split('\t')
17 taxo = split_line[0]
18 abundance = split_line[1]
19
20 genus = taxo[(taxo.find('g__')+3):]
21 if genus.find('|') != -1:
22 genus = genus[:(genus.find('|'))]
23 clade_abundance.setdefault(genus, {'abundance':0, 'species':{}})
24 if taxo.find('t__') != -1:
25 continue
26 elif taxo.find('s__') != -1:
27 species = taxo[(taxo.find('s__')+3):]
28 clade_abundance[genus]['species'].setdefault(species, abundance)
29 else:
30 clade_abundance[genus]['abundance'] = abundance
31 return clade_abundance
32
33 def compute_overall_abundance(humann2_file):
34 overall_abundance = 0
35 with open(args.humann2_file, 'r') as humann2_file:
36 for line in humann2_file.readlines():
37 if line.find('|') != -1 or line.startswith('#'):
38 continue
39 split_line = line[:-1].split('\t')
40 overall_abundance += float(split_line[1])
41 return overall_abundance
42
43 def format_characteristic_name(name):
44 formatted_name = name
45 formatted_name = formatted_name.replace('/',' ')
46 formatted_name = formatted_name.replace('-',' ')
47 formatted_name = formatted_name.replace("'",'')
48 if formatted_name.find('(') != -1 and formatted_name.find(')') != -1:
49 open_bracket = formatted_name.find('(')
50 close_bracket = formatted_name.find(')')+1
51 formatted_name = formatted_name[:open_bracket] + formatted_name[close_bracket:]
52 return formatted_name
53
54 def combine_metaphlan2_humann2(args):
55 clade_abundance = extract_clade_abundance(args.metaphlan2_file)
56 overall_abundance = compute_overall_abundance(args.humann2_file)
57
58 with open(args.output_file, 'w') as output_file:
59 output_file.write('genus\t')
60 output_file.write('genus_abundance\t')
61 output_file.write('species\t')
62 output_file.write('species_abundance\t')
63 output_file.write(args.type + '_id\t')
64 output_file.write(args.type + '_name\t')
65 output_file.write(args.type + '_abundance\n')
66 with open(args.humann2_file, 'r') as humann2_file:
67 for line in humann2_file.readlines():
68 if line.find('|') == -1:
69 continue
70
71 split_line = line[:-1].split('\t')
72 abundance = 100*float(split_line[1])/overall_abundance
73 annotation = split_line[0].split('|')
74 characteristic = annotation[0].split(':')
75 characteristic_id = characteristic[0]
76 characteristic_name = ''
77 if len(characteristic) > 1:
78 characteristic_name = format_characteristic_name(characteristic[-1])
79 taxo = annotation[1].split('.')
80
81 if taxo[0] == 'unclassified':
82 continue
83 genus = taxo[0][3:]
84 species = taxo[1][3:]
85
86 if not clade_abundance.has_key(genus):
87 print "no", genus, "found in", args.metaphlan2_file
88 continue
89 if not clade_abundance[genus]['species'].has_key(species):
90 print "no", species, "found in", args.metaphlan2_file,
91 print "for", genus
92 continue
93 output_file.write(genus + '\t')
94 output_file.write(clade_abundance[genus]['abundance'] + '\t')
95 output_file.write(species + '\t')
96 output_file.write(clade_abundance[genus]['species'][species] + '\t')
97 output_file.write(characteristic_id + '\t')
98 output_file.write(characteristic_name + '\t')
99 output_file.write(str(abundance) + '\n')
100
101 if __name__ == '__main__':
102 parser = argparse.ArgumentParser()
103 parser.add_argument('--humann2_file', required=True)
104 parser.add_argument('--metaphlan2_file', required=True)
105 parser.add_argument('--output_file', required=True)
106 parser.add_argument('--type', required=True,
107 choices = ['gene_families','pathways'])
108 args = parser.parse_args()
109
110 combine_metaphlan2_humann2(args)