Mercurial > repos > bebatut > combine_metaphlan2_humann2
comparison combine_metaphlan2_humann2.py @ 0:31394a0c0242 draft
planemo upload for repository https://github.com/asaim/galaxytools/tree/master/tools/combine_metaphlan2_humann2 commit e6bee6545960c2a1ae3ca3031ec74d7c26d0b0ce-dirty
| author | bebatut |
|---|---|
| date | Fri, 15 Apr 2016 09:15:21 -0400 |
| parents | |
| children | e25efca0a49c |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:31394a0c0242 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # -*- coding: utf-8 -*- | |
| 3 | |
| 4 import sys | |
| 5 import os | |
| 6 import argparse | |
| 7 import re | |
| 8 | |
| 9 def extract_clade_abundance(metaphlan2_filepath): | |
| 10 clade_abundance = {} | |
| 11 with open(metaphlan2_filepath, 'r') as metaphlan2_file: | |
| 12 for line in metaphlan2_file.readlines(): | |
| 13 if line.find('g__') == -1: | |
| 14 continue | |
| 15 | |
| 16 split_line = line[:-1].split('\t') | |
| 17 taxo = split_line[0] | |
| 18 abundance = split_line[1] | |
| 19 | |
| 20 genus = taxo[(taxo.find('g__')+3):] | |
| 21 if genus.find('|') != -1: | |
| 22 genus = genus[:(genus.find('|'))] | |
| 23 clade_abundance.setdefault(genus, {'abundance':0, 'species':{}}) | |
| 24 if taxo.find('t__') != -1: | |
| 25 continue | |
| 26 elif taxo.find('s__') != -1: | |
| 27 species = taxo[(taxo.find('s__')+3):] | |
| 28 clade_abundance[genus]['species'].setdefault(species, abundance) | |
| 29 else: | |
| 30 clade_abundance[genus]['abundance'] = abundance | |
| 31 return clade_abundance | |
| 32 | |
| 33 def compute_overall_abundance(humann2_file): | |
| 34 overall_abundance = 0 | |
| 35 with open(args.humann2_file, 'r') as humann2_file: | |
| 36 for line in humann2_file.readlines(): | |
| 37 if line.find('|') != -1 or line.startswith('#'): | |
| 38 continue | |
| 39 split_line = line[:-1].split('\t') | |
| 40 overall_abundance += float(split_line[1]) | |
| 41 return overall_abundance | |
| 42 | |
| 43 def format_characteristic_name(name): | |
| 44 formatted_name = name | |
| 45 formatted_name = formatted_name.replace('/',' ') | |
| 46 formatted_name = formatted_name.replace('-',' ') | |
| 47 formatted_name = formatted_name.replace("'",'') | |
| 48 if formatted_name.find('(') != -1 and formatted_name.find(')') != -1: | |
| 49 open_bracket = formatted_name.find('(') | |
| 50 close_bracket = formatted_name.find(')')+1 | |
| 51 formatted_name = formatted_name[:open_bracket] + formatted_name[close_bracket:] | |
| 52 return formatted_name | |
| 53 | |
| 54 def combine_metaphlan2_humann2(args): | |
| 55 clade_abundance = extract_clade_abundance(args.metaphlan2_file) | |
| 56 overall_abundance = compute_overall_abundance(args.humann2_file) | |
| 57 | |
| 58 with open(args.output_file, 'w') as output_file: | |
| 59 output_file.write('genus\t') | |
| 60 output_file.write('genus_abundance\t') | |
| 61 output_file.write('species\t') | |
| 62 output_file.write('species_abundance\t') | |
| 63 output_file.write(args.type + '_id\t') | |
| 64 output_file.write(args.type + '_name\t') | |
| 65 output_file.write(args.type + '_abundance\n') | |
| 66 with open(args.humann2_file, 'r') as humann2_file: | |
| 67 for line in humann2_file.readlines(): | |
| 68 if line.find('|') == -1: | |
| 69 continue | |
| 70 | |
| 71 split_line = line[:-1].split('\t') | |
| 72 abundance = 100*float(split_line[1])/overall_abundance | |
| 73 annotation = split_line[0].split('|') | |
| 74 characteristic = annotation[0].split(':') | |
| 75 characteristic_id = characteristic[0] | |
| 76 characteristic_name = '' | |
| 77 if len(characteristic) > 1: | |
| 78 characteristic_name = format_characteristic_name(characteristic[-1]) | |
| 79 taxo = annotation[1].split('.') | |
| 80 | |
| 81 if taxo[0] == 'unclassified': | |
| 82 continue | |
| 83 genus = taxo[0][3:] | |
| 84 species = taxo[1][3:] | |
| 85 | |
| 86 if not clade_abundance.has_key(genus): | |
| 87 print "no", genus, "found in", args.metaphlan2_file | |
| 88 continue | |
| 89 if not clade_abundance[genus]['species'].has_key(species): | |
| 90 print "no", species, "found in", args.metaphlan2_file, | |
| 91 print "for", genus | |
| 92 continue | |
| 93 output_file.write(genus + '\t') | |
| 94 output_file.write(clade_abundance[genus]['abundance'] + '\t') | |
| 95 output_file.write(species + '\t') | |
| 96 output_file.write(clade_abundance[genus]['species'][species] + '\t') | |
| 97 output_file.write(characteristic_id + '\t') | |
| 98 output_file.write(characteristic_name + '\t') | |
| 99 output_file.write(str(abundance) + '\n') | |
| 100 | |
| 101 if __name__ == '__main__': | |
| 102 parser = argparse.ArgumentParser() | |
| 103 parser.add_argument('--humann2_file', required=True) | |
| 104 parser.add_argument('--metaphlan2_file', required=True) | |
| 105 parser.add_argument('--output_file', required=True) | |
| 106 parser.add_argument('--type', required=True, | |
| 107 choices = ['gene_families','pathways']) | |
| 108 args = parser.parse_args() | |
| 109 | |
| 110 combine_metaphlan2_humann2(args) |
