changeset 0:00912e6e974f draft

metaphlan2 Huttenhower Lab: Initial upload
author george-weingart
date Sun, 24 Apr 2016 13:19:46 -0400
parents
children 085b26768dae
files metaphlan2_hutlab/metaphlan2.py metaphlan2_hutlab/metaphlan2.xml metaphlan2_hutlab/tool_dependencies.xml
diffstat 3 files changed, 1527 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metaphlan2_hutlab/metaphlan2.py	Sun Apr 24 13:19:46 2016 -0400
@@ -0,0 +1,1283 @@
+#!/usr/bin/env python
+
+from __future__ import with_statement 
+
+# ==============================================================================
+# MetaPhlAn v2.x: METAgenomic PHyLogenetic ANalysis for taxonomic classification
+#                 of metagenomic data
+#
+# Authors: Nicola Segata (nicola.segata@unitn.it), 
+#          Duy Tin Truong (duytin.truong@unitn.it)
+#
+# Please type "./metaphlan2.py -h" for usage help
+#
+# ==============================================================================
+
+__author__ = 'Nicola Segata (nicola.segata@unitn.it), Duy Tin Truong (duytin.truong@unitn.it)'
+__version__ = '2.5.0'
+__date__ = '28 April 2015'
+
+
+import sys
+import os
+import stat
+import re
+from binascii import b2a_uu 
+
+try:
+    import numpy as np 
+except ImportError:
+    sys.stderr.write("Error! numpy python library not detected!!\n")
+    sys.exit(1)
+import tempfile as tf
+import argparse as ap
+import subprocess as subp
+import multiprocessing as mp
+from collections import defaultdict as defdict
+import bz2 
+import itertools
+from distutils.version import LooseVersion
+try:
+    import cPickle as pickle
+except:
+    import pickle
+
+import cStringIO
+
+#*************************************************************
+#*  Imports related to biom file generation                  *
+#*************************************************************
+try:
+    import biom
+    import biom.table
+    import numpy as np
+except ImportError:
+    sys.stderr.write("Warning! Biom python library not detected!"
+                     "\n Exporting to biom format will not work!\n")
+try:
+    import json
+except ImportError:
+    sys.stderr.write("Warning! json python library not detected!"
+                     "\n Exporting to biom format will not work!\n")
+
+# This set contains the markers that after careful validation are found to have low precision or recall
+# We esclude the markers here to avoid generating a new marker DB when changing just few markers
+markers_to_exclude = \
+    set([
+        'NC_001782.1','GeneID:17099689','gi|419819595|ref|NZ_AJRE01000517.1|:1-118',
+        'GeneID:10498696', 'GeneID:10498710', 'GeneID:10498726', 'GeneID:10498735',
+        'GeneID:10498757', 'GeneID:10498760', 'GeneID:10498761', 'GeneID:10498763',
+        'GeneID:11294465', 'GeneID:14181982', 'GeneID:14182132', 'GeneID:14182146',
+        'GeneID:14182148', 'GeneID:14182328', 'GeneID:14182639', 'GeneID:14182647',
+        'GeneID:14182650', 'GeneID:14182663', 'GeneID:14182683', 'GeneID:14182684',
+        'GeneID:14182691', 'GeneID:14182803', 'GeneID:14296322', 'GeneID:1489077',
+        'GeneID:1489080', 'GeneID:1489081', 'GeneID:1489084', 'GeneID:1489085',
+        'GeneID:1489088', 'GeneID:1489089', 'GeneID:1489090', 'GeneID:1489528',
+        'GeneID:1489530', 'GeneID:1489531', 'GeneID:1489735', 'GeneID:1491873',
+        'GeneID:1491889', 'GeneID:1491962', 'GeneID:1491963', 'GeneID:1491964',
+        'GeneID:1491965', 'GeneID:17099689', 'GeneID:1724732', 'GeneID:17494231',
+        'GeneID:2546403', 'GeneID:2703374', 'GeneID:2703375', 'GeneID:2703498',
+        'GeneID:2703531', 'GeneID:2772983', 'GeneID:2772989', 'GeneID:2772991',
+        'GeneID:2772993', 'GeneID:2772995', 'GeneID:2773037', 'GeneID:2777387',
+        'GeneID:2777399', 'GeneID:2777400', 'GeneID:2777439', 'GeneID:2777493',
+        'GeneID:2777494', 'GeneID:3077424', 'GeneID:3160801', 'GeneID:3197323',
+        'GeneID:3197355', 'GeneID:3197400', 'GeneID:3197428', 'GeneID:3783722',
+        'GeneID:3783750', 'GeneID:3953004', 'GeneID:3959334', 'GeneID:3964368',
+        'GeneID:3964370', 'GeneID:4961452', 'GeneID:5075645', 'GeneID:5075646',
+        'GeneID:5075647', 'GeneID:5075648', 'GeneID:5075649', 'GeneID:5075650',
+        'GeneID:5075651', 'GeneID:5075652', 'GeneID:5075653', 'GeneID:5075654',
+        'GeneID:5075655', 'GeneID:5075656', 'GeneID:5075657', 'GeneID:5075658',
+        'GeneID:5075659', 'GeneID:5075660', 'GeneID:5075661', 'GeneID:5075662',
+        'GeneID:5075663', 'GeneID:5075664', 'GeneID:5075665', 'GeneID:5075667',
+        'GeneID:5075668', 'GeneID:5075669', 'GeneID:5075670', 'GeneID:5075671',
+        'GeneID:5075672', 'GeneID:5075673', 'GeneID:5075674', 'GeneID:5075675',
+        'GeneID:5075676', 'GeneID:5075677', 'GeneID:5075678', 'GeneID:5075679',
+        'GeneID:5075680', 'GeneID:5075681', 'GeneID:5075682', 'GeneID:5075683',
+        'GeneID:5075684', 'GeneID:5075685', 'GeneID:5075686', 'GeneID:5075687',
+        'GeneID:5075688', 'GeneID:5075689', 'GeneID:5075690', 'GeneID:5075691',
+        'GeneID:5075692', 'GeneID:5075693', 'GeneID:5075694', 'GeneID:5075695',
+        'GeneID:5075696', 'GeneID:5075697', 'GeneID:5075698', 'GeneID:5075700',
+        'GeneID:5075701', 'GeneID:5075702', 'GeneID:5075703', 'GeneID:5075704',
+        'GeneID:5075705', 'GeneID:5075707', 'GeneID:5075708', 'GeneID:5075709',
+        'GeneID:5075710', 'GeneID:5075711', 'GeneID:5075712', 'GeneID:5075713',
+        'GeneID:5075714', 'GeneID:5075715', 'GeneID:5075716', 'GeneID:5176189',
+        'GeneID:6803896', 'GeneID:6803915', 'GeneID:7944151', 'GeneID:927334',
+        'GeneID:927335', 'GeneID:927337', 'GeneID:940263', 'GeneID:9538324',
+        'NC_003977.1', 'gi|103485498|ref|NC_008048.1|:1941166-1942314',
+        'gi|108802856|ref|NC_008148.1|:1230231-1230875',
+        'gi|124806686|ref|XM_001350760.1|',
+        'gi|126661648|ref|NZ_AAXW01000149.1|:c1513-1341',
+        'gi|149172845|ref|NZ_ABBW01000029.1|:970-1270',
+        'gi|153883242|ref|NZ_ABDQ01000074.1|:79-541',
+        'gi|167031021|ref|NC_010322.1|:1834668-1835168',
+        'gi|171344510|ref|NZ_ABJO01001391.1|:1-116',
+        'gi|171346813|ref|NZ_ABJO01001728.1|:c109-1',
+        'gi|190640924|ref|NZ_ABRC01000948.1|:c226-44',
+        'gi|223045343|ref|NZ_ACEN01000042.1|:1-336',
+        'gi|224580998|ref|NZ_GG657387.1|:c114607-114002',
+        'gi|224993759|ref|NZ_ACFY01000068.1|:c357-1',
+        'gi|237784637|ref|NC_012704.1|:141000-142970',
+        'gi|237784637|ref|NC_012704.1|:c2048315-2047083',
+        'gi|240136783|ref|NC_012808.1|:1928224-1928961',
+        'gi|255319020|ref|NZ_ACVR01000025.1|:28698-29132',
+        'gi|260590341|ref|NZ_ACEO02000062.1|:c387-151',
+        'gi|262368201|ref|NZ_GG704964.1|:733100-733978',
+        'gi|262369811|ref|NZ_GG704966.1|:c264858-264520',
+        'gi|288559258|ref|NC_013790.1|:448046-451354',
+        'gi|288559258|ref|NC_013790.1|:532047-533942',
+        'gi|294794157|ref|NZ_GG770200.1|:245344-245619',
+        'gi|304372805|ref|NC_014448.1|:444677-445120',
+        'gi|304372805|ref|NC_014448.1|:707516-708268',
+        'gi|304372805|ref|NC_014448.1|:790263-792257',
+        'gi|304372805|ref|NC_014448.1|:c367313-364470',
+        'gi|304372805|ref|NC_014448.1|:c659144-658272',
+        'gi|304372805|ref|NC_014448.1|:c772578-770410',
+        'gi|304372805|ref|NC_014448.1|:c777901-777470',
+        'gi|306477407|ref|NZ_GG770409.1|:c1643877-1643338',
+        'gi|317120849|ref|NC_014831.1|:c891121-890144',
+        'gi|323356441|ref|NZ_GL698442.1|:560-682',
+        'gi|324996766|ref|NZ_BABV01000451.1|:10656-11579',
+        'gi|326579405|ref|NZ_AEGQ01000006.1|:2997-3791',
+        'gi|326579407|ref|NZ_AEGQ01000008.1|:c45210-44497',
+        'gi|326579433|ref|NZ_AEGQ01000034.1|:346-3699',
+        'gi|329889017|ref|NZ_GL883086.1|:586124-586804',
+        'gi|330822653|ref|NC_015422.1|:2024431-2025018',
+        'gi|335053104|ref|NZ_AFIL01000010.1|:c33862-32210',
+        'gi|339304121|ref|NZ_AEOR01000258.1|:c294-1',
+        'gi|339304277|ref|NZ_AEOR01000414.1|:1-812',
+        'gi|342211239|ref|NZ_AFUK01000001.1|:790086-790835',
+        'gi|342211239|ref|NZ_AFUK01000001.1|:c1579497-1578787',
+        'gi|342213707|ref|NZ_AFUJ01000005.1|:48315-48908',
+        'gi|355707189|ref|NZ_JH376566.1|:326756-326986',
+        'gi|355707384|ref|NZ_JH376567.1|:90374-91453',
+        'gi|355707384|ref|NZ_JH376567.1|:c388018-387605',
+        'gi|355708440|ref|NZ_JH376569.1|:c80380-79448',
+        'gi|358051729|ref|NZ_AEUN01000100.1|:c120-1',
+        'gi|365983217|ref|XM_003668394.1|',
+        'gi|377571722|ref|NZ_BAFD01000110.1|:c1267-29',
+        'gi|377684864|ref|NZ_CM001194.1|:c1159954-1159619',
+        'gi|377684864|ref|NZ_CM001194.1|:c4966-4196',
+        'gi|378759497|ref|NZ_AFXE01000152.1|:1628-2215',
+        'gi|378835506|ref|NC_016829.1|:112560-113342',
+        'gi|378835506|ref|NC_016829.1|:114945-115193',
+        'gi|378835506|ref|NC_016829.1|:126414-127151',
+        'gi|378835506|ref|NC_016829.1|:272056-272403',
+        'gi|378835506|ref|NC_016829.1|:272493-272786',
+        'gi|378835506|ref|NC_016829.1|:358647-360863',
+        'gi|378835506|ref|NC_016829.1|:37637-38185',
+        'gi|378835506|ref|NC_016829.1|:60012-60497',
+        'gi|378835506|ref|NC_016829.1|:606819-607427',
+        'gi|378835506|ref|NC_016829.1|:607458-607760',
+        'gi|378835506|ref|NC_016829.1|:826192-826821',
+        'gi|378835506|ref|NC_016829.1|:c451932-451336',
+        'gi|378835506|ref|NC_016829.1|:c460520-459951',
+        'gi|378835506|ref|NC_016829.1|:c483843-482842',
+        'gi|378835506|ref|NC_016829.1|:c544660-543638',
+        'gi|378835506|ref|NC_016829.1|:c556383-555496',
+        'gi|378835506|ref|NC_016829.1|:c632166-631228',
+        'gi|378835506|ref|NC_016829.1|:c805066-802691',
+        'gi|384124469|ref|NC_017160.1|:c2157447-2156863',
+        'gi|385263288|ref|NZ_AJST01000001.1|:594143-594940',
+        'gi|385858114|ref|NC_017519.1|:10252-10746',
+        'gi|385858114|ref|NC_017519.1|:104630-104902',
+        'gi|385858114|ref|NC_017519.1|:154292-156016',
+        'gi|385858114|ref|NC_017519.1|:205158-206462',
+        'gi|385858114|ref|NC_017519.1|:507239-507703',
+        'gi|385858114|ref|NC_017519.1|:518924-519772',
+        'gi|385858114|ref|NC_017519.1|:524712-525545',
+        'gi|385858114|ref|NC_017519.1|:528387-528785',
+        'gi|385858114|ref|NC_017519.1|:532275-533429',
+        'gi|385858114|ref|NC_017519.1|:586402-586824',
+        'gi|385858114|ref|NC_017519.1|:621696-622226',
+        'gi|385858114|ref|NC_017519.1|:673673-676105',
+        'gi|385858114|ref|NC_017519.1|:706602-708218',
+        'gi|385858114|ref|NC_017519.1|:710627-711997',
+        'gi|385858114|ref|NC_017519.1|:744974-745456',
+        'gi|385858114|ref|NC_017519.1|:791055-791801',
+        'gi|385858114|ref|NC_017519.1|:805643-807430',
+        'gi|385858114|ref|NC_017519.1|:c172050-170809',
+        'gi|385858114|ref|NC_017519.1|:c334545-333268',
+        'gi|385858114|ref|NC_017519.1|:c383474-383202',
+        'gi|385858114|ref|NC_017519.1|:c450880-450389',
+        'gi|385858114|ref|NC_017519.1|:c451975-451001',
+        'gi|385858114|ref|NC_017519.1|:c470488-470036',
+        'gi|385858114|ref|NC_017519.1|:c485596-484598',
+        'gi|385858114|ref|NC_017519.1|:c58658-58065',
+        'gi|385858114|ref|NC_017519.1|:c592754-591081',
+        'gi|385858114|ref|NC_017519.1|:c59590-58820',
+        'gi|385858114|ref|NC_017519.1|:c601339-600575',
+        'gi|385858114|ref|NC_017519.1|:c76080-75160',
+        'gi|385858114|ref|NC_017519.1|:c97777-96302',
+        'gi|391227518|ref|NZ_CM001514.1|:c1442504-1440237',
+        'gi|391227518|ref|NZ_CM001514.1|:c3053472-3053023',
+        'gi|394749766|ref|NZ_AHHC01000069.1|:3978-6176',
+        'gi|398899615|ref|NZ_AKJK01000021.1|:28532-29209',
+        'gi|406580057|ref|NZ_AJRD01000017.1|:c17130-15766',
+        'gi|406584668|ref|NZ_AJQZ01000017.1|:c1397-771',
+        'gi|408543458|ref|NZ_AJLO01000024.1|:67702-68304',
+        'gi|410936685|ref|NZ_AJRF02000012.1|:21785-22696',
+        'gi|41406098|ref|NC_002944.2|:c4468304-4467864',
+        'gi|416998679|ref|NZ_AEXI01000003.1|:c562937-562176',
+        'gi|417017738|ref|NZ_AEYL01000489.1|:c111-1',
+        'gi|417018375|ref|NZ_AEYL01000508.1|:100-238',
+        'gi|418576506|ref|NZ_AHKB01000025.1|:c7989-7669',
+        'gi|419819595|ref|NZ_AJRE01000517.1|:1-118',
+        'gi|421806549|ref|NZ_AMTB01000006.1|:c181247-180489',
+        'gi|422320815|ref|NZ_GL636045.1|:28704-29048',
+        'gi|422320874|ref|NZ_GL636046.1|:4984-5742',
+        'gi|422323244|ref|NZ_GL636061.1|:479975-480520',
+        'gi|422443048|ref|NZ_GL383112.1|:663738-664823',
+        'gi|422552858|ref|NZ_GL383469.1|:c216727-215501',
+        'gi|422859491|ref|NZ_GL878548.1|:c271832-271695',
+        'gi|423012810|ref|NZ_GL982453.1|:3888672-3888935',
+        'gi|423012810|ref|NZ_GL982453.1|:4541873-4542328',
+        'gi|423012810|ref|NZ_GL982453.1|:c2189976-2188582',
+        'gi|423012810|ref|NZ_GL982453.1|:c5471232-5470300',
+        'gi|423262555|ref|NC_019552.1|:24703-25212',
+        'gi|423262555|ref|NC_019552.1|:28306-30696',
+        'gi|423262555|ref|NC_019552.1|:284252-284581',
+        'gi|423262555|ref|NC_019552.1|:311161-311373',
+        'gi|423262555|ref|NC_019552.1|:32707-34497',
+        'gi|423262555|ref|NC_019552.1|:34497-35237',
+        'gi|423262555|ref|NC_019552.1|:53691-56813',
+        'gi|423262555|ref|NC_019552.1|:c388986-386611',
+        'gi|423262555|ref|NC_019552.1|:c523106-522528',
+        'gi|423689090|ref|NZ_CM001513.1|:c1700632-1699448',
+        'gi|423689090|ref|NZ_CM001513.1|:c1701670-1700651',
+        'gi|423689090|ref|NZ_CM001513.1|:c5739118-5738390',
+        'gi|427395956|ref|NZ_JH992914.1|:c592682-591900',
+        'gi|427407324|ref|NZ_JH992904.1|:c2681223-2679463',
+        'gi|451952303|ref|NZ_AJRB03000021.1|:1041-1574',
+        'gi|452231579|ref|NZ_AEKA01000123.1|:c18076-16676',
+        'gi|459791914|ref|NZ_CM001824.1|:c899379-899239',
+        'gi|471265562|ref|NC_020815.1|:3155799-3156695',
+        'gi|472279780|ref|NZ_ALPV02000001.1|:33911-36751',
+        'gi|482733945|ref|NZ_AHGZ01000071.1|:10408-11154',
+        'gi|483051300|ref|NZ_ALYK02000034.1|:c37582-36650',
+        'gi|483051300|ref|NZ_ALYK02000034.1|:c38037-37582',
+        'gi|483993347|ref|NZ_AMXG01000045.1|:251724-253082',
+        'gi|484100856|ref|NZ_JH670250.1|:600643-602949',
+        'gi|484115941|ref|NZ_AJXG01000093.1|:567-947',
+        'gi|484228609|ref|NZ_JH730929.1|:c103784-99021',
+        'gi|484228797|ref|NZ_JH730960.1|:c16193-12429',
+        'gi|484228814|ref|NZ_JH730962.1|:c29706-29260',
+        'gi|484228929|ref|NZ_JH730981.1|:18645-22060',
+        'gi|484228939|ref|NZ_JH730983.1|:42943-43860',
+        'gi|484266598|ref|NZ_AKGC01000024.1|:118869-119636',
+        'gi|484327375|ref|NZ_AKVP01000093.1|:1-1281',
+        'gi|484328234|ref|NZ_AKVP01000127.1|:c325-110',
+        'gi|487376144|ref|NZ_KB911257.1|:600445-601482',
+        'gi|487376194|ref|NZ_KB911260.1|:146228-146533',
+        'gi|487381776|ref|NZ_KB911485.1|:101242-103083',
+        'gi|487381776|ref|NZ_KB911485.1|:c32472-31627',
+        'gi|487381800|ref|NZ_KB911486.1|:39414-39872',
+        'gi|487381828|ref|NZ_KB911487.1|:15689-17026',
+        'gi|487381846|ref|NZ_KB911488.1|:13678-13821',
+        'gi|487382089|ref|NZ_KB911497.1|:23810-26641',
+        'gi|487382176|ref|NZ_KB911501.1|:c497-381',
+        'gi|487382213|ref|NZ_KB911502.1|:12706-13119',
+        'gi|487382247|ref|NZ_KB911505.1|:c7595-6663',
+        'gi|490551798|ref|NZ_AORG01000011.1|:40110-41390',
+        'gi|491099398|ref|NZ_KB849654.1|:c720460-719912',
+        'gi|491124812|ref|NZ_KB849705.1|:1946500-1946937',
+        'gi|491155563|ref|NZ_KB849732.1|:46469-46843',
+        'gi|491155563|ref|NZ_KB849732.1|:46840-47181',
+        'gi|491155563|ref|NZ_KB849732.1|:47165-48616',
+        'gi|491155563|ref|NZ_KB849732.1|:55055-56662',
+        'gi|491155563|ref|NZ_KB849732.1|:56662-57351',
+        'gi|491155563|ref|NZ_KB849732.1|:6101-7588',
+        'gi|491155563|ref|NZ_KB849732.1|:7657-8073',
+        'gi|491349766|ref|NZ_KB850082.1|:441-941',
+        'gi|491395079|ref|NZ_KB850142.1|:1461751-1462554',
+        'gi|512608407|ref|NZ_KE150401.1|:c156891-156016',
+        'gi|518653462|ref|NZ_ATLM01000004.1|:c89669-89247',
+        'gi|520818261|ref|NZ_ATLQ01000015.1|:480744-481463',
+        'gi|520822538|ref|NZ_ATLQ01000063.1|:103173-103283',
+        'gi|520826510|ref|NZ_ATLQ01000092.1|:c13892-13563',
+        'gi|544644736|ref|NZ_KE747865.1|:68388-69722',
+        'gi|545347918|ref|NZ_KE952096.1|:c83873-81831',
+        'gi|550735774|gb|AXMM01000002.1|:c743886-743575',
+        'gi|552875787|ref|NZ_KI515684.1|:c584270-583890',
+        'gi|552876418|ref|NZ_KI515685.1|:36713-37258',
+        'gi|552876418|ref|NZ_KI515685.1|:432422-433465',
+        'gi|552876418|ref|NZ_KI515685.1|:c1014617-1014117',
+        'gi|552876418|ref|NZ_KI515685.1|:c931935-931327',
+        'gi|552876815|ref|NZ_KI515686.1|:613740-614315',
+        'gi|552879811|ref|NZ_AXME01000001.1|:1146402-1146932',
+        'gi|552879811|ref|NZ_AXME01000001.1|:40840-41742',
+        'gi|552879811|ref|NZ_AXME01000001.1|:49241-49654',
+        'gi|552891898|ref|NZ_AXMG01000001.1|:99114-99290',
+        'gi|552891898|ref|NZ_AXMG01000001.1|:c1460921-1460529',
+        'gi|552895565|ref|NZ_AXMI01000001.1|:619555-620031',
+        'gi|552895565|ref|NZ_AXMI01000001.1|:c14352-13837',
+        'gi|552896371|ref|NZ_AXMI01000002.1|:c148595-146280',
+        'gi|552897201|ref|NZ_AXMI01000004.1|:c231437-230883',
+        'gi|552902020|ref|NZ_AXMK01000001.1|:c1625038-1624022',
+        'gi|556346902|ref|NZ_KI535485.1|:c828278-827901',
+        'gi|556478613|ref|NZ_KI535633.1|:3529392-3530162',
+        'gi|560534311|ref|NZ_AYSF01000111.1|:26758-29049',
+        'gi|564165687|gb|AYLX01000355.1|:10906-11166',
+        'gi|564169776|gb|AYLX01000156.1|:1-185',
+        'gi|564938696|gb|AWYH01000018.1|:c75674-75039', 'gi|67993724|ref|XM_664440.1|',
+        'gi|68059117|ref|XM_666447.1|', 'gi|68062389|ref|XM_668109.1|',
+        'gi|71730848|gb|AAAM03000019.1|:c14289-12877', 'gi|82753723|ref|XM_722699.1|',
+        'gi|82775382|ref|NC_007606.1|:2249487-2250014', 'gi|82793634|ref|XM_723027.1|'
+        ])
+
+tax_units = "kpcofgst"
+
+if float(sys.version_info[0]) < 3.0:
+    def read_and_split( ofn  ):
+        return (l.strip().split('\t') for l in ofn)
+    def read_and_split_line( line ):
+        return line.strip().split('\t')
+else:
+    def read_and_split( ofn ):
+        return (str(l,encoding='utf-8').strip().split('\t') for l in ofn)
+    def read_and_split_line( line ):
+        return str(line,encoding='utf-8').strip().split('\t')
+
+
+def plain_read_and_split( ofn ):
+    return (l.strip().split('\t') for l in ofn)
+
+def plain_read_and_split_line( l ):
+    return l.strip().split('\t')
+
+
+
+if float(sys.version_info[0]) < 3.0:
+    def mybytes( val ):
+        return val
+else:
+    def mybytes( val ):
+        return bytes(val,encoding='utf-8')
+    
+# get the directory that contains this script
+metaphlan2_script_install_folder=os.path.dirname(os.path.abspath(__file__))
+
+def read_params(args):
+    p = ap.ArgumentParser( description= 
+            "DESCRIPTION\n"
+            " MetaPhlAn version "+__version__+" ("+__date__+"): \n"
+            " METAgenomic PHyLogenetic ANalysis for metagenomic taxonomic profiling.\n\n"
+            "AUTHORS: "+__author__+"\n\n"
+            "COMMON COMMANDS\n\n"
+            " We assume here that metaphlan2.py is in the system path and that mpa_dir bash variable contains the\n"
+            " main MetaPhlAn folder. Also BowTie2 should be in the system path with execution and read\n"
+            " permissions, and Perl should be installed)\n\n"
+           
+            "\n========== MetaPhlAn 2 clade-abundance estimation ================= \n\n"
+            "The basic usage of MetaPhlAn 2 consists in the identification of the clades (from phyla to species and \n"
+            "strains in particular cases) present in the metagenome obtained from a microbiome sample and their \n"
+            "relative abundance. This correspond to the default analysis type (--analysis_type rel_ab).\n\n"
+
+            "*  Profiling a metagenome from raw reads:\n"
+            "$ metaphlan2.py metagenome.fastq --input_type fastq\n\n"
+            
+            "*  You can take advantage of multiple CPUs and save the intermediate BowTie2 output for re-running\n"
+            "   MetaPhlAn extremely quickly:\n"
+            "$ metaphlan2.py metagenome.fastq --bowtie2out metagenome.bowtie2.bz2 --nproc 5 --input_type fastq\n\n"
+            
+            "*  If you already mapped your metagenome against the marker DB (using a previous MetaPhlAn run), you\n"
+            "   can obtain the results in few seconds by using the previously saved --bowtie2out file and \n"
+            "   specifying the input (--input_type bowtie2out):\n"
+            "$ metaphlan2.py metagenome.bowtie2.bz2 --nproc 5 --input_type bowtie2out\n\n"
+            
+            "*  You can also provide an externally BowTie2-mapped SAM if you specify this format with \n"
+            "   --input_type. Two steps: first apply BowTie2 and then feed MetaPhlAn2 with the obtained sam:\n"
+            "$ bowtie2 --sam-no-hd --sam-no-sq --no-unal --very-sensitive -S metagenome.sam -x ${mpa_dir}/db_v20/mpa_v20_m200 -U metagenome.fastq\n"
+            "$ metaphlan2.py metagenome.sam --input_type sam > profiled_metagenome.txt\n\n"
+            
+            "*  Multiple alternative ways to pass the input are also available:\n"
+            "$ cat metagenome.fastq | metaphlan2.py --input_type fastq \n"
+            "$ tar xjf metagenome.tar.bz2 --to-stdout | metaphlan2.py --input_type fastq \n"
+            "$ metaphlan2.py --input_type fastq < metagenome.fastq\n"
+            "$ metaphlan2.py --input_type fastq <(bzcat metagenome.fastq.bz2)\n"
+            "$ metaphlan2.py --input_type fastq <(zcat metagenome_1.fastq.gz metagenome_2.fastq.gz)\n\n"
+
+            "*  We can also natively handle paired-end metagenomes, and, more generally, metagenomes stored in \n"
+            "  multiple files (but you need to specify the --bowtie2out parameter):\n"
+            "$ metaphlan2.py metagenome_1.fastq,metagenome_2.fastq --bowtie2out metagenome.bowtie2.bz2 --nproc 5 --input_type fastq\n\n"
+            "\n------------------------------------------------------------------- \n \n\n"
+        
+            
+            "\n========== MetaPhlAn 2 strain tracking ============================ \n\n"
+            "MetaPhlAn 2 introduces the capability of charachterizing organisms at the strain level using non\n"
+            "aggregated marker information. Such capability comes with several slightly different flavours and \n"
+            "are a way to perform strain tracking and comparison across multiple samples.\n"
+            "Usually, MetaPhlAn 2 is first ran with the default --analysis_type to profile the species present in\n"
+            "the community, and then a strain-level profiling can be performed to zoom-in into specific species\n"
+            "of interest. This operation can be performed quickly as it exploits the --bowtie2out intermediate \n"
+            "file saved during the execution of the default analysis type.\n\n"
+           
+            "*  The following command will output the abundance of each marker with a RPK (reads per kil-base) \n"
+            "   higher 0.0. (we are assuming that metagenome_outfmt.bz2 has been generated before as \n"
+            "   shown above).\n"
+            "$ metaphlan2.py -t marker_ab_table metagenome_outfmt.bz2 --input_type bowtie2out > marker_abundance_table.txt\n"
+            "   The obtained RPK can be optionally normalized by the total number of reads in the metagenome \n"
+            "   to guarantee fair comparisons of abundances across samples. The number of reads in the metagenome\n"
+            "   needs to be passed with the '--nreads' argument\n\n"
+
+            "*  The list of markers present in the sample can be obtained with '-t marker_pres_table'\n"
+            "$ metaphlan2.py -t marker_pres_table metagenome_outfmt.bz2 --input_type bowtie2out > marker_abundance_table.txt\n"
+            "   The --pres_th argument (default 1.0) set the minimum RPK value to consider a marker present\n\n"
+            
+            "*  The list '-t clade_profiles' analysis type reports the same information of '-t marker_ab_table'\n"
+            "   but the markers are reported on a clade-by-clade basis.\n"
+            "$ metaphlan2.py -t clade_profiles metagenome_outfmt.bz2 --input_type bowtie2out > marker_abundance_table.txt\n\n"
+            
+            "*  Finally, to obtain all markers present for a specific clade and all its subclades, the \n"
+            "   '-t clade_specific_strain_tracker' should be used. For example, the following command\n"
+            "   is reporting the presence/absence of the markers for the B. fragulis species and its strains\n"
+            "   the optional argument --min_ab specifies the minimum clade abundance for reporting the markers\n\n"
+            "$ metaphlan2.py -t clade_specific_strain_tracker --clade s__Bacteroides_fragilis metagenome_outfmt.bz2 --input_type bowtie2out > marker_abundance_table.txt\n"
+            
+            "\n------------------------------------------------------------------- \n\n"
+            "",
+            formatter_class=ap.RawTextHelpFormatter,
+            add_help=False )
+    arg = p.add_argument
+
+    arg( 'inp', metavar='INPUT_FILE', type=str, nargs='?', default=None, help= 
+         "the input file can be:\n"
+         "* a fastq file containing metagenomic reads\n"
+         "OR\n"
+         "* a BowTie2 produced SAM file. \n"
+         "OR\n"
+         "* an intermediary mapping file of the metagenome generated by a previous MetaPhlAn run \n"
+         "If the input file is missing, the script assumes that the input is provided using the standard \n"
+         "input, or named pipes.\n"
+         "IMPORTANT: the type of input needs to be specified with --input_type" )   
+    
+    arg( 'output', metavar='OUTPUT_FILE', type=str, nargs='?', default=None,
+         help= "the tab-separated output file of the predicted taxon relative abundances \n"
+               "[stdout if not present]")
+
+
+    g = p.add_argument_group('Required arguments')
+    arg = g.add_argument
+    input_type_choices = ['fastq','fasta','multifasta','multifastq','bowtie2out','sam'] # !!!!
+    arg( '--input_type', choices=input_type_choices, required = 'True', help =  
+         "set wheter the input is the multifasta file of metagenomic reads or \n"
+         "the SAM file of the mapping of the reads against the MetaPhlAn db.\n"
+         "[default 'automatic', i.e. the script will try to guess the input format]\n" )
+   
+    g = p.add_argument_group('Mapping arguments')
+    arg = g.add_argument
+    arg( '--mpa_pkl', type=str,
+         default=os.path.join(metaphlan2_script_install_folder,"db_v20","mpa_v20_m200.pkl"), 
+         help = "the metadata pickled MetaPhlAn file")
+    arg( '--bowtie2db', metavar="METAPHLAN_BOWTIE2_DB", type=str,
+         default = os.path.join(metaphlan2_script_install_folder,"db_v20","mpa_v20_m200"),
+         help = "The BowTie2 database file of the MetaPhlAn database. \n"
+                "Used if --input_type is fastq, fasta, multifasta, or multifastq")
+    bt2ps = ['sensitive','very-sensitive','sensitive-local','very-sensitive-local']
+    arg( '--bt2_ps', metavar="BowTie2 presets", default='very-sensitive', choices=bt2ps,
+         help = "presets options for BowTie2 (applied only when a multifasta file is provided)\n"
+                "The choices enabled in MetaPhlAn are:\n"
+                " * sensitive\n"
+                " * very-sensitive\n"
+                " * sensitive-local\n"
+                " * very-sensitive-local\n"
+                "[default very-sensitive]\n"   )
+    arg( '--bowtie2_exe', type=str, default = None, help =
+         'Full path and name of the BowTie2 executable. This option allows \n'
+         'MetaPhlAn to reach the executable even when it is not in the system \n'
+         'PATH or the system PATH is unreachable\n' )
+    arg( '--bowtie2out', metavar="FILE_NAME", type=str, default = None, help = 
+         "The file for saving the output of BowTie2\n" )
+    arg( '--no_map', action='store_true', help=
+         "Avoid storing the --bowtie2out map file\n" )
+    arg( '--tmp_dir', metavar="", default=None, type=str, help = 
+         "the folder used to store temporary files \n"
+         "[default is the OS dependent tmp dir]\n"   )
+    
+    
+    g = p.add_argument_group('Post-mapping arguments')
+    arg = g.add_argument
+    stat_choices = ['avg_g','avg_l','tavg_g','tavg_l','wavg_g','wavg_l','med']
+    arg( '--tax_lev', metavar='TAXONOMIC_LEVEL', type=str, 
+         choices='a'+tax_units, default='a', help = 
+         "The taxonomic level for the relative abundance output:\n"
+         "'a' : all taxonomic levels\n"
+         "'k' : kingdoms\n"
+         "'p' : phyla only\n"
+         "'c' : classes only\n"
+         "'o' : orders only\n"
+         "'f' : families only\n"
+         "'g' : genera only\n"
+         "'s' : species only\n"
+         "[default 'a']" )
+    arg( '--min_cu_len', metavar="", default="2000", type=int, help =
+         "minimum total nucleotide length for the markers in a clade for\n"
+         "estimating the abundance without considering sub-clade abundances\n"
+         "[default 2000]\n"   )
+    arg( '--min_alignment_len', metavar="", default=None, type=int, help =
+         "The sam records for aligned reads with the longest subalignment\n"
+         "length smaller than this threshold will be discarded.\n"
+         "[default None]\n"   )
+    arg( '--ignore_viruses', action='store_true', help=
+         "Do not profile viral organisms" )
+    arg( '--ignore_eukaryotes', action='store_true', help=
+         "Do not profile eukaryotic organisms" )
+    arg( '--ignore_bacteria', action='store_true', help=
+         "Do not profile bacterial organisms" )
+    arg( '--ignore_archaea', action='store_true', help=
+         "Do not profile archeal organisms" )
+    arg( '--stat_q', metavar="", type = float, default=0.1, help = 
+         "Quantile value for the robust average\n"
+         "[default 0.1]"   )
+    arg( '--ignore_markers', type=str, default = None, help = 
+         "File containing a list of markers to ignore. \n")
+    arg( '--avoid_disqm', action="store_true", help = 
+         "Deactivate the procedure of disambiguating the quasi-markers based on the \n"
+         "marker abundance pattern found in the sample. It is generally recommended \n"
+         "too keep the disambiguation procedure in order to minimize false positives\n")
+    arg( '--stat', metavar="", choices=stat_choices, default="tavg_g", type=str, help = 
+         "EXPERIMENTAL! Statistical approach for converting marker abundances into clade abundances\n"
+         "'avg_g'  : clade global (i.e. normalizing all markers together) average\n"
+         "'avg_l'  : average of length-normalized marker counts\n"
+         "'tavg_g' : truncated clade global average at --stat_q quantile\n"
+         "'tavg_l' : trunated average of length-normalized marker counts (at --stat_q)\n"
+         "'wavg_g' : winsorized clade global average (at --stat_q)\n"
+         "'wavg_l' : winsorized average of length-normalized marker counts (at --stat_q)\n"
+         "'med'    : median of length-normalized marker counts\n"
+         "[default tavg_g]"   ) 
+    
+    arg = p.add_argument
+
+
+    
+    g = p.add_argument_group('Additional analysis types and arguments')
+    arg = g.add_argument
+    analysis_types = ['rel_ab', 'rel_ab_w_read_stats', 'reads_map', 'clade_profiles', 'marker_ab_table', 'marker_counts', 'marker_pres_table', 'clade_specific_strain_tracker']
+    arg( '-t', metavar='ANALYSIS TYPE', type=str, choices = analysis_types, 
+         default='rel_ab', help = 
+         "Type of analysis to perform: \n"
+         " * rel_ab: profiling a metagenomes in terms of relative abundances\n"
+         " * rel_ab_w_read_stats: profiling a metagenomes in terms of relative abundances and estimate the number of reads comming from each clade.\n"
+         " * reads_map: mapping from reads to clades (only reads hitting a marker)\n"
+         " * clade_profiles: normalized marker counts for clades with at least a non-null marker\n"
+         " * marker_ab_table: normalized marker counts (only when > 0.0 and normalized by metagenome size if --nreads is specified)\n"
+         " * marker_counts: non-normalized marker counts [use with extreme caution]\n"
+         " * marker_pres_table: list of markers present in the sample (threshold at 1.0 if not differently specified with --pres_th\n"
+         "[default 'rel_ab']" )
+    arg( '--nreads', metavar="NUMBER_OF_READS", type=int, default = None, help =
+         "The total number of reads in the original metagenome. It is used only when \n"
+         "-t marker_table is specified for normalizing the length-normalized counts \n"
+         "with the metagenome size as well. No normalization applied if --nreads is not \n"
+         "specified" )
+    arg( '--pres_th', metavar="PRESENCE_THRESHOLD", type=int, default = 1.0, help =
+         'Threshold for calling a marker present by the -t marker_pres_table option' )
+    arg( '--clade', metavar="", default=None, type=str, help = 
+         "The clade for clade_specific_strain_tracker analysis\n"  )
+    arg( '--min_ab', metavar="", default=0.1, type=float, help = 
+         "The minimum percentage abundace for the clade in the clade_specific_strain_tracker analysis\n"  )
+    arg( "-h", "--help", action="help", help="show this help message and exit")
+
+    g = p.add_argument_group('Output arguments')
+    arg = g.add_argument
+    arg( '-o', '--output_file',  metavar="output file", type=str, default=None, help = 
+         "The output file (if not specified as positional argument)\n")
+    arg('--sample_id_key',  metavar="name", type=str, default="#SampleID", 
+        help =("Specify the sample ID key for this analysis."
+               " Defaults to '#SampleID'."))
+    arg('--sample_id',  metavar="value", type=str, 
+        default="Metaphlan2_Analysis",
+        help =("Specify the sample ID for this analysis."
+               " Defaults to 'Metaphlan2_Analysis'."))
+    arg( '-s', '--samout', metavar="sam_output_file",
+        type=str, default=None, help="The sam output file\n")
+    #*************************************************************
+    #* Parameters related to biom file generation                *
+    #*************************************************************         
+    arg( '--biom', '--biom_output_file',  metavar="biom_output", type=str, default=None, help = 
+         "If requesting biom file output: The name of the output file in biom format \n")
+
+    arg( '--mdelim', '--metadata_delimiter_char',  metavar="mdelim", type=str, default="|", help = 
+         "Delimiter for bug metadata: - defaults to pipe. e.g. the pipe in k__Bacteria|p__Proteobacteria \n")
+    #*************************************************************
+    #* End parameters related to biom file generation            *
+    #*************************************************************    
+    
+    g = p.add_argument_group('Other arguments')
+    arg = g.add_argument
+    arg( '--nproc', metavar="N", type=int, default=1, help = 
+         "The number of CPUs to use for parallelizing the mapping\n"
+         "[default 1, i.e. no parallelism]\n" ) 
+    arg( '-v','--version', action='version', version="MetaPhlAn version "+__version__+"\t("+__date__+")",
+         help="Prints the current MetaPhlAn version and exit\n" )
+    
+
+    return vars(p.parse_args()) 
+
+def run_bowtie2(  fna_in, outfmt6_out, bowtie2_db, preset, nproc, 
+                  file_format = "multifasta", exe = None, 
+                  samout = None,
+                  min_alignment_len = None,
+                  ):
+    try:
+        if not fna_in: # or stat.S_ISFIFO(os.stat(fna_in).st_mode):
+            fna_in = "-"
+        bowtie2_cmd = [ exe if exe else 'bowtie2', 
+                        "--quiet", "--no-unal", 
+                        "--"+preset,
+                        "-S","-",
+                        "-x", bowtie2_db,
+                         ] + ([] if int(nproc) < 2 else ["-p",str(nproc)])
+        bowtie2_cmd += ["-U", fna_in] # if not stat.S_ISFIFO(os.stat(fna_in).st_mode) else []
+        bowtie2_cmd += (["-f"] if file_format == "multifasta" else []) 
+        p = subp.Popen( bowtie2_cmd, stdout=subp.PIPE ) 
+        lmybytes, outf = (mybytes,bz2.BZ2File(outfmt6_out, "w")) if outfmt6_out.endswith(".bz2") else (str,open( outfmt6_out, "w" ))
+        
+        try:
+            if samout:
+                if samout[-4:] == '.bz2':
+                    sam_file = bz2.BZ2File(samout, 'w')
+                else:
+                    sam_file = open(samout, 'w')
+        except IOError:
+            sys.stderr.write( "IOError: Unable to open sam output file.\n" )
+            sys.exit(1)
+
+        for line in p.stdout:
+            if samout:
+                sam_file.write(line)
+            if line[0] != '@':
+                o = read_and_split_line(line)
+                if o[2][-1] != '*':
+                    if min_alignment_len == None\
+                        or max([int(x.strip('M')) for x in\
+                                re.findall(r'(\d*M)', o[5])]) >= min_alignment_len:
+                        outf.write( lmybytes("\t".join([o[0],o[2]]) +"\n") )
+        #if  float(sys.version_info[0]) >= 3: 
+        #    for o in read_and_split(p.stdout):
+        #        if o[2][-1] != '*':
+        #            outf.write( bytes("\t".join([o[0],o[2]]) +"\n",encoding='utf-8') )
+        #else:
+        #    for o in read_and_split(p.stdout):
+        #        if o[2][-1] != '*':
+        #            outf.write( "\t".join([o[0],o[2]]) +"\n" )
+        outf.close()
+        if samout:
+            sam_file.close()
+        p.wait()
+
+
+    except OSError:
+        sys.stderr.write( "OSError: fatal error running BowTie2. Is BowTie2 in the system path?\n" )
+        sys.exit(1)
+    except ValueError:
+        sys.stderr.write( "ValueError: fatal error running BowTie2.\n" )
+        sys.exit(1)
+    except IOError:
+        sys.stderr.write( "IOError: fatal error running BowTie2.\n" )
+        sys.exit(1)
+    if p.returncode == 13:
+        sys.stderr.write( "Permission Denied Error: fatal error running BowTie2." 
+          "Is the BowTie2 file in the path with execution and read permissions?\n" )
+        sys.exit(1)
+    elif p.returncode != 0:
+        sys.stderr.write("Error while running bowtie2.\n")
+        sys.exit(1)
+
+#def guess_input_format( inp_file ):
+#    if "," in inp_file:
+#        sys.stderr.write( "Sorry, I cannot guess the format of the input, when "
+#                          "more than one file is specified. Please set the --input_type parameter \n" )
+#        sys.exit(1) 
+#
+#    with open( inp_file ) as inpf:
+#        for i,l in enumerate(inpf):
+#            line = l.strip()
+#            if line[0] == '#': continue
+#            if line[0] == '>': return 'multifasta'
+#            if line[0] == '@': return 'multifastq'
+#            if len(l.split('\t')) == 2: return 'bowtie2out'
+#            if i > 20: break
+#    return None
+
+class TaxClade:
+    min_cu_len = -1
+    markers2lens = None
+    stat = None
+    quantile = None
+    avoid_disqm = False
+
+    def __init__( self, name, uncl = False, id_int = 0 ):
+        self.children, self.markers2nreads = {}, {}
+        self.name, self.father = name, None
+        self.uncl, self.subcl_uncl = uncl, False
+        self.abundance, self.uncl_abundance = None, 0 
+        self.id = id_int
+
+    def add_child( self, name, id_int ):
+        new_clade = TaxClade( name, id_int=id_int )
+        self.children[name] = new_clade
+        new_clade.father = self
+        return new_clade
+
+    
+    def get_terminals( self ):
+        terms = []
+        if not self.children:
+            return [self]
+        for c in self.children.values():
+            terms += c.get_terminals()
+        return terms
+
+
+    def get_full_name( self ):
+        fullname = [self.name]
+        cl = self.father
+        while cl:
+            fullname = [cl.name] + fullname
+            cl = cl.father
+        return "|".join(fullname[1:])
+
+    def get_normalized_counts( self ):
+        return [(m,float(n)*1000.0/self.markers2lens[m]) 
+                    for m,n in self.markers2nreads.items()]
+
+    def compute_abundance( self ):
+        if self.abundance is not None: return self.abundance
+        sum_ab = sum([c.compute_abundance() for c in self.children.values()]) 
+        rat_nreads = sorted([(self.markers2lens[m],n) 
+                                    for m,n in self.markers2nreads.items()],
+                                            key = lambda x: x[1])
+
+        rat_nreads, removed = [], []
+        for m,n in self.markers2nreads.items():
+            misidentified = False
+
+            if not self.avoid_disqm:
+                for e in self.markers2exts[m]:
+                    toclade = self.taxa2clades[e]
+                    m2nr = toclade.markers2nreads
+                    tocladetmp = toclade
+                    while len(tocladetmp.children) == 1:
+                        tocladetmp = list(tocladetmp.children.values())[0]
+                        m2nr = tocladetmp.markers2nreads
+    
+                    nonzeros = sum([v>0 for v in m2nr.values()])
+                    if len(m2nr):
+                        if float(nonzeros) / len(m2nr) > 0.33:
+                            misidentified = True
+                            removed.append( (self.markers2lens[m],n) )
+                            break
+            if not misidentified:
+                rat_nreads.append( (self.markers2lens[m],n) ) 
+       
+        if not self.avoid_disqm and len(removed):
+            n_rat_nreads = float(len(rat_nreads))
+            n_removed = float(len(removed))
+            n_tot = n_rat_nreads + n_removed
+            n_ripr = 10
+            
+            if len(self.get_terminals()) < 2:
+                n_ripr = 0
+
+            if "k__Viruses" in self.get_full_name():
+                n_ripr = 0
+
+            if n_rat_nreads < n_ripr and n_tot > n_rat_nreads:
+                rat_nreads += removed[:n_ripr-int(n_rat_nreads)]
+
+        
+        rat_nreads = sorted(rat_nreads, key = lambda x: x[1])
+
+        rat_v,nreads_v = zip(*rat_nreads) if rat_nreads else ([],[])
+        rat, nrawreads, loc_ab = float(sum(rat_v)) or -1.0, sum(nreads_v), 0.0
+        quant = int(self.quantile*len(rat_nreads))
+        ql,qr,qn = (quant,-quant,quant) if quant else (None,None,0)
+     
+        if self.name[0] == 't' and (len(self.father.children) > 1 or "_sp" in self.father.name or "k__Viruses" in self.get_full_name()):
+            non_zeros = float(len([n for r,n in rat_nreads if n > 0])) 
+            nreads = float(len(rat_nreads))
+            if nreads == 0.0 or non_zeros / nreads < 0.7:
+                self.abundance = 0.0
+                return 0.0
+
+        if rat < 0.0:
+            pass
+        elif self.stat == 'avg_g' or (not qn and self.stat in ['wavg_g','tavg_g']):
+            loc_ab = nrawreads / rat if rat >= 0 else 0.0
+        elif self.stat == 'avg_l' or (not qn and self.stat in ['wavg_l','tavg_l']):
+            loc_ab = np.mean([float(n)/r for r,n in rat_nreads]) 
+        elif self.stat == 'tavg_g':
+            wnreads = sorted([(float(n)/r,r,n) for r,n in rat_nreads], key=lambda x:x[0])
+            den,num = zip(*[v[1:] for v in wnreads[ql:qr]])
+            loc_ab = float(sum(num))/float(sum(den)) if any(den) else 0.0
+        elif self.stat == 'tavg_l':
+            loc_ab = np.mean(sorted([float(n)/r for r,n in rat_nreads])[ql:qr])
+        elif self.stat == 'wavg_g':
+            vmin, vmax = nreads_v[ql], nreads_v[qr]
+            wnreads = [vmin]*qn+list(nreads_v[ql:qr])+[vmax]*qn
+            loc_ab = float(sum(wnreads)) / rat  
+        elif self.stat == 'wavg_l':
+            wnreads = sorted([float(n)/r for r,n in rat_nreads])
+            vmin, vmax = wnreads[ql], wnreads[qr]
+            wnreads = [vmin]*qn+list(wnreads[ql:qr])+[vmax]*qn
+            loc_ab = np.mean(wnreads) 
+        elif self.stat == 'med':
+            loc_ab = np.median(sorted([float(n)/r for r,n in rat_nreads])[ql:qr]) 
+        
+        self.abundance = loc_ab
+        if rat < self.min_cu_len and self.children:
+            self.abundance = sum_ab
+        elif loc_ab < sum_ab:
+            self.abundance = sum_ab
+
+        if self.abundance > sum_ab and self.children: # *1.1??
+            self.uncl_abundance = self.abundance - sum_ab
+        self.subcl_uncl = not self.children and self.name[0] not in tax_units[-2:] 
+
+        return self.abundance
+
+    def get_all_abundances( self ):
+        ret = [(self.name,self.abundance)]
+        if self.uncl_abundance > 0.0:
+            lchild = list(self.children.values())[0].name[:3]
+            ret += [(lchild+self.name[3:]+"_unclassified",self.uncl_abundance)]
+        if self.subcl_uncl and self.name[0] != tax_units[-2]:
+            cind = tax_units.index( self.name[0] )
+            ret += [(   tax_units[cind+1]+self.name[1:]+"_unclassified",
+                        self.abundance)]
+        for c in self.children.values():
+            ret += c.get_all_abundances()
+        return ret
+
+
+class TaxTree:
+    def __init__( self, mpa, markers_to_ignore = None ): #, min_cu_len ):
+        self.root = TaxClade( "root" )
+        self.all_clades, self.markers2lens, self.markers2clades, self.taxa2clades, self.markers2exts = {}, {}, {}, {}, {}
+        TaxClade.markers2lens = self.markers2lens
+        TaxClade.markers2exts = self.markers2exts
+        TaxClade.taxa2clades = self.taxa2clades
+        self.id_gen = itertools.count(1)
+
+        clades_txt = ((l.strip().split("|"),n) for l,n in mpa_pkl['taxonomy'].items())        
+        for clade,lenc in clades_txt:
+            father = self.root
+            for clade_lev in clade: # !!!!! [:-1]:
+                if not clade_lev in father.children:
+                    father.add_child( clade_lev, id_int=self.id_gen.next() )
+                    self.all_clades[clade_lev] = father.children[clade_lev]
+                if clade_lev[0] == "t":
+                    self.taxa2clades[clade_lev[3:]] = father
+
+                father = father.children[clade_lev]
+                if clade_lev[0] == "t":
+                    father.glen = lenc
+
+        def add_lens( node ):
+            if not node.children:
+                return node.glen
+            lens = [] 
+            for c in node.children.values():
+                lens.append( add_lens( c ) )
+            node.glen = sum(lens) / len(lens)
+            return node.glen
+        add_lens( self.root )
+        
+        for k,p in mpa_pkl['markers'].items():
+            if k in markers_to_exclude:
+                continue
+            if k in markers_to_ignore:
+                continue
+            self.markers2lens[k] = p['len']
+            self.markers2clades[k] = p['clade']
+            self.add_reads( k, 0  )
+            self.markers2exts[k] = p['ext']
+
+    def set_min_cu_len( self, min_cu_len ):
+        TaxClade.min_cu_len = min_cu_len
+
+    def set_stat( self, stat, quantile, avoid_disqm = False ):
+        TaxClade.stat = stat
+        TaxClade.quantile = quantile
+        TaxClade.avoid_disqm = avoid_disqm
+
+    def add_reads(  self, marker, n, 
+                    ignore_viruses = False, ignore_eukaryotes = False, 
+                    ignore_bacteria = False, ignore_archaea = False  ):
+        clade = self.markers2clades[marker]
+        cl = self.all_clades[clade]
+        if ignore_viruses or ignore_eukaryotes or ignore_bacteria or ignore_archaea:
+            cn = cl.get_full_name()
+            if ignore_viruses and cn.startswith("k__Viruses"):
+                return ""
+            if ignore_eukaryotes and cn.startswith("k__Eukaryota"):
+                return ""
+            if ignore_archaea and cn.startswith("k__Archaea"):
+                return ""
+            if ignore_bacteria and cn.startswith("k__Bacteria"):
+                return ""
+        while len(cl.children) == 1:
+            cl = list(cl.children.values())[0]
+        cl.markers2nreads[marker] = n
+        return cl.get_full_name()
+   
+
+    def markers2counts( self ):
+        m2c = {}
+        for k,v in self.all_clades.items():
+            for m,c in v.markers2nreads.items():
+                m2c[m] = c
+        return m2c
+
+    def clade_profiles( self, tax_lev, get_all = False  ):
+        cl2pr = {}
+        for k,v in self.all_clades.items():
+            if tax_lev and not k.startswith(tax_lev): 
+                continue
+            prof = v.get_normalized_counts()
+            if not get_all and ( len(prof) < 1 or not sum([p[1] for p in prof]) > 0.0 ):
+                continue
+            cl2pr[v.get_full_name()] = prof
+        return cl2pr
+            
+    def relative_abundances( self, tax_lev  ):
+        cl2ab_n = dict([(k,v) for k,v in self.all_clades.items() 
+                    if k.startswith("k__") and not v.uncl])
+     
+        cl2ab, cl2glen, tot_ab = {}, {}, 0.0 
+        for k,v in cl2ab_n.items():
+            tot_ab += v.compute_abundance()
+
+        for k,v in cl2ab_n.items():
+            for cl,ab in v.get_all_abundances():
+                if not tax_lev:
+                    if cl not in self.all_clades:
+                        to = tax_units.index(cl[0])
+                        t = tax_units[to-1]
+                        cl = t + cl.split("_unclassified")[0][1:]
+                        cl = self.all_clades[cl].get_full_name()
+                        spl = cl.split("|")
+                        cl = "|".join(spl+[tax_units[to]+spl[-1][1:]+"_unclassified"])
+                        glen = self.all_clades[spl[-1]].glen
+                    else:
+                        glen = self.all_clades[cl].glen
+                        cl = self.all_clades[cl].get_full_name() 
+                elif not cl.startswith(tax_lev):
+                    if cl in self.all_clades:
+                        glen = self.all_clades[cl].glen
+                    else:
+                        glen = 1.0
+                    continue
+                cl2ab[cl] = ab
+                cl2glen[cl] = glen 
+
+        ret_d = dict([( k, float(v) / tot_ab if tot_ab else 0.0) for k,v in cl2ab.items()])
+        ret_r = dict([( k, (v,cl2glen[k],float(v)*cl2glen[k])) for k,v in cl2ab.items()])
+        #ret_r = dict([( k, float(v) / tot_ab if tot_ab else 0.0) for k,v in cl2ab.items()])
+        if tax_lev:
+            ret_d[tax_lev+"unclassified"] = 1.0 - sum(ret_d.values())
+        return ret_d, ret_r
+
+def map2bbh( mapping_f, input_type = 'bowtie2out', min_alignment_len = None):
+    if not mapping_f:
+        ras, ras_line, inpf = plain_read_and_split, plain_read_and_split_line, sys.stdin
+    else:
+        if mapping_f.endswith(".bz2"):
+            ras, ras_line, inpf = read_and_split, read_and_split_line, bz2.BZ2File( mapping_f, "r" )
+        else:
+            ras, ras_line, inpf = plain_read_and_split,\
+                                  plain_read_and_split_line,\
+                                  open( mapping_f )
+
+    reads2markers, reads2maxb = {}, {}
+    if input_type == 'bowtie2out':
+        for r,c in ras(inpf):
+            reads2markers[r] = c
+    elif input_type == 'sam':
+        for line in inpf:
+            o = ras_line(line)
+            if o[0][0] != '@' and o[2][-1] != '*':
+                if min_alignment_len == None\
+                    or max([int(x.strip('M')) for x in\
+                            re.findall(r'(\d*M)', o[5])]) >= min_alignment_len:
+                    reads2markers[o[0]] = o[2]
+    inpf.close()
+
+    markers2reads = defdict( set )
+    for r,m in reads2markers.items():
+        markers2reads[m].add( r )
+
+    return markers2reads
+    
+    
+def maybe_generate_biom_file(pars, abundance_predictions):
+    if not pars['biom']:
+        return None
+    if not abundance_predictions:
+        return open(pars['biom'], 'w').close()
+
+    delimiter = "|" if len(pars['mdelim']) > 1 else pars['mdelim']
+    def istip(clade_name):
+        end_name = clade_name.split(delimiter)[-1]
+        return end_name.startswith("t__") or end_name.endswith("_unclassified")
+
+    def findclade(clade_name):
+        if clade_name.endswith('_unclassified'):
+            name = clade_name.split(delimiter)[-2]
+        else:
+            name = clade_name.split(delimiter)[-1]
+        return tree.all_clades[name]
+
+    def to_biomformat(clade_name):
+        return { 'taxonomy': clade_name.split(delimiter) }
+
+    clades = iter( (abundance, findclade(name)) 
+                   for (name, abundance) in abundance_predictions
+                   if istip(name) )
+    packed = iter( ([abundance], clade.get_full_name(), clade.id)
+                   for (abundance, clade) in clades )
+
+    #unpack that tuple here to stay under 80 chars on a line
+    data, clade_names, clade_ids = zip(*packed)
+    # biom likes column vectors, so we give it an array like this:
+    # np.array([a],[b],[c])
+    data = np.array(data)
+    sample_ids = [pars['sample_id']]
+    table_id='MetaPhlAn2_Analysis'
+    json_key = "MetaPhlAn2"
+
+    if LooseVersion(biom.__version__) < LooseVersion("2.0.0"):
+        biom_table = biom.table.table_factory(
+            data, sample_ids, clade_ids,
+            sample_metadata      = None,
+            observation_metadata = map(to_biomformat, clade_names),
+            table_id             = table_id,
+            constructor          = biom.table.DenseOTUTable
+        )
+        with open(pars['biom'], 'w') as outfile:
+            json.dump( biom_table.getBiomFormatObject(json_key),
+                           outfile )
+    else:  # Below is the biom2 compatible code
+        biom_table = biom.table.Table(
+            data, clade_ids, sample_ids,
+            sample_metadata      = None,
+            observation_metadata = map(to_biomformat, clade_names),
+            table_id             = table_id,
+            input_is_dense       = True
+        )
+        
+        with open(pars['biom'], 'w') as outfile:  
+            biom_table.to_json( json_key,
+                                direct_io = outfile )
+
+    return True
+
+
+if __name__ == '__main__':
+    pars = read_params( sys.argv )    
+    #if pars['inp'] is None and ( pars['input_type'] is None or  pars['input_type'] == 'automatic'): 
+    #    sys.stderr.write( "The --input_type parameter need top be specified when the "
+    #                      "input is provided from the standard input.\n"
+    #                      "Type metaphlan.py -h for more info\n")
+    #    sys.exit(0)
+
+    if pars['bt2_ps'] in [
+                          "sensitive-local",
+                          "very-sensitive-local"
+                          ]\
+        and pars['min_alignment_len'] == None:
+            pars['min_alignment_len'] = 100
+            sys.stderr.write('Warning! bt2_ps is set to local mode, '\
+                             'and min_alignment_len is None, '
+                             'I automatically set min_alignment_len to 100! '\
+                             'If you do not like, rerun the command and set '\
+                             'min_alignment_len to a specific value.\n'
+                            )
+
+    if pars['input_type'] == 'fastq':
+        pars['input_type'] = 'multifastq'
+    if pars['input_type'] == 'fasta':
+        pars['input_type'] = 'multifasta'
+
+    #if pars['input_type'] == 'automatic':
+    #    pars['input_type'] = guess_input_format( pars['inp'] )
+    #    if not pars['input_type']:
+    #        sys.stderr.write( "Sorry, I cannot guess the format of the input file, please "
+    #                          "specify the --input_type parameter \n" )
+    #        sys.exit(1) 
+
+    # check for the mpa_pkl file
+    if not os.path.isfile(pars['mpa_pkl']):
+        sys.stderr.write("Error: Unable to find the mpa_pkl file at: " + pars['mpa_pkl'] +
+                         "\nExpecting location ${mpa_dir}/db_v20/map_v20_m200.pkl "
+                         "\nSelect the file location with the option --mpa_pkl.\n"
+                         "Exiting...\n\n")
+        sys.exit(1)           
+
+    if pars['ignore_markers']:
+        with open(pars['ignore_markers']) as ignv:
+            ignore_markers = set([l.strip() for l in ignv])
+    else:
+        ignore_markers = set()
+
+    no_map = False
+    if pars['input_type'] == 'multifasta' or pars['input_type'] == 'multifastq':
+        bow = pars['bowtie2db'] is not None
+        if not bow:
+            sys.stderr.write( "No MetaPhlAn BowTie2 database provided\n "
+                              "[--bowtie2db options]!\n"
+                              "Exiting...\n\n" )
+            sys.exit(1)
+        if pars['no_map']:
+            pars['bowtie2out'] = tf.NamedTemporaryFile(dir=pars['tmp_dir']).name
+            no_map = True
+        else:
+            if bow and not pars['bowtie2out']:
+                if pars['inp'] and "," in  pars['inp']:
+                    sys.stderr.write( "Error! --bowtie2out needs to be specified when multiple "
+                                      "fastq or fasta files (comma separated) are provided"  )
+                    sys.exit(1)
+                fname = pars['inp']
+                if fname is None:
+                    fname = "stdin_map"
+                elif stat.S_ISFIFO(os.stat(fname).st_mode):
+                    fname = "fifo_map"
+                pars['bowtie2out'] = fname + ".bowtie2out.txt"
+
+            if os.path.exists( pars['bowtie2out'] ):
+                sys.stderr.write(   
+                    "BowTie2 output file detected: " + pars['bowtie2out'] + "\n"
+                    "Please use it as input or remove it if you want to "
+                    "re-perform the BowTie2 run.\n"
+                    "Exiting...\n\n" )
+                sys.exit(1)
+
+        if bow and not all([os.path.exists(".".join([str(pars['bowtie2db']),p]))
+                        for p in ["1.bt2", "2.bt2", "3.bt2","4.bt2","1.bt2","2.bt2"]]):
+            sys.stderr.write( "No MetaPhlAn BowTie2 database found "
+                              "[--bowtie2db option]! "
+                              "(or wrong path provided)."
+                              "\nExpecting location ${mpa_dir}/db_v20/map_v20_m200 "
+                              "\nExiting... " )
+            sys.exit(1)
+
+        if bow:
+            run_bowtie2( pars['inp'], pars['bowtie2out'], pars['bowtie2db'], 
+                         pars['bt2_ps'], pars['nproc'], file_format = pars['input_type'],
+                         exe = pars['bowtie2_exe'],
+                         samout = pars['samout'],
+                         min_alignment_len = pars['min_alignment_len'])
+            pars['input_type'] = 'bowtie2out'
+        
+        pars['inp'] = pars['bowtie2out'] # !!!
+
+    with open( pars['mpa_pkl'], 'rb' ) as a:
+        mpa_pkl = pickle.loads( bz2.decompress( a.read() ) )
+
+    tree = TaxTree( mpa_pkl, ignore_markers )
+    tree.set_min_cu_len( pars['min_cu_len'] )
+    tree.set_stat( pars['stat'], pars['stat_q'], pars['avoid_disqm']  )
+
+    markers2reads = map2bbh( 
+                            pars['inp'], 
+                            pars['input_type'],
+                            pars['min_alignment_len']
+                            )
+    if no_map:
+        os.remove( pars['inp'] )         
+
+    map_out = []
+    for marker,reads in markers2reads.items():
+        if marker not in tree.markers2lens:
+            continue
+        tax_seq = tree.add_reads( marker, len(reads), 
+                                  ignore_viruses = pars['ignore_viruses'],
+                                  ignore_eukaryotes = pars['ignore_eukaryotes'],
+                                  ignore_bacteria = pars['ignore_bacteria'],
+                                  ignore_archaea = pars['ignore_archaea'],
+                                  )
+        if tax_seq:
+            map_out +=["\t".join([r,tax_seq]) for r in reads]
+    
+    if pars['output'] is None and pars['output_file'] is not None:
+        pars['output'] = pars['output_file']
+
+    with (open(pars['output'],"w") if pars['output'] else sys.stdout) as outf:
+        outf.write('\t'.join((pars["sample_id_key"], pars["sample_id"])) + '\n')
+        if pars['t'] == 'reads_map':
+            outf.write( "\n".join( map_out ) + "\n" )
+        elif pars['t'] == 'rel_ab':
+            cl2ab, _ = tree.relative_abundances( 
+                        pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None )
+            outpred = [(k,round(v*100.0,5)) for k,v in cl2ab.items() if v > 0.0]
+            if outpred:
+                for k,v in sorted(  outpred, reverse=True,
+                                    key=lambda x:x[1]+(100.0*(8-x[0].count("|")))  ): 
+                    outf.write( "\t".join( [k,str(v)] ) + "\n" )   
+            else:
+                outf.write( "unclassified\t100.0\n" )
+            maybe_generate_biom_file(pars, outpred)
+        elif pars['t'] == 'rel_ab_w_read_stats':
+            cl2ab, rr = tree.relative_abundances( 
+                        pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None )
+            outpred = [(k,round(v*100.0,5)) for k,v in cl2ab.items() if v > 0.0]
+            totl = 0
+            if outpred:
+                outf.write( "\t".join( [    "#clade_name",
+                                            "relative_abundance",
+                                            "coverage",
+                                            "average_genome_length_in_the_clade",
+                                            "estimated_number_of_reads_from_the_clade" ]) +"\n" )
+
+                for k,v in sorted(  outpred, reverse=True,
+                                    key=lambda x:x[1]+(100.0*(8-x[0].count("|")))  ): 
+                    outf.write( "\t".join( [    k,
+                                                str(v),
+                                                str(rr[k][0]) if k in rr else "-",
+                                                str(rr[k][1]) if k in rr else "-",
+                                                str(int(round(rr[k][2],0)) if k in rr else "-")   
+                                                ] ) + "\n" )   
+                    if "|" not in k:
+                        totl += (int(round(rr[k][2],0)) if k in rr else 0)
+
+                outf.write( "#estimated total number of reads from known clades: " + str(totl)+"\n")
+            else:
+                outf.write( "unclassified\t100.0\n" )
+            maybe_generate_biom_file(pars, outpred)
+
+        elif pars['t'] == 'clade_profiles':
+            cl2pr = tree.clade_profiles( pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None  )
+            for c,p in cl2pr.items():
+                mn,n = zip(*p)
+                outf.write( "\t".join( [""]+[str(s) for s in mn] ) + "\n" )
+                outf.write( "\t".join( [c]+[str(s) for s in n] ) + "\n" )
+        elif pars['t'] == 'marker_ab_table':
+            cl2pr = tree.clade_profiles( pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None  )
+            for v in cl2pr.values():
+                outf.write( "\n".join(["\t".join([str(a),str(b/float(pars['nreads'])) if pars['nreads'] else str(b)]) 
+                                for a,b in v if b > 0.0]) + "\n" )
+        elif pars['t'] == 'marker_pres_table':
+            cl2pr = tree.clade_profiles( pars['tax_lev']+"__" if pars['tax_lev'] != 'a' else None  )
+            for v in cl2pr.values():
+                strout = ["\t".join([str(a),"1"]) for a,b in v if b > pars['pres_th']]
+                if strout:
+                    outf.write( "\n".join(strout) + "\n" )
+
+        elif pars['t'] == 'marker_counts':
+            outf.write( "\n".join( ["\t".join([m,str(c)]) for m,c in tree.markers2counts().items() ]) +"\n" )
+
+        elif pars['t'] == 'clade_specific_strain_tracker':
+            cl2pr = tree.clade_profiles( None, get_all = True  )
+            cl2ab, _ = tree.relative_abundances( None )
+            strout = []
+            for cl,v in cl2pr.items():
+                if cl.endswith(pars['clade']) and cl2ab[cl]*100.0 < pars['min_ab']:
+                    strout = []
+                    break
+                if pars['clade'] in cl:
+                    strout += ["\t".join([str(a),str(int(b > pars['pres_th']))]) for a,b in v]
+            if strout:
+                strout = sorted(strout,key=lambda x:x[0])
+                outf.write( "\n".join(strout) + "\n" )
+            else:
+                sys.stderr.write("Clade "+pars['clade']+" not present at an abundance >"+str(round(pars['min_ab'],2))+"%, "
+                                 "so no clade specific markers are reported\n")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metaphlan2_hutlab/metaphlan2.xml	Sun Apr 24 13:19:46 2016 -0400
@@ -0,0 +1,210 @@
+<tool id="metaphlan2_hutlab" name="MetaPhlAn2" version="2.5.0">
+  <requirements>
+       <requirement type="package" version="2.5.0">metaphlan2_hutlab</requirement>
+       <requirement type="package">bowtie2</requirement>
+   </requirements>
+
+	<description>metagenomic profiler V2</description>
+	<command interpreter="python">metaphlan2.py $input 
+		--mpa_pkl  \${METAPHLAN2_PATH}/db_v20/mpa_v20_m200.pkl    
+		--bowtie2db \${METAPHLAN2_PATH}//db_v20/mpa_v20_m200 
+		--input_type  fastq
+		--no_map 
+		--bt2_ps $PresetsForBowtie2  
+		#if $str($gchoice_post_mapping.global_choice_post_mapping)  == "1":
+			--tax_lev $gchoice_post_mapping.Taxonomic_Level
+			--min_cu_len $gchoice_post_mapping.min_cu_len
+
+			#if $str($gchoice_post_mapping.Ignore_Viruses)  == "1":
+				--ignore_viruses
+			#end if		
+			#if $str($gchoice_post_mapping.Ignore_Eukaryotes)  == "1":
+				--ignore_eukaryotes   
+			#end if	
+			#if $str($gchoice_post_mapping.Ignore_Bacteria)  == "1":
+				--ignore_bacteria
+			#end if	
+			#if $str($gchoice_post_mapping.Ignore_Archaea)  == "1":
+				--ignore_archaea
+			#end if	
+
+			--stat_q $gchoice_post_mapping.stat_q
+		#end if
+ 
+		#if $str($gchoice_additional_analysis_types.global_additional_analysis_types)  == "1":
+ 			-t $gchoice_additional_analysis_types.Analysis_Type 
+			#if  int($gchoice_additional_analysis_types.nreads) > 0:
+				--nreads $gchoice_additional_analysis_types.nreads
+			#end if	
+			
+			#if  int($gchoice_additional_analysis_types.pres_th) > 0:
+				--pres_th  $gchoice_additional_analysis_types.pres_th
+			#end if	
+			
+			#if $str($gchoice_additional_analysis_types.clade)  != " ":
+				--clade $gchoice_additional_analysis_types.clade 
+				#if  int($gchoice_additional_analysis_types.min_ab) > 0:
+						 --min_ab  $gchoice_additional_analysis_types.min_ab             
+				#end if				
+			#end if				
+			
+		#end if
+ 		-o $output 
+		
+			
+		#if $str($gchoice_biom.global_biom)  == "1":
+			--biom $biom_output
+			--mdelim $gchoice_biom.MetadataDelimiterChar
+		#end if	
+	</command>
+	<inputs>
+		<param format="fastq" name="input" type="data" label="Input metagenome (fastq of metagenomic reads, loaded with the Get Data module )"></param>
+
+		<param name="PresetsForBowtie2" type="select" format="text" >
+			<label>Sensitivity options for read-marker similarity (as described by BowTie2)</label>
+				<option value="very-sensitive">Very Sensitive</option>
+				<option value="sensitive">Sensitive</option>
+				<option value="very-sensitive-local">Very Sensitive  Local</option>
+				<option value="sensitive-local">Sensitive  Local</option>
+
+		</param>	
+		
+		
+		
+	<conditional name="gchoice_post_mapping">
+        	<param name="global_choice_post_mapping" type="select" label="Display Post Mapping Advanced Parameters"  multiple="False" help="Select Post Mapping advanced choices ">
+        		<option value="0" selected='True'>No</option>
+				<option value="1">Yes</option>
+        	</param>
+           	<when value="0">
+ 				<param name="min_cu_len" type="hidden"  value=""  />
+			</when>
+
+        	<when value="1">
+				<param name="Taxonomic_Level" type="select" value="a" format="text" >
+					<label>Taxonomic Level</label>
+						<option value="a">All taxonomic levels</option>
+						<option value="k">Kingdoms (Bacteria and Archaea) only</option>
+						<option value="p">Phyla only</option>
+						<option value="o">Orders only</option>
+						<option value="f">Families only</option>
+						<option value="g">Genera only</option>
+						<option value="s">Species only</option>
+				</param>
+				<param name="min_cu_len" type="integer" size="4"   value="2000" label="Minimum total nucleotide length for the markers in a clade "/>
+
+				<param name="Ignore_Viruses" type="select" label="Profile viral organisms" value="0" >
+						<option value="0">Yes</option>
+						<option value="1">No</option>
+				</param>
+				
+				<param name="Ignore_Eukaryotes" type="select" label="Profile eukaryotic organisms" value="0" >
+						<option value="0">Yes</option>
+						<option value="1">No</option>
+				</param>		
+				
+				<param name="Ignore_Bacteria" type="select" label="Profile bacterial organisms" value="0" >
+						<option value="0">Yes</option>
+						<option value="1">No</option>
+				</param>
+
+				<param name="Ignore_Archaea" type="select" label="Profile archeal organisms" value="0" >
+						<option value="0">Yes</option>
+						<option value="1">No</option>
+				</param>	
+
+				<param name="stat_q" type="float"     value="0.1"  min="0" max="1.0" label=" Quantile value for the robust average "/>
+				
+			</when>
+        </conditional>
+		
+	<conditional name="gchoice_additional_analysis_types">
+        	<param name="global_additional_analysis_types" type="select" label="Display additional analysis types and arguments advanced parameters"  multiple="False" help="Select additional analysis types and argument advanced choices ">
+        		<option value="0" selected='True'>No</option>
+				<option value="1">Yes</option>
+        	</param>
+           	<when value="0">
+ 				<param name="Analysis_Type" type="hidden"  value=""  />
+			</when>
+
+        	<when value="1">
+				<param name="Analysis_Type" type="select" value="rel_ab" format="text" >
+					<label>Analysis Type: Type of Analysis to perform</label>
+						<option value="rel_ab">Profiling a metagenomes in terms of relative abundances</option>
+						<option value="reads_map">Mapping from reads to clades (only reads hitting a marker)</option>
+						<option value="clade_profiles">Normalized marker counts for clades with at least a non-null marker</option>
+						<option value="marker_ab_table">normalized marker counts (only when > 0.0 and normalized by metagenome size if --nreads is specified)</option>
+						<option value="marker_pres_table">list of markers present in the sample (threshold at 1.0 if not differently specified with --pres_th</option>
+				</param>
+				
+				<param name="nreads" type="integer" size="4"   value="0" label="The total number of reads in the original metagenome "/>
+				<param name="pres_th" type="integer" size="4"  value="0"   label="Threshold for calling a marker present"/>
+				<param name="clade" label="The clade for clade specific strain tracker analysis"  value=" " type="text" format="text"/>
+				<param name="min_ab" type="integer" size="4"  value="0" min="0" max="100"  label="The minimum percentage abundance for the clade in the clade specific strain tracker analysis"/>	
+	
+			</when>
+        </conditional>
+		
+		
+		
+		
+		<conditional name="gchoice_biom">
+        	<param name="global_biom" type="select" label="Display additional biom advanced parameters"  help="Select additional biom  choices ">
+        		<option value="0" selected='True'>No</option>
+				<option value="1">Yes</option>
+        	</param>
+           	<when value="0">
+			</when>
+
+        	<when value="1">
+				<param name="MetadataDelimiterChar" label="Delimiter for bug metadata if biom file requested - defaults to pipe. e.g. the pipe in k__Bacteria|p__Proteobacteria"  value="|" type="text" format="text"/>
+			</when>
+        </conditional>
+ 
+
+	</inputs>
+	<outputs>
+		<data name="output" format="tabular" />
+        <data name="biom_output"  format="text">
+            <filter>gchoice_biom["global_biom"] == "1"</filter>
+        </data>
+	 
+
+	</outputs>
+      
+  <help>
+**MetaPhlAn**  is a computational tool for profiling the composition of microbial communities *(Bacteria, Archaea, Eukaryotes and Viruses)* from metagenomic shotgun sequencing data with species level resolution. 
+
+From version 2.0 MetaPhlAn is also able to identify specific strains (in the not-so-frequent cases in which the sample contains a previously sequenced strains) and to track strains across samples for all species.
+
+MetaPhlAn 2.0 relies on ~1M unique clade-specific marker genes identified from ~17,000 reference genomes (~13,500 bacterial and archaeal, ~3,500 viral, and ~110 eukaryotic), allowing:
+
+* Unambiguous taxonomic assignments 
+
+* Accurate estimation of organismal relative abundance 
+
+* Species-level resolution for bacteria, archaea, eukaryotes and viruses 
+
+* Strain identification and tracking
+
+* Orders of magnitude speedups compared to existing methods.
+
+EXAMPLE
+-------
+
+Here is an infographic of the application of the Human Microbiome Project results obtained applying MetaPhlAn on the 690 shotgun sequencing samples.  The image has been produced with GraPhlAn.
+
+
+.. image::    https://bytebucket.org/nsegata/metaphlan/wiki/hmptree13_nl_bb.png
+    :height: 500
+    :width: 600
+
+
+
+If you use this software, please cite :
+=======================================
+
+**Metagenomic microbial community profiling using unique clade-specific marker genes.** Nicola Segata, Levi Waldron, Annalisa Ballarini, Vagheesh Narasimhan, Olivier Jousson, Curtis Huttenhower. Nature Methods, 8, 811–814, 2012
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metaphlan2_hutlab/tool_dependencies.xml	Sun Apr 24 13:19:46 2016 -0400
@@ -0,0 +1,34 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="metaphlan2_hutlab" version="2.5.0">
+        <install version="1.0">
+            <actions>
+                <action type="shell_command">hg clone https://bitbucket.org/biobakery/metaphlan2</action>
+                <action type="move_directory_files">
+                    <source_directory>.</source_directory>
+                    <destination_directory>$INSTALL_DIR</destination_directory>
+                </action>
+                <action type="set_environment">
+                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR</environment_variable>
+                </action>
+                <action type="set_environment">
+                    <environment_variable name="METAPHLAN2_PATH" action="set_to">$INSTALL_DIR</environment_variable>
+                </action>
+
+            </actions>
+        </install>
+        <readme>
+These links provide information for the metaphlan2 package:
+http://huttenhower.sph.harvard.edu/metaphlan2
+https://groups.google.com/forum/#!forum/metaphlan-users
+        </readme>
+    </package>
+    <package name="numpy" version="1.7">
+        <repository changeset_revision="0c288abd2a1e" name="package_numpy_1_7" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="bowtie2" version="2.1.0">
+        <repository changeset_revision="017a00c265f1" name="package_bowtie2_2_1_0" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>
+
+