diff tools/rgenetics/rgfakePhe.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgfakePhe.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,469 @@
+"""
+fakephe.py
+ross lazarus sept 30 2007
+This is available under the LGPL as defined then.
+
+use the pedigree data for ids
+
+use pythons generators to literally generate a bunch of random phenotype measurements
+
+Plink format at http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#pheno
+is
+
+To specify an alternate phenotype for analysis, i.e. other than the one in the *.ped file (or, if using a binary fileset, the
+*.fam file), use the --pheno option:
+plink --file mydata --pheno pheno.txt
+
+where pheno.txt is a file that contains 3 columns (one row per individual):
+
+     Family ID
+     Individual ID
+     Phenotype
+
+NOTE The original PED file must still contain a phenotype in column 6 (even if this is a dummy phenotype, e.g. all missing),
+unless the --no-pheno flag is given.
+
+If an individual is in the original file but not listed in the alternate phenotype file, that person's phenotype will be set to
+missing. If a person is in the alternate phenotype file but not in the original file, that entry will be ignored. The order of
+the alternate phenotype file need not be the same as for the original file.
+
+If the phenotype file contains more than one phenotype, then use the --mpheno N option to specify the Nth phenotype is the one
+to be used:
+plink --file mydata --pheno pheno2.txt --mpheno 4
+
+where pheno2.txt contains 5 different phenotypes (i.e. 7 columns in total), this command will use the 4th for analysis
+(phenotype D):
+
+     Family ID
+     Individual ID
+     Phenotype A
+     Phenotype B
+     Phenotype C
+     Phenotype D
+     Phenotype E
+
+Alternatively, your alternate phenotype file can have a header row, in which case you can use variable names to specify which
+phenotype to use. If you have a header row, the first two variables must be labelled FID and IID. All subsequent variable names
+cannot have any whitespace in them. For example,
+
+     FID    IID      qt1   bmi    site
+     F1     1110     2.3   22.22  2
+     F2     2202     34.12 18.23  1
+     ...
+
+then
+plink --file mydata --pheno pheno2.txt --pheno-name bmi --assoc
+
+will select the second phenotype labelled "bmi", for analysis
+
+Finally, if there is more than one phenotype, then for basic association tests, it is possible to specify that all phenotypes
+be tested, sequentially, with the output sent to different files: e.g. if bigpheno.raw contains 10,000 phenotypes, then
+plink --bfile mydata --assoc --pheno bigpheno.raw --all-pheno
+
+will loop over all of these, one at a time testing for association with SNP, generating a lot of output. You might want to use
+the --pfilter command in this case, to only report results with a p-value less than a certain value, e.g. --pfilter 1e-3.
+
+WARNING Currently, all phenotypes must be numerically coded, including missing values, in the alternate phenotype file. The
+default missing value is -9, change this with --missing-phenotype, but it must be a numeric value still (in contrast to the
+main phenotype in the PED/FAM file. This issue will be fixed in future releases.
+Covariate files
+
+===========================
+rgfakePhe.xml
+<tool id="fakePhe1" name="Fake phenos">
+  <description>for multiple null fake phenotype</description>
+  <command interpreter="python2.4">rgfakePhe.py $input1 '$title1' $out_file1 $log_file1 $script_file</command>
+   <inputs>
+    <page>
+    <param name="input1"
+         type="library" format="lped"
+         label="Pedigree from Dataset"/>
+    <param name="title1" type="text"
+         value="My fake phenos" size="60"
+         label="Title for outputs"/>
+    </page>
+    <page>
+    <repeat name="fakePhe" title="Phenotypes to Fake">
+        <param name="pName" type="text" label="Phenotype Name">
+        </param>
+      <conditional name="series">
+        <param name="phetype" type="select" label="Phenotype Type">
+          <option value="rnorm" selected="true">Random normal variate</option>
+          <option value="cat">Random categorical</option>
+        </param>
+        <when value="rnorm">
+          <param name="Mean" type="float" value="0.0" label="Mean">
+          </param>
+          <param name="SD" type="float" label="SD" value="1.0">
+          </param>
+        </when>
+        <when value="cat">
+          <param name="values" type="text" value="1,2,3,fiddle" label="comma separated values to choose from">
+          </param>
+        </when>
+      </conditional>
+    </repeat>
+    </page>
+</inputs>
+<configfiles>
+<configfile name="script_file">
+#for $n, $f in enumerate($fakePhe)
+#if $f.series.phetype=='rnorm'
+{'pN':'$f.pName','pT':'$f.series.phetype','pP':"{'Mean':'$f.series.Mean', 'SD':'$f.series.SD'}"}
+#elif $f.series.phetype=='cat'
+{'pN':'$f.pName','pT':'$f.series.phetype','pP':"{'values':'$f.series.values'}"}
+#end if
+#end for
+</configfile>
+</configfiles>
+
+ <outputs>
+    <data format="pphe" name="out_file1" />
+    <data format="text" name="log_file1" parent="out_file1"/>
+  </outputs>
+
+<help>
+.. class:: infomark
+
+This tool allows you to generate an arbitrary (sort of)
+synthetic phenotype file with measurements drawn from normal,
+gamma, or categorical distributions. These are for testing under
+the null hypothesis of no association - the values are random but
+from user specified distributions.
+
+-----
+
+.. class:: warningmark
+
+This tool is very experimental
+
+-----
+
+- **Pedigree** is a library pedigree file - the id's will be used in the synthetic null phenotypes
+- **Title** is a name to give to the output phenotype file
+
+On the next page, you can add an unlimited number of various kinds of phenotypes including choices for
+categorical ones or distributions with specific parameters
+
+Just keep adding new ones until you're done then use the Execute button to run the generation
+
+
+
+
+**Example from another tool to keep you busy and in awe**
+
+Input file::
+
+    1   68  4.1
+    2   71  4.6
+    3   62  3.8
+    4   75  4.4
+    5   58  3.2
+    6   60  3.1
+    7   67  3.8
+    8   68  4.1
+    9   71  4.3
+    10  69  3.7
+
+Create a two series XY plot on the above data:
+
+- Series 1: Red Dashed-Line plot between columns 1 and 2
+- Series 2: Blue Circular-Point plot between columns 3 and 2
+
+.. image:: ./static/images/xy_example.jpg
+</help>
+</tool>
+
+
+
+"""
+
+import random,copy,sys,os,time,string,math
+
+doFbatphe = False
+
+galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
+<title></title>
+<link rel="stylesheet" href="/static/style/base.css" type="text/css" />
+</head>
+<body>
+<div class="document">
+"""
+
+
+def timenow():
+    """return current time as a string
+    """
+    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
+
+
+def poisson(lamb=2):
+    """
+    algorithm poisson random number (Knuth):
+    init:
+         Let L = e^-lamb, k = 0 and p = 1.
+    do:
+         k = k + 1.
+         Generate uniform random number u in [0,1] and let p = p u.
+    while p >= L.
+    return k - 1.
+    """
+    lamb = float(lamb)
+    l = math.e**(-lamb)
+    k=0
+    p=1
+    while 1:
+        while p >= l:
+            k += 1
+            u = random.uniform(0.0,1.0)
+            p *= u
+        yield '%e' % (k - 1)
+
+def gammaPhe(alpha=1,beta=1):
+    """generator for random values from a gamma
+    """
+    while 1: # an iterator for a random phenotype
+        dat = random.gammavariate(float(alpha),float(beta))
+        yield '%e' % dat
+
+def weibullPhe(alpha=1,beta=1):
+    """generator for random values from a weibull distribution
+    """
+    while 1: # an iterator for a random phenotype
+        dat = random.weibullvariate(float(alpha),float(beta))
+        yield '%e' % dat
+
+def normPhe(mean=0,sd=1):
+    """generator for random values from a normal distribution
+    """
+    while 1:# an iterator for a random phenotype
+        dat = random.normalvariate(float(mean),float(sd))
+        yield '%e' % dat
+
+def expoPhe(mean=1):
+    """generator for random values from an exponential distribution
+    """
+    lamb = 1.0/float(mean)
+    while 1:# an iterator for a random phenotype
+        dat = random.expovariate(lamb)
+        yield '%e' % dat
+
+
+def catPhe(values='1,2,3'):
+    """ Schrodinger's of course.
+    """
+    v = values.split(',')
+    while 1:# an iterator for a random phenotype
+        dat = random.choice(v)
+        yield dat
+
+def uniPhe(low=0.0,hi=1.0):
+    """generator for a uniform distribution
+       what if low=-5 and hi=-2
+    """
+    low = float(low)
+    hi = float(hi)
+    while 1: # unif phenotype
+        v = random.uniform(low,hi) # 0-1
+        yield '%e' % v
+
+def getIds(indir='foo'):
+    """read identifiers - first 2 cols from a pedigree file or fam file
+    """
+    inpath = os.path.split(indir)[0] # get root
+    infam = '%s.fam' % indir
+    inped = '%s.ped' % indir # if there's a ped
+    flist = os.listdir(inpath)
+    if len(flist) == 0:
+        print >> sys.stderr, '### Error - input path %s is empty' % indir
+        sys.exit(1)
+    if os.path.exists(infam):
+        pfname = infam
+    elif os.path.exists(inped):
+        pfname = inped
+    else:
+        print >> sys.stderr, '### Error - input path %s contains no ped or fam' % indir
+        sys.exit(1)   
+    f = file(pfname,'r')
+    ids = []
+    for l in f:
+        ll = l.split()
+        if len(ll) > 5: # ok line?
+            ids.append(ll[:2])
+    return ids
+
+def makePhe(phes = [],ids=[]):
+    """Create the null phenotype values and append them to the case id
+    phes is the (generator, headername) for each column
+    for a phe file, ids are the case identifiers for the phenotype file
+    res contains the final strings for the file
+    each column is derived by iterating
+    over the generator actions set up by makePhes
+    """
+    header = ['FID','IID'] # for plink
+    res = copy.copy(ids)
+    for (f,fname) in phes:
+        header.append(fname)
+        for n,subject in enumerate(ids):
+            res[n].append(f.next()) # generate the right value
+    res.insert(0,header)
+    res = [' '.join(x) for x in res] # must be space delim for fbat
+    return res
+
+def makePhes(pheTypes=[], pheNames=[], pheParams=[]):
+    """set up phes for makePhe
+    each has to have an iterator (action) and a name
+    """
+    action = None
+    phes = []
+    for n,pt in enumerate(pheTypes):
+        name = pheNames[n]
+        if pt == 'rnorm':
+            m = pheParams[n].get('Mean',0.0)
+            s = pheParams[n].get('SD',1.0)
+            action = normPhe(mean=m,sd=s) # so we can just iterate over action
+        elif pt == 'rgamma':
+            m = pheParams[n].get('Alpha',0.0)
+            s = pheParams[n].get('Beta',1.0)
+            action = gammaPhe(alpha=m,beta=s)
+        if pt == 'exponential':
+            m = pheParams[n].get('Mean',1.0)
+            action = expoPhe(mean=m) # so we can just iterate over action
+        elif pt == 'weibull':
+            m = pheParams[n].get('Alpha',0.0)
+            s = pheParams[n].get('Beta',1.0)
+            action = weibullPhe(alpha=m,beta=s)
+        elif pt == 'cat':
+            v = pheParams[n].get('values',['?',])
+            action = catPhe(values=v)
+        elif pt == 'unif':
+            low = pheParams[n].get('low',0.0)
+            hi = pheParams[n].get('hi',1.0)
+            action = uniPhe(low=low,hi=hi)
+        elif pt == 'poisson':
+            lamb = pheParams[n].get('lamb',1.0)
+            action = poisson(lamb=lamb)
+        phes.append((action,name))
+    return phes
+
+def doImport(outfile='test',flist=[],expl='',mylog=[]):
+    """ import into one of the new html composite data types for Rgenetics
+        Dan Blankenberg with mods by Ross Lazarus
+        October 2007
+    """
+    outf = open(outfile,'w')
+    progname = os.path.basename(sys.argv[0])
+    outf.write(galhtmlprefix % progname)
+    if len(flist) > 0:
+        outf.write('<ol>\n')
+        for i, data in enumerate( flist ):
+           outf.write('<li><a href="%s">%s %s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1],expl))
+        outf.write('</ol>\n')
+    else:
+           outf.write('No files found')
+    outf.write('<hr/><h4>Log of run follows:</h4><br/><pre>%s\n</pre><br/><hr/>' % ('\n'.join(mylog)))
+    outf.write("</div></body></html>\n")
+    outf.close()
+
+
+def test():
+    """test case
+    need to get these out of a galaxy form - series of pages - get types
+    on first screen, names on second, params on third?
+    holy shit. this actually works I think
+    """
+    pT = ['rnorm','rnorm','rnorm','rnorm','cat','unif']
+    pN = ['SysBP','DiaBP','HtCM','WtKG','Race','age']
+    pP = [{'Mean':'120','SD':'10'},{'Mean':'90','SD':'15'},{'Mean':'160','SD':'20'},{'Mean':'60','SD':'20'}, \
+          {'values':'Blink,What,Yours,green'},{'low':16,'hi':99}]
+    phes = makePhes(pheTypes=pT, pheNames=pN, pheParams=pP)
+    ids = []
+    for i in range(10):
+        ids.append(['fid%d' % i,'iid%d' % i])
+    pheres = makePhe(phes=phes,ids=ids)
+    res = [''.join(x) for x in pheres]
+    print '\n'.join(res)
+
+
+
+if __name__ == "__main__":
+    """
+   <command interpreter="python">rgfakePhe.py '$infile1.extra_files_path/$infile1.metadata.base_name'
+   "$title1" '$ppheout' '$ppheout.files_path' '$script_file '
+   </command>
+    The xml file for this tool is complex, and allows any arbitrary number of
+    phenotype columns to be specified from a couple of optional types - rnorm, cat
+    are working now.
+
+    Note that we create new files in their respective library directories and link to them in the output file
+    so they can be displayed and downloaded separately
+
+    """
+    killme = string.punctuation + string.whitespace
+    trantab = string.maketrans(killme,'_'*len(killme))
+    progname = os.path.basename(sys.argv[0])
+    cl = '## at %s, %s got cl= %s' % (timenow(),progname,' '.join(sys.argv))
+    print >> sys.stdout,cl
+    if len(sys.argv) < 5:
+        test()
+    else:
+        inped = sys.argv[1]
+        title = sys.argv[2].translate(trantab)
+        ppheout = sys.argv[3]
+        pphe_path = sys.argv[4]
+        scriptfile = sys.argv[5]
+        ind = file(scriptfile,'r').readlines()
+        mylog = []
+        s = '## %s starting at %s<br/>\n' % (progname,timenow())
+        mylog.append(s)
+        mylog.append(cl)
+        s = '## params = %s<br/>\n' % (' '.join(sys.argv[1:]))
+        mylog.append(s)
+        s = '\n'.join(ind)
+        mylog.append('Script file %s contained %s<br/>\n' % (scriptfile,s))
+        pT = []
+        pN = []
+        pP = []
+        for l in ind:
+            l = l.strip()
+            if len(l) > 1:
+                adict = eval(l)
+                pT.append(adict.get('pT',None))
+                pN.append(adict.get('pN',None))
+                pP.append(eval(adict.get('pP',None)))
+        s = '## pt,pn,pp=%s,%s,%s<br/>\n' % (str(pT),str(pN),str(pP))
+        mylog.append(s)
+        phes = makePhes(pheTypes=pT, pheNames=pN, pheParams=pP)
+        ids = getIds(indir=inped) # for labelling rows
+        pheres = makePhe(phes=phes,ids=ids) # random values from given distributions
+        try:
+            os.makedirs(pphe_path)
+        except:
+            pass
+        outname = os.path.join(pphe_path,title)
+        pphefname = '%s.pphe' % outname
+        f = file(pphefname, 'w')
+        f.write('\n'.join(pheres))
+        f.write('\n')
+        f.close()
+        if doFbatphe:
+            try:
+                os.makedirs(fphe_path)
+            except:
+                pass
+            outname = os.path.join(fphe_path,title)
+            fphefname = '%s.phe' % outname
+            f = file(fphefname, 'w')
+            header = pheres[0].split()
+            pheres[0] = ' '.join(header[2:])# remove iid fid from header for fbat
+            f.write('\n'.join(pheres))
+            f.close()
+            doImport(outfile=fpheout,flist=[fphefname,],expl='(FBAT phenotype format)',mylog=mylog)
+        #doImport(outfile='test',flist=[],expl='',mylog=[]):
+        doImport(outfile=ppheout,flist=[pphefname,],expl='(Plink phenotype format)',mylog=mylog)
+