genome_diversity: make_phylip.py comparison

comparison make_phylip.py @ 31:a631c2f6d913

Update to Miller Lab devshed revision 3c4110ffacc3

author	Richard Burhans <burhans@bx.psu.edu>
date	Fri, 20 Sep 2013 13:25:27 -0400
parents
children	ea52b23f1141

comparison

equal deleted inserted replaced

-:4188853b940b
+:a631c2f6d913
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#       mkFastas.py
+#
+#       Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu>
+#
+#       This program is free software; you can redistribute it and/or modify
+#       it under the terms of the GNU General Public License as published by
+#       the Free Software Foundation; either version 2 of the License, or
+#       (at your option) any later version.
+#
+#       This program is distributed in the hope that it will be useful,
+#       but WITHOUT ANY WARRANTY; without even the implied warranty of
+#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#       GNU General Public License for more details.
+#
+#       You should have received a copy of the GNU General Public License
+#       along with this program; if not, write to the Free Software
+#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+#       MA 02110-1301, USA.
+import argparse
+import errno
+import os
+import shutil
def mkdir_p(path):
    """Create directory ``path`` and any missing parents (like ``mkdir -p``).

    An already existing directory is not an error.  Every other failure,
    including ``path`` existing as something that is not a directory, is
    re-raised as the original ``OSError``.
    """
    try:
        os.makedirs(path)
    except OSError as e:
        # EEXIST is also reported when ``path`` is a regular file, so it is
        # only safe to ignore when a directory is really there.
        if e.errno != errno.EEXIST or not os.path.isdir(path):
            raise
def revseq(seq):
    """Return the items of ``seq`` joined into one string, in reverse order.

    Only the order changes; nucleotides are not complemented here.
    """
    return ''.join(reversed(list(seq)))
# Complement of each upper-case IUPAC nucleotide code.  Built once at import
# time instead of on every call.
_IUPAC_COMPLEMENT = {
    'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N',
    'M': 'K', 'K': 'M', 'R': 'Y', 'Y': 'R', 'W': 'W', 'S': 'S',
    'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D',
}


def revComp(allPop):
    """Return the complement of the single IUPAC nucleotide code ``allPop``.

    Despite the name, nothing is reversed: one code goes in and its
    complement comes out.  Codes are upper case; anything not in the table
    raises ``KeyError``.
    """
    return _IUPAC_COMPLEMENT[allPop]
def rtrnCons(ntA, ntB):
    """Return the single code that stands for the pair ``ntA``/``ntB``.

    Two different nucleotides give their IUPAC ambiguity code (for example
    ``A`` and ``G`` give ``R``); a nucleotide paired with ``N`` gives the
    nucleotide itself.  Argument order does not matter.  Two identical
    values are returned unchanged.  Any other combination raises
    ``KeyError``.
    """
    if ntA == ntB:
        # Nothing to merge; this also covers the 'N'/'N' pair.
        return ntA
    srtdPairs = ''.join(sorted([ntA, ntB]))
    dpairsCons = {
        'AC': 'M', 'AG': 'R', 'AT': 'W', 'CG': 'S', 'CT': 'Y', 'GT': 'K',
        'AN': 'A', 'CN': 'C', 'GN': 'G', 'NT': 'T',
    }
    return dpairsCons[srtdPairs]
def rtrnFxdChrPos(inSNPf,dPopsinSNPfPos,pxchrx,pxpos,pxntA,pxntB,fulldChrdPosdPopsAlllsInit=False,cvrgTreshold=False,indvlsPrctTrshld=False):
    """Read a tab-separated SNP table and assign one code per individual.

    Blank lines and lines starting with ``#`` are skipped.

    Genotype mode (``fulldChrdPosdPopsAlllsInit`` is falsy): the value in
    each individual's column is read as an integer; 2 gives ``ntA``, 0
    gives ``ntB``, 1 gives ``rtrnCons(ntA, ntB)`` and anything else gives
    ``'N'``.  An individual whose column index is falsy gets ``ntA``.

    Coverage mode (``fulldChrdPosdPopsAlllsInit`` is the mapping returned
    by an earlier genotype-mode call): the value in each individual's
    column is read as an integer coverage.  An individual at or above
    ``cvrgTreshold`` keeps the code stored for that chromosome/position in
    ``fulldChrdPosdPopsAlllsInit`` (``ntA`` when the position is absent
    there); below it the code becomes ``'N'``.  An individual whose column
    index is falsy is treated as meeting the threshold.  If the fraction
    (0-1) of individuals meeting the threshold is below
    ``indvlsPrctTrshld``, every individual gets ``'N'`` at that position.

    NOTE(review): a column index of 0 is indistinguishable from "no
    column" in both modes because the checks use truthiness.

    Parameters:
        inSNPf: path of the table.
        dPopsinSNPfPos: individual name -> 0-based column index (or falsy).
        pxchrx, pxpos, pxntA, pxntB: 0-based columns holding the
            chromosome, the position and the two nucleotides.

    Returns:
        ``(dChrdPosdPopsAlllsInit, seqref, chrx, startExs, endExs)`` where
        the first item maps chromosome -> position -> individual -> code,
        ``seqref`` is the ``ntA`` column joined in position order, ``chrx``
        is the chromosome of the last data line, ``startExs`` is
        ``[smallest position]`` and ``endExs`` is ``[largest position + 1]``.

    Raises:
        ValueError: if the table contains no data lines.
    """
    dChrdPosdPopsAlllsInit = {}
    seqref = []
    chrx = None
    with open(inSNPf, 'r') as inSNPfh:
        for eachl in inSNPfh:
            if not eachl.strip() or eachl[0] == '#':
                continue
            fllInfoSplt = eachl.splitlines()[0].split('\t')
            chrx = fllInfoSplt[pxchrx]
            pos = int(fllInfoSplt[pxpos])
            ntA = fllInfoSplt[pxntA]
            ntB = fllInfoSplt[pxntB]
            seqref.append([pos, ntA])
            dPopsAllls = {}
            if fulldChrdPosdPopsAlllsInit:
                cntIndv = 0
                try:
                    fulldPopsAllls = fulldChrdPosdPopsAlllsInit[chrx][pos]
                except KeyError:
                    # Position unknown to the genotype table: fall back to ntA.
                    fulldPopsAllls = dict((echPop, ntA) for echPop in dPopsinSNPfPos)
                for eachPop in dPopsinSNPfPos:
                    clmnCvrg = dPopsinSNPfPos[eachPop]
                    if clmnCvrg:
                        eachPopCvrg = int(fllInfoSplt[clmnCvrg])
                    else:
                        # No coverage column: count as exactly at the threshold.
                        eachPopCvrg = cvrgTreshold
                    if eachPopCvrg >= cvrgTreshold:
                        dPopsAllls[eachPop] = fulldPopsAllls[eachPop]
                        cntIndv += 1
                    else:
                        dPopsAllls[eachPop] = 'N'
                if indvlsPrctTrshld > (cntIndv / float(len(dPopsinSNPfPos))):
                    dPopsAllls = dict((echPop, 'N') for echPop in dPopsinSNPfPos)
            else:
                for eachPop in dPopsinSNPfPos:
                    clmnGntyp = dPopsinSNPfPos[eachPop]
                    if clmnGntyp:
                        eachPopAll = int(fllInfoSplt[clmnGntyp])
                        if eachPopAll == 0:
                            dPopsAllls[eachPop] = ntB
                        elif eachPopAll == 2:
                            dPopsAllls[eachPop] = ntA
                        elif eachPopAll == 1:
                            dPopsAllls[eachPop] = rtrnCons(ntA, ntB)
                        else:
                            dPopsAllls[eachPop] = 'N'
                    else:
                        dPopsAllls[eachPop] = ntA
            dChrdPosdPopsAlllsInit.setdefault(chrx, {})[pos] = dPopsAllls
    if not seqref:
        raise ValueError('no data lines found in %s' % inSNPf)
    seqref.sort()
    startExs = [seqref[0][0]]
    endExs = [seqref[-1][0] + 1]
    seqref = ''.join(x[1] for x in seqref)
    return dChrdPosdPopsAlllsInit, seqref, chrx, startExs, endExs
def rtrndENSEMBLTseq(inCDSfile,inUCSCfile,fchrClmn,txStartClmn,txEndClmn,strandClmn,geneNameClmn,startExsClmn,endExsClmn,cdsStartClmn,cdsEndClmn):
    """Join a tab-separated transcript table with a FASTA file of sequences.

    Every non-blank line of ``inUCSCfile`` describes one transcript; the
    ``*Clmn`` arguments are the 0-based columns to read.  When all four of
    ``cdsStartClmn``, ``cdsEndClmn``, ``startExsClmn`` and ``endExsClmn``
    are truthy, the exon boundaries come from ``rtrnExnStarEndCorrc``;
    otherwise the transcript is treated as the single interval
    ``[txStart]``/``[txEnd]``.  A strand value of ``-`` marks the transcript
    as reverse.

    ``inCDSfile`` is split on ``>``; the whole first line of each record is
    the name and the remaining lines, joined, are the sequence.  A
    transcript is kept only when its name matches a record name exactly.
    Sequences of reverse transcripts are stored reversed (``revseq``), not
    complemented.

    Returns:
        ``(dENSEMBLTseqChrStEnEx, dChrdStrtEndENSEMBLT)`` where the first
        maps name -> ``(seq, chrx, startExs, endExs, rvrse)`` and the second
        maps chromosome -> ``{(txStart, txEnd): name}`` for every
        transcript in the table, matched or not.
    """
    dENSEMBLTchrxStEndEx = {}
    dChrdStrtEndENSEMBLT = {}
    useExnClmns = bool(cdsStartClmn and cdsEndClmn and startExsClmn and endExsClmn)
    with open(inUCSCfile, 'r') as inUCSCfh:
        for eachl in inUCSCfh:
            if not eachl.strip():
                continue
            # Drop the line terminator first; otherwise a strand or name in
            # the last column carries a trailing newline and never matches.
            allVls = eachl.rstrip('\r\n').split('\t')
            txStart = allVls[txStartClmn]
            txEnd = allVls[txEndClmn]
            ENSEMBLT = allVls[geneNameClmn]
            rvrse = allVls[strandClmn] == '-'
            chrx = allVls[fchrClmn]
            dChrdStrtEndENSEMBLT.setdefault(chrx, {})[int(txStart), int(txEnd)] = ENSEMBLT
            if useExnClmns:
                startExs, endExs = rtrnExnStarEndCorrc(
                    allVls[startExsClmn], allVls[endExsClmn],
                    allVls[cdsStartClmn], allVls[cdsEndClmn])
            else:
                startExs, endExs = [int(txStart)], [int(txEnd)]
            dENSEMBLTchrxStEndEx[ENSEMBLT] = (chrx, startExs, endExs, rvrse)
    # Record name -> sequence; a repeated name keeps its last record.
    dENSEMBLTseq = {}
    with open(inCDSfile) as inCDSfh:
        for rcrd in inCDSfh.read().split('>'):
            if rcrd.strip():
                rcrdLines = rcrd.splitlines()
                dENSEMBLTseq[rcrdLines[0]] = ''.join(rcrdLines[1:])
    dENSEMBLTseqChrStEnEx = {}
    for ENSEMBLT in dENSEMBLTchrxStEndEx:
        if ENSEMBLT not in dENSEMBLTseq:
            continue
        chrx, startExs, endExs, rvrse = dENSEMBLTchrxStEndEx[ENSEMBLT]
        seq = dENSEMBLTseq[ENSEMBLT]
        if rvrse:
            seq = revseq(seq)
        dENSEMBLTseqChrStEnEx[ENSEMBLT] = (seq, chrx, startExs, endExs, rvrse)
    return dENSEMBLTseqChrStEnEx, dChrdStrtEndENSEMBLT
def rtrnFxdChrPosinCodReg(dChrdStrtEndENSEMBLT,dChrdPosdPopsAlllsInit):
    """Keep the SNP positions that fall inside a transcript interval.

    Parameters:
        dChrdStrtEndENSEMBLT: chromosome -> ``{(start, end): name}``.
        dChrdPosdPopsAlllsInit: chromosome -> ``{position: per-individual
            codes}``.

    Chromosomes present in only one of the two mappings are ignored.  A
    position belongs to an interval when ``start <= position <= end`` (both
    ends inclusive), and it is recorded under every interval containing
    it, so overlapping transcripts each receive the SNP.  Neither input is
    modified.

    Returns:
        ``(dENSEMBLTChrPosdAlls, dChrPosdPopsAllls)`` where the first maps
        name -> ``{(chrx, pos): codes}`` and the second maps
        ``(chrx, pos) -> codes`` for every position inside any interval.
    """
    dENSEMBLTChrPosdAlls = {}
    dChrPosdPopsAllls = {}
    for chrx in set(dChrdPosdPopsAlllsInit).intersection(dChrdStrtEndENSEMBLT):
        dStrtEndENSEMBLT = dChrdStrtEndENSEMBLT[chrx]
        dPosdPopsAllls = dChrdPosdPopsAlllsInit[chrx]
        srtdStrtEndENSEMBLT = sorted(dStrtEndENSEMBLT)
        srtdPosdPopsAllls = sorted(dPosdPopsAllls)
        # Two cursors walking both sorted lists in step.
        iIntrvl = 0
        iPos = 0
        while iIntrvl < len(srtdStrtEndENSEMBLT) and iPos < len(srtdPosdPopsAllls):
            strt, end = srtdStrtEndENSEMBLT[iIntrvl]
            pos = srtdPosdPopsAllls[iPos]
            if strt <= pos <= end:
                dPopsAllls = dPosdPopsAllls[pos]
                dChrPosdPopsAllls[chrx, pos] = dPopsAllls
                # The current interval and any later one may contain pos.
                for tmpstrt, tmpend in srtdStrtEndENSEMBLT[iIntrvl:]:
                    if tmpstrt > pos:
                        # Sorted by start: nothing further can contain pos.
                        break
                    if pos <= tmpend:
                        ENSEMBLT = dStrtEndENSEMBLT[tmpstrt, tmpend]
                        dENSEMBLTChrPosdAlls.setdefault(ENSEMBLT, {})[chrx, pos] = dPopsAllls
                iPos += 1
            elif pos <= strt:
                # Position lies before the current interval: skip it.
                iPos += 1
            else:
                # Position lies past the current interval: drop the interval.
                iIntrvl += 1
    return dENSEMBLTChrPosdAlls, dChrPosdPopsAllls
def rtrnExnStarEndCorrc(startExs,endExs,cdsStart,cdsEnd):
    """Restrict exon boundaries to the interval ``cdsStart``..``cdsEnd``.

    ``startExs`` and ``endExs`` are comma-separated integer strings (empty
    fields are ignored); ``cdsStart`` and ``cdsEnd`` are converted with
    ``int``.  ``cdsStart`` is added to the starts and ``cdsEnd`` to the
    ends, then every value outside ``cdsStart <= value <= cdsEnd`` is
    dropped.

    Returns:
        ``(starts, ends)``: two ascending lists of distinct integers.
    """
    lowBound = int(cdsStart)
    highBound = int(cdsEnd)

    def _clip(csvCoords, boundary):
        # Parse, add the CDS boundary, keep only what lies inside the CDS.
        coords = [int(tok) for tok in csvCoords.split(',') if tok.strip()]
        coords.append(boundary)
        return sorted(set(c for c in coords if lowBound <= c <= highBound))

    keptStarts = _clip(startExs, lowBound)
    keptEnds = _clip(endExs, highBound)
    return keptStarts, keptEnds
def rtrndPopsFasta(seq,chrx,startExs,endExs,rvrse,dChrPosdPopsAllls,ENSEMBLT):
    """Build one sequence per individual by writing its codes into ``seq``.

    ``seq`` is cut into consecutive pieces, one per ``(start, end)`` pair of
    ``zip(startExs, endExs)``, each ``end - start`` long.  A position
    ``pos`` belongs to an exon when ``start <= pos <= end - 1``; there the
    individual's code replaces the character at offset ``pos - start`` of
    that exon's piece.  Positions outside every exon are ignored.

    When ``rvrse`` is true each code is passed through ``revComp`` before
    being written and every finished sequence is reversed with ``revseq``.
    NOTE(review): this assumes ``seq`` was already reversed by the caller
    for reverse transcripts - confirm against ``rtrndENSEMBLTseq``.

    Parameters:
        seq: sequence covering all exons back to back.
        chrx: chromosome name; replaced by the chromosome of the positions
            read from ``dChrPosdPopsAllls``.
        startExs, endExs: exon start/end coordinates.
        rvrse: whether the transcript is on the reverse strand.
        dChrPosdPopsAllls: ``(chrx, pos) -> {individual: code}``.  It is
            not modified.
        ENSEMBLT: transcript name, used only in the position lines.

    Returns:
        ``(dPopsFasta, lchrStartexEndpos)``: individual -> full sequence,
        and one tab-separated line ``name, chromosome, offset in the
        returned sequence, absolute position`` per substituted position,
        ordered by forward offset.

    Raises:
        IndexError: if there are no exons or no positions, or if a
            position's offset lies beyond the end of its exon's piece.
    """
    # Cut the transcript sequence into consecutive per-exon pieces.
    dexnIntrvlSeq = {}
    for exStart, exEnd in zip(startExs, endExs):
        lenEx = exEnd - exStart
        dexnIntrvlSeq[exStart, exEnd] = seq[:lenEx]
        seq = seq[lenEx:]
    # Sort once up front instead of re-sorting what is left at every step.
    srtdExns = sorted(dexnIntrvlSeq)
    srtdSNPs = sorted(dChrPosdPopsAllls)
    if not srtdExns or not srtdSNPs:
        raise IndexError('%s: no exons or no positions to process' % ENSEMBLT)
    iExn = 0
    iSNP = 0
    exStart, exEnd = srtdExns[iExn]
    exnSeq = dexnIntrvlSeq[exStart, exEnd]
    chrx, pos = srtdSNPs[iSNP]
    dPopsAllls = dChrPosdPopsAllls[chrx, pos]
    tmpdPopsFasta = dict((x, list(exnSeq)) for x in dPopsAllls)
    dPopsFasta = {}
    strePos = set()
    dStrePosAbsPos = {}
    tmpAcmltdPos = 0  # summed length of the exons already finished
    snpsLeft = True
    while True:
        inExn = snpsLeft and exStart <= pos <= exEnd - 1
        if inExn or (snpsLeft and pos < exStart):
            if inExn:
                relPos = tmpAcmltdPos + pos - exStart
                strePos.add(relPos)
                dStrePosAbsPos[relPos] = pos
                for echPop in tmpdPopsFasta:
                    allPop = dPopsAllls[echPop]
                    if rvrse:
                        allPop = revComp(allPop)
                    tmpdPopsFasta[echPop][pos - exStart] = allPop
            # Move to the next position, if any.
            iSNP += 1
            if iSNP < len(srtdSNPs):
                chrx, pos = srtdSNPs[iSNP]
                dPopsAllls = dChrPosdPopsAllls[chrx, pos]
            else:
                # An explicit flag rather than a far-away sentinel position,
                # so the remaining exons are always flushed.
                snpsLeft = False
        else:
            # Position is past this exon (or none are left): close the exon.
            for echPop in tmpdPopsFasta:
                dPopsFasta[echPop] = dPopsFasta.get(echPop, '') + ''.join(tmpdPopsFasta[echPop])
            tmpAcmltdPos += len(exnSeq)
            iExn += 1
            if iExn == len(srtdExns):
                break
            exStart, exEnd = srtdExns[iExn]
            exnSeq = dexnIntrvlSeq[exStart, exEnd]
            tmpdPopsFasta = dict((x, list(exnSeq)) for x in dPopsAllls)
    if rvrse:
        dPopsFasta = dict((echPop, revseq(popSeq)) for echPop, popSeq in dPopsFasta.items())
    lchrStartexEndpos = []
    for ePos in sorted(strePos):
        # After reversal an offset counts from the other end.
        outPos = tmpAcmltdPos - ePos - 1 if rvrse else ePos
        lchrStartexEndpos.append('\t'.join([ENSEMBLT, chrx, str(outPos), str(dStrePosAbsPos[ePos])]))
    return dPopsFasta, lchrStartexEndpos
def rtrnSeqVars(dENSEMBLTseqChrStEnEx,dENSEMBLTChrPosdAlls):
    """Run ``rtrndPopsFasta`` for every transcript present in both inputs.

    Parameters:
        dENSEMBLTseqChrStEnEx: name -> ``(seq, chrx, startExs, endExs,
            rvrse)``.
        dENSEMBLTChrPosdAlls: name -> ``{(chrx, pos): per-individual
            codes}``.

    Transcripts with an empty ``startExs`` or ``endExs`` are skipped.

    Returns:
        ``(dENSEMBLTPopsFasta, lchrStartexEndposAll)``: name -> the
        per-individual sequences (only when that result is non-empty), and
        the position lines of all processed transcripts concatenated.
    """
    fastaByTrnscrpt = {}
    allPosLines = []
    # Names known to both the sequence table and the SNP table.
    shared = set(dENSEMBLTChrPosdAlls.keys()) & set(dENSEMBLTseqChrStEnEx.keys())
    for ENSEMBLT in shared:
        seq, chrx, startExs, endExs, rvrse = dENSEMBLTseqChrStEnEx[ENSEMBLT]
        trnscrptSNPs = dENSEMBLTChrPosdAlls[ENSEMBLT]
        if len(startExs) == 0 or len(endExs) == 0:
            continue
        popSeqs, posLines = rtrndPopsFasta(seq, chrx, startExs, endExs, rvrse, trnscrptSNPs, ENSEMBLT)
        allPosLines += posLines
        # to correct a bug of the input table, in cases in which
        # endExons<startExn (!). See ENSCAFT00000000145 (MC4R) in canFam2
        # for example.
        if popSeqs:
            fastaByTrnscrpt[ENSEMBLT] = popSeqs
    return fastaByTrnscrpt, allPosLines
def rtrnPhy(dPopsFasta,ENSEMBLT):
    """Format each individual's sequence as a ``name<TAB>sequence`` row.

    Parameters:
        dPopsFasta: individual name -> sequence string.
        ENSEMBLT: transcript name; accepted but not used.

    Returns:
        ``(dPopsFormPhy, lenseq)``: individual name -> formatted row, and
        the length of the last sequence visited (0 for empty input).
        NOTE(review): sequences are presumably all the same length; this
        is not checked here.
    """
    dPopsFormPhy = {}
    lenseq = 0
    for eachPop in dPopsFasta:
        seq = dPopsFasta[eachPop]
        dPopsFormPhy[eachPop] = '\t'.join(['%s' % eachPop, seq])
        lenseq = len(seq)
    return dPopsFormPhy, lenseq
def wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst):
    """Write one ``<name>.phy`` file per transcript into ``outFastaFold``.

    Each file starts with a header line ``<number of rows> <sequence
    length>`` followed by one ``name<TAB>sequence`` row for every
    individual in ``sPopsIntrst``, in sorted name order.  Lines are joined
    with ``\\n`` and no trailing newline is written.  The folder is created
    if needed, and only when there is something to write.

    Parameters:
        dENSEMBLTPopsFasta: transcript name -> ``{individual: sequence}``.
        outFastaFold: output folder.
        sPopsIntrst: names of the individuals to write.

    Returns:
        0.

    Raises:
        KeyError: if an individual in ``sPopsIntrst`` has no sequence for
            some transcript.
    """
    srtdPops = sorted(sPopsIntrst)
    if dENSEMBLTPopsFasta:
        mkdir_p(outFastaFold)
    for ENSEMBLT in sorted(dENSEMBLTPopsFasta):
        dPopsFormPhy, lenseq = rtrnPhy(dENSEMBLTPopsFasta[ENSEMBLT], ENSEMBLT)
        # The header must count the rows actually written, which are the
        # individuals of interest, not every individual that has a sequence.
        seqPMLformat = ['%s %s' % (len(srtdPops), lenseq)]
        seqPMLformat.extend(dPopsFormPhy[namex] for namex in srtdPops)
        with open(os.path.join(outFastaFold, '%s.phy' % ENSEMBLT), 'w') as outFastaf:
            outFastaf.write('\n'.join(seqPMLformat))
    return 0
def _rdGdIndivs(gd_indivs):
    """Read a gd_indivs file into ``{name: 0-based column}``.

    On every non-blank line the first whitespace-separated field is taken
    as a 1-based column number and the second as the individual's name.
    """
    with open(gd_indivs) as gdIndivsfh:
        return dict((x.split()[1], int(x.split()[0]) - 1) for x in gdIndivsfh.read().splitlines() if x.strip())


def main():
    """Command-line entry point: write PHYLIP files from a gd_snp/gd_genotype table.

    Two modes are supported and at least one must be fully specified:

    * coverage mode (``--inputCover`` with ``--gd_indivs_cover``): the
      whole table is treated as one transcript named ``tmp`` whose
      sequence is built from the table itself;
    * gene mode (``--sequence`` with ``--gene_info``): one alignment per
      transcript of the gene table.

    The ``.phy`` files are first written to ``./out``.  The first one, in
    sorted path order, is moved to ``--output``; the others are moved into
    ``--output_dir`` as ``primary_<output_id>_output<N>_visible_txt_?``
    with ``N`` starting at 2.  When nothing was produced, ``--output``
    receives the single line ``No output.``.

    Returns:
        0.
    """
    #~
    #~bpython mkPhyl.py --input=colugo_mt_Galaxy_genotypes.txt --chrClmn=0 --posClmn=1 --refClmn=2 --altrClmn=3 --output=out.d --gd_indivs=genotypes.gd_indivs --inputCover=colugo_mt_Galaxy_coverage.txt --gd_indivs_cover=coverage.gd_indivs --cvrgTreshold=0 --chrClmnCvrg=0 --posClmnCvrg=1 --refClmnCvrg=2 --altrClmnCvrg=3 --indvlsPrctTrshld=0
    parser = argparse.ArgumentParser(description='Writes PHYLIP files with one sequence per individual, built by placing the alleles of a gd_snp/gd_genotype table into a reference sequence.')
    parser.add_argument('--input',metavar='input gd_snp file',type=str,help='the input file with the table in gd_snp/gd_genotype format.',required=True)
    parser.add_argument('--chrClmn',metavar='int',type=int,help='the column with the chromosome.',required=True)
    parser.add_argument('--posClmn',metavar='int',type=int,help='the column with the SNPs position.',required=True)
    parser.add_argument('--refClmn',metavar='int',type=int,help='the column with the reference nucleotide.',required=True)
    parser.add_argument('--altrClmn',metavar='int',type=int,help='the column with the derived nucleotide.',required=True)
    parser.add_argument('--output',metavar='output',type=str,help='the output',required=True)
    parser.add_argument('--output_id',metavar='int',type=int,help='the output id',required=True)
    parser.add_argument('--output_dir',metavar='output folder sequences',type=str,help='the output folder with the sequences.',required=True)
    parser.add_argument('--gd_indivs',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input file.',required=True)
    #~
    parser.add_argument('--inputCover',metavar='input gd_snp cover file',type=str,help='the input file with the table in gd_snp/gd_genotype cover format.',required=False,default=False)
    parser.add_argument('--gd_indivs_cover',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input cover file.',required=False,default=False)
    parser.add_argument('--cvrgTreshold',metavar='input coverage threshold',type=int,help='the coverage threshold above which nucleotides are included, else "N".',required=False,default=False)
    parser.add_argument('--chrClmnCvrg',metavar='int',type=int,help='the column with the chromosome in the input coverage file.',required=False,default=False)
    parser.add_argument('--posClmnCvrg',metavar='int',type=int,help='the column with the SNPs position in the input coverage file.',required=False,default=False)
    parser.add_argument('--refClmnCvrg',metavar='int',type=int,help='the column with the reference nucleotide in the input coverage file.',required=False,default=False)
    parser.add_argument('--altrClmnCvrg',metavar='int',type=int,help='the column with the derived nucleotide in the input coverage file.',required=False,default=False)
    parser.add_argument('--indvlsPrctTrshld',metavar='int',type=float,help='the percentage of individual above which nucleotides are included, else "N".',required=False,default=False)
    #~
    parser.add_argument('--sequence',metavar='input fasta file',type=str,help='the input file with the sequence whose SNPs are in the input.',required=False,default=False)
    parser.add_argument('--gene_info',metavar='input interval file',type=str,help='the input interval file with the the information on the genes.',required=False,default=False)
    parser.add_argument('--fchrClmn',metavar='int',type=int,help='the column with the chromosome in the gene_info file.',required=False,default=False)
    parser.add_argument('--txStartClmn',metavar='int',type=int,help='the column with the transcript start column in the gene_info file.',required=False,default=False)
    parser.add_argument('--txEndClmn',metavar='int',type=int,help='the column with the transcript end column in the gene_info file.',required=False,default=False)
    parser.add_argument('--strandClmn',metavar='int',type=int,help='the column with the strand column in the gene_info file.',required=False,default=False)
    parser.add_argument('--geneNameClmn',metavar='int',type=int,help='the column with the gene name column in the gene_info file.',required=False,default=False)
    parser.add_argument('--cdsStartClmn',metavar='int',type=int,help='the column with the coding start column in the gene_info file.',required=False,default=False)
    parser.add_argument('--cdsEndClmn',metavar='int',type=int,help='the column with the coding end column in the gene_info file.',required=False,default=False)
    parser.add_argument('--startExsClmn',metavar='int',type=int,help='the column with the exon start positions column in the gene_info file.',required=False,default=False)
    parser.add_argument('--endExsClmn',metavar='int',type=int,help='the column with the exon end positions column in the gene_info file.',required=False,default=False)
    args = parser.parse_args()
    inSNPf = args.input
    outfile = args.output
    outfile_id = args.output_id
    outFastaFold = './out'
    files_dir = args.output_dir
    gd_indivs = args.gd_indivs
    pxchrx = args.chrClmn
    pxpos = args.posClmn
    pxntA = args.refClmn
    pxntB = args.altrClmn
    inCDSfile = args.sequence
    inUCSCfile = args.gene_info
    fchrClmn = args.fchrClmn#chromosome column
    txStartClmn = args.txStartClmn#transcript start column
    txEndClmn = args.txEndClmn#transcript end column
    strandClmn = args.strandClmn#strand column
    geneNameClmn = args.geneNameClmn#gene name column
    cdsStartClmn = args.cdsStartClmn#coding sequence start column
    cdsEndClmn = args.cdsEndClmn#coding sequence end column
    startExsClmn = args.startExsClmn#exons start column
    endExsClmn = args.endExsClmn#exons end column
    inputCover = args.inputCover
    gd_indivs_cover = args.gd_indivs_cover
    cvrgTreshold = args.cvrgTreshold
    pxchrxCov = args.chrClmnCvrg
    pxposCov = args.posClmnCvrg
    pxntACov = args.refClmnCvrg
    pxntBCov = args.altrClmnCvrg
    indvlsPrctTrshld = args.indvlsPrctTrshld
    # Reject unusable argument combinations explicitly; an ``assert`` would
    # vanish under ``python -O`` and gives the user no explanation.
    coverOk = inputCover and gd_indivs_cover and cvrgTreshold>=0 and indvlsPrctTrshld>=0
    genesOk = inCDSfile and inUCSCfile
    if not (coverOk or genesOk):
        parser.error('give either --inputCover with --gd_indivs_cover (and non-negative --cvrgTreshold and --indvlsPrctTrshld) or --sequence with --gene_info')
    #~
    dPopsinSNPfPos = _rdGdIndivs(gd_indivs)
    #~
    sPopsIntrst = set(dPopsinSNPfPos.keys())
    # 1. Getting fixed alleles information...
    dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs = rtrnFxdChrPos(inSNPf,dPopsinSNPfPos,pxchrx,pxpos,pxntA,pxntB)
    #~
    if inputCover and gd_indivs_cover and cvrgTreshold>=0:
        # Individuals missing from the coverage file keep a False column.
        dPopsinSNPfPos_cover = dict((eachPop,False) for eachPop in dPopsinSNPfPos)
        dPopsinSNPfPos_cover.update(_rdGdIndivs(gd_indivs_cover))
        dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs = rtrnFxdChrPos(inputCover,dPopsinSNPfPos_cover,pxchrxCov,pxposCov,pxntACov,pxntBCov,dChrdPosdPopsAlllsInit,cvrgTreshold,indvlsPrctTrshld)
        rvrse = False
        # The whole table becomes a single forward transcript named 'tmp'.
        dENSEMBLTseqChrStEnEx = {'tmp':(seqref,chrx,startExs,endExs,rvrse)}
        dChrdStrtEndENSEMBLT = {chrx:{(startExs[0],endExs[0]):'tmp'}}
    #~
    elif inCDSfile and inUCSCfile:
        # 2. Getting transcripts and exons information...
        dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT = rtrndENSEMBLTseq(inCDSfile,inUCSCfile,fchrClmn,txStartClmn,txEndClmn,strandClmn,geneNameClmn,startExsClmn,endExsClmn,cdsStartClmn,cdsEndClmn)
    #~
    # 3. Getting fixed alleles in exons...
    dENSEMBLTChrPosdAlls,dChrPosdPopsAllls = rtrnFxdChrPosinCodReg(dChrdStrtEndENSEMBLT,dChrdPosdPopsAlllsInit)
    #~
    # 4. Getting fasta sequences of populations...
    dENSEMBLTPopsFasta,lchrStartexEndposAll = rtrnSeqVars(dENSEMBLTseqChrStEnEx,dENSEMBLTChrPosdAlls)
    #~
    wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst)
    #~
    ## get a list of output files
    files = []
    for dirpath, dirnames, filenames in os.walk(outFastaFold):
        for fname in filenames:
            if fname.endswith('.phy'):
                files.append(os.path.join(dirpath, fname))
        # Do not descend into sub-folders.
        del dirnames[:]
    # os.walk gives no ordering guarantee; sort so that the choice of the
    # primary output is reproducible.
    files.sort()
    if len(files) == 0:
        with open(outfile, 'w') as ofh:
            ofh.write('No output.\n')
    else:
        ## the first file becomes the output
        shutil.move(files.pop(0), outfile)
        ## rename/move the rest of the files
        for i, fname in enumerate(files):
            new_filename = 'primary_{0}_output{1}_visible_txt_?'.format(outfile_id, i+2)
            new_pathname = os.path.join(files_dir, new_filename)
            shutil.move(fname, new_pathname)
    return 0


if __name__ == '__main__':
    main()

Mercurial > repos > miller-lab > genome_diversity

comparison make_phylip.py @ 31:a631c2f6d913