Mercurial > repos > miller-lab > genome_diversity
diff make_phylip.py @ 31:a631c2f6d913
Update to Miller Lab devshed revision 3c4110ffacc3
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Fri, 20 Sep 2013 13:25:27 -0400 |
parents | |
children | ea52b23f1141 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/make_phylip.py Fri Sep 20 13:25:27 2013 -0400 @@ -0,0 +1,511 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# mkFastas.py +# +# Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. + +import argparse +import errno +import os +import shutil + +def mkdir_p(path): + try: + os.makedirs(path) + except OSError, e: + if e.errno <> errno.EEXIST: + raise + +def revseq(seq): + seq=list(seq) + seq.reverse() + seq=''.join(seq) + return seq + +def revComp(allPop): + dAllCompAll={'A':'T','T':'A','C':'G','G':'C','N':'N','M':'K','K':'M','R':'Y','Y':'R','W':'W','S':'S'} + allPopsComp=dAllCompAll[allPop] + return allPopsComp + +def rtrnCons(ntA,ntB): + srtdPairs=''.join(sorted([ntA,ntB])) + dpairsCons={'AC':'M', 'AG':'R', 'AT':'W', 'CG':'S', 'CT':'Y', 'GT':'K', 'AN':'A', 'CN':'C', 'GN':'G', 'NT':'T'} + cons=dpairsCons[srtdPairs] + return cons + +def rtrnFxdChrPos(inSNPf,dPopsinSNPfPos,pxchrx,pxpos,pxntA,pxntB,fulldChrdPosdPopsAlllsInit=False,cvrgTreshold=False,indvlsPrctTrshld=False): + """ + """ + dChrdPosdPopsAlllsInit={} + seqref=[] + for eachl in open(inSNPf,'r'): + if eachl.strip() and eachl[0]!='#': + fllInfoSplt=eachl.splitlines()[0].split('\t') + chrx=fllInfoSplt[pxchrx] + pos=int(fllInfoSplt[pxpos]) + ntA=fllInfoSplt[pxntA] + ntB=fllInfoSplt[pxntB] + seqref.append([pos,ntA]) + dPopsAllls={} + if fulldChrdPosdPopsAlllsInit: + #~ + cntIndv=0 + # + try: + fulldPopsAllls=fulldChrdPosdPopsAlllsInit[chrx][pos] + except: + fulldPopsAllls=dict([(echPop,ntA) for echPop in dPopsinSNPfPos]) + # + for eachPop in dPopsinSNPfPos: + clmnCvrg=dPopsinSNPfPos[eachPop] + if clmnCvrg: + eachPopCvrg=int(fllInfoSplt[clmnCvrg]) + else: + #~ eachPopCvrg=0 + eachPopCvrg=cvrgTreshold + if eachPopCvrg>=cvrgTreshold: + dPopsAllls[eachPop]=fulldPopsAllls[eachPop] + cntIndv+=1 + else: + dPopsAllls[eachPop]='N' + #~ + if indvlsPrctTrshld>(cntIndv/float(len(dPopsinSNPfPos))): + dPopsAllls=dict([(echPop,'N') for echPop in dPopsinSNPfPos]) + #~ + else: + for eachPop in dPopsinSNPfPos: + if dPopsinSNPfPos[eachPop]: + eachPopAll=int(fllInfoSplt[dPopsinSNPfPos[eachPop]]) + if eachPopAll==0: + dPopsAllls[eachPop]=ntB + elif eachPopAll==2: + dPopsAllls[eachPop]=ntA + elif eachPopAll==1: + dPopsAllls[eachPop]=rtrnCons(ntA,ntB) + else: + dPopsAllls[eachPop]='N' + else: + dPopsAllls[eachPop]=ntA + try: + dChrdPosdPopsAlllsInit[chrx][pos]=dPopsAllls + except: + dChrdPosdPopsAlllsInit[chrx]={pos:dPopsAllls} + #~ + seqref.sort() + startExs=[seqref[0][0]] + endExs=[seqref[-1][0]+1] + seqref=''.join(x[1] for x in seqref) + #~ + return dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs + + +def rtrndENSEMBLTseq(inCDSfile,inUCSCfile,fchrClmn,txStartClmn,txEndClmn,strandClmn,geneNameClmn,startExsClmn,endExsClmn,cdsStartClmn,cdsEndClmn): + """ + """ + dENSEMBLTchrxStEndEx={} + dChrdStrtEndENSEMBLT={} + for eachl in open(inUCSCfile,'r'): + if eachl.strip(): + rvrse=False + allVls=eachl.split('\t') + txStart=allVls[txStartClmn] + txEnd=allVls[txEndClmn] + ENSEMBLT=allVls[geneNameClmn] + strand=allVls[strandClmn] + chrx=allVls[fchrClmn] + if cdsStartClmn and cdsEndClmn: + cdsStart=allVls[cdsStartClmn] + cdsEnd=allVls[cdsEndClmn] + if startExsClmn and endExsClmn: + startExs=allVls[startExsClmn] + endExs=allVls[endExsClmn] + if strand=='-': + rvrse=True + try: + dChrdStrtEndENSEMBLT[chrx][int(txStart),int(txEnd)]=ENSEMBLT + except: + try: + dChrdStrtEndENSEMBLT[chrx]={(int(txStart),int(txEnd)):ENSEMBLT} + except: + dChrdStrtEndENSEMBLT={chrx:{(int(txStart),int(txEnd)):ENSEMBLT}} + #~ + if cdsStartClmn and cdsEndClmn and startExsClmn and endExsClmn: + startExs,endExs=rtrnExnStarEndCorrc(startExs,endExs,cdsStart,cdsEnd) + else: + startExs,endExs=[int(txStart)],[int(txEnd)] + dENSEMBLTchrxStEndEx[ENSEMBLT]=(chrx,startExs,endExs,rvrse) + #~ + dENSEMBLTseq={} + ENSEMBLTseqs=[(x.splitlines()[0],''.join(x.splitlines()[1:])) for x in open(inCDSfile).read().split('>') if x.strip()] + for ENSEMBLT,seq in ENSEMBLTseqs: + dENSEMBLTseq[ENSEMBLT]=seq + #~ + dENSEMBLTseqChrStEnEx={} + for ENSEMBLT in dENSEMBLTchrxStEndEx: + chrx,startExs,endExs,rvrse=dENSEMBLTchrxStEndEx[ENSEMBLT] + addEseqChrStEnEx=True + try: + seq=dENSEMBLTseq[ENSEMBLT] + if rvrse: + seq=revseq(seq) + except: + addEseqChrStEnEx=False + if addEseqChrStEnEx: + dENSEMBLTseqChrStEnEx[ENSEMBLT]=(seq,chrx,startExs,endExs,rvrse) + return dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT + + +def rtrnFxdChrPosinCodReg(dChrdStrtEndENSEMBLT,dChrdPosdPopsAlllsInit): + """ + """ + dENSEMBLTChrPosdAlls={} + dChrPosdPopsAllls={} + todel=set(dChrdPosdPopsAlllsInit.keys()).difference(set(dChrdStrtEndENSEMBLT.keys())) + for x in todel: + x=dChrdPosdPopsAlllsInit.pop(x) + #--- + while len(dChrdPosdPopsAlllsInit)>0: + chrx=dChrdPosdPopsAlllsInit.keys()[0] + dStrtEndENSEMBLT=dChrdStrtEndENSEMBLT.pop(chrx) + dPosdPopsAllls=dChrdPosdPopsAlllsInit.pop(chrx) + #~ + srtdStrtEndENSEMBLT=sorted(dStrtEndENSEMBLT.keys()) + srtdPosdPopsAllls=sorted(dPosdPopsAllls.keys()) + #~ + pos=srtdPosdPopsAllls.pop(0) + strt,end=srtdStrtEndENSEMBLT.pop(0) + ENSEMBLT=dStrtEndENSEMBLT[strt,end] + dPopsAllls=dPosdPopsAllls[pos] + keePloop=True + #~ + while keePloop: + if strt<=pos<=end: + for tmpstrt,tmpend in [(strt,end)]+srtdStrtEndENSEMBLT: + if tmpstrt<=pos<=tmpend: + dPopsAllls=dPosdPopsAllls[pos] + dChrPosdPopsAllls[chrx,pos]=dPopsAllls + try: + dENSEMBLTChrPosdAlls[ENSEMBLT][chrx,pos]=dPopsAllls + except: + dENSEMBLTChrPosdAlls[ENSEMBLT]={(chrx,pos):dPopsAllls} + else: + continue + if len(srtdPosdPopsAllls)>0: + pos=srtdPosdPopsAllls.pop(0) + dPopsAllls=dPosdPopsAllls[pos] + else: + keePloop=False + #~ + elif pos<=strt: + if len(srtdPosdPopsAllls)>0: + pos=srtdPosdPopsAllls.pop(0) + dPopsAllls=dPosdPopsAllls[pos] + else: + keePloop=False + else: + if len(srtdStrtEndENSEMBLT)>0: + strt,end=srtdStrtEndENSEMBLT.pop(0) + ENSEMBLT=dStrtEndENSEMBLT[strt,end] + else: + keePloop=False + return dENSEMBLTChrPosdAlls,dChrPosdPopsAllls + +def rtrnExnStarEndCorrc(startExs,endExs,cdsStart,cdsEnd): + """ + """ + cdsStart,cdsEnd=int(cdsStart),int(cdsEnd) + crrctdstartExs=set([int(x) for x in startExs.split(',') if x.strip()]) + crrctdendExs=set([int(x) for x in endExs.split(',') if x.strip()]) + crrctdstartExs.add(cdsStart) + crrctdendExs.add(cdsEnd) + sStartDel=set() + sEndDel=set() + #~ + for echvl in crrctdstartExs: + if echvl<cdsStart or echvl>cdsEnd: + sStartDel.add(echvl) + #~ + for echvl in crrctdendExs: + if echvl<cdsStart or echvl>cdsEnd: + sEndDel.add(echvl) + #~ + return sorted(crrctdstartExs.difference(sStartDel)),sorted(crrctdendExs.difference(sEndDel)) + +def rtrndPopsFasta(seq,chrx,startExs,endExs,rvrse,dChrPosdPopsAllls,ENSEMBLT): + """ + """ + exnIntrvl=zip(startExs,endExs) + CDSinitPos=exnIntrvl[0][0] + dexnIntrvlSeq={} + for exStart,exEnd in exnIntrvl: + lenEx=exEnd-exStart + dexnIntrvlSeq[exStart,exEnd]=seq[:lenEx] + seq=seq[lenEx:] + + ldexnIntrvlSeq=len(dexnIntrvlSeq) + #~ + dPopsFasta={} + #~ + strePos=set() + dStrePosAbsPos={} + tmpAcmltdPos=0 + #~ + exStart,exEnd=sorted(dexnIntrvlSeq.keys())[0] + seq=dexnIntrvlSeq.pop((exStart,exEnd)) + chrx,pos=sorted(dChrPosdPopsAllls.keys())[0] + dPopsAllls=dChrPosdPopsAllls.pop((chrx,pos)) + tmpdPopsFasta=dict([(x,list(seq)) for x in dPopsAllls]) + cntExns=0 + while True: + if exStart<=pos<=exEnd-1: + relPos=tmpAcmltdPos+pos-exStart + strePos.add(relPos) + dStrePosAbsPos[relPos]=pos + for echPop in tmpdPopsFasta: + allPop=dPopsAllls[echPop] + if rvrse: + allPop=revComp(allPop) + tmpdPopsFasta[echPop][pos-exStart]=allPop + if len(dChrPosdPopsAllls)>0: + chrx,pos=sorted(dChrPosdPopsAllls.keys())[0] + dPopsAllls=dChrPosdPopsAllls.pop((chrx,pos)) + else: + pos=endExs[-1]+100#max pos of exns + elif pos<exStart: + if len(dChrPosdPopsAllls)>0: + chrx,pos=sorted(dChrPosdPopsAllls.keys())[0] + dPopsAllls=dChrPosdPopsAllls.pop((chrx,pos)) + else: + pos=endExs[-1]+100#max pos of exns + elif pos>exEnd-1:# or len(dChrPosdPopsAllls)==0: + for echPop in tmpdPopsFasta: + try: + dPopsFasta[echPop]+=''.join(tmpdPopsFasta[echPop]) + except: + dPopsFasta[echPop]=''.join(tmpdPopsFasta[echPop]) + cntExns+=1 + tmpAcmltdPos+=len(seq) + if len(dexnIntrvlSeq)>0: + exStart,exEnd=sorted(dexnIntrvlSeq.keys())[0] + seq=dexnIntrvlSeq.pop((exStart,exEnd)) + tmpdPopsFasta=dict([(x,list(seq)) for x in dPopsAllls]) + else: + break + if ldexnIntrvlSeq!=cntExns: + for echPop in tmpdPopsFasta: + dPopsFasta[echPop]+=''.join(tmpdPopsFasta[echPop]) + #~ + lchrStartexEndpos=[] + if rvrse: + dPopsFasta=dict([(echPop,revseq(dPopsFasta[echPop])) for echPop in dPopsFasta])#[echPop]+=''.join(tmpdPopsFasta[echPop]) + for ePos in strePos: + lchrStartexEndpos.append('\t'.join([ENSEMBLT,chrx,str(tmpAcmltdPos-ePos-1),str(dStrePosAbsPos[ePos])])) + else: + for ePos in strePos: + lchrStartexEndpos.append('\t'.join([ENSEMBLT,chrx,str(ePos),str(dStrePosAbsPos[ePos])])) + #~ + return dPopsFasta,lchrStartexEndpos + +def rtrnSeqVars(dENSEMBLTseqChrStEnEx,dENSEMBLTChrPosdAlls): + """ + """ + dENSEMBLTPopsFasta={} + lchrStartexEndposAll=[] + #~ + sENSEMBLTcmmn=set(dENSEMBLTChrPosdAlls.keys()).intersection(set(dENSEMBLTseqChrStEnEx.keys()))#sENSEMBLTcmmn between UCSC and ENSEMBLE + #~ + for ENSEMBLT in sENSEMBLTcmmn: + seq,chrx,startExs,endExs,rvrse=dENSEMBLTseqChrStEnEx[ENSEMBLT] + dChrPosdPopsAllls=dENSEMBLTChrPosdAlls[ENSEMBLT] + if len(startExs)>0 and len(endExs)>0: + dPopsFasta,lchrStartexEndpos=rtrndPopsFasta(seq,chrx,startExs,endExs,rvrse,dChrPosdPopsAllls,ENSEMBLT) + lchrStartexEndposAll.extend(lchrStartexEndpos) + if dPopsFasta:#to correct a bug of the input table, in cases in which endExons<startExn (!). See ENSCAFT00000000145 (MC4R) in canFam2 for example. + dENSEMBLTPopsFasta[ENSEMBLT]=dPopsFasta + return dENSEMBLTPopsFasta,lchrStartexEndposAll + + + +def rtrnPhy(dPopsFasta,ENSEMBLT): + """ + """ + dPopsFormPhy={} + for eachPop in dPopsFasta: + hader='%s'%eachPop + #~ hader='>%s'%eachPop + seq=dPopsFasta[eachPop] + formtd='\t'.join([hader,seq]) + #~ formtd='\n'.join([hader,seq]) + dPopsFormPhy[eachPop]=formtd + #~ + return dPopsFormPhy,len(seq) + +def wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst): + """ + """ + ENSEMBLTKaKs=[] + nonHeader=True + cnt=0 + lENSEMBLT=len(dENSEMBLTPopsFasta) + #~ + for ENSEMBLT in sorted(dENSEMBLTPopsFasta.keys()): + cnt+=1 + dPopsFasta=dENSEMBLTPopsFasta[ENSEMBLT] + dPopsFormPhy,lenseq=rtrnPhy(dPopsFasta,ENSEMBLT) + #~ + seqPMLformat=['%s %s'%(len(dPopsFormPhy),lenseq)]#generate new PHYML sequence + #~ seqPMLformat=[]#generate new PHYML sequence + for namex in sorted(sPopsIntrst): + seqPMLformat.append(dPopsFormPhy[namex]) + #~ + mkdir_p(outFastaFold) + outFastaf=os.path.join(outFastaFold,'%s.phy'%ENSEMBLT) + outFastaf=open(outFastaf,'w') + outFastaf.write('\n'.join(seqPMLformat)) + outFastaf.close() + #~ + return 0 + +def main(): + #~ + #~bpython mkPhyl.py --input=colugo_mt_Galaxy_genotypes.txt --chrClmn=0 --posClmn=1 --refClmn=2 --altrClmn=3 --output=out.d --gd_indivs=genotypes.gd_indivs --inputCover=colugo_mt_Galaxy_coverage.txt --gd_indivs_cover=coverage.gd_indivs --cvrgTreshold=0 --chrClmnCvrg=0 --posClmnCvrg=1 --refClmnCvrg=2 --altrClmnCvrg=3 --indvlsPrctTrshld=0 + parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).') + parser.add_argument('--input',metavar='input gd_snp file',type=str,help='the input file with the table in gd_snp/gd_genotype format.',required=True) + parser.add_argument('--chrClmn',metavar='int',type=int,help='the column with the chromosome.',required=True) + parser.add_argument('--posClmn',metavar='int',type=int,help='the column with the SNPs position.',required=True) + parser.add_argument('--refClmn',metavar='int',type=int,help='the column with the reference nucleotide.',required=True) + parser.add_argument('--altrClmn',metavar='int',type=int,help='the column with the derived nucleotide.',required=True) + parser.add_argument('--output',metavar='output',type=str,help='the output',required=True) + parser.add_argument('--output_id',metavar='int',type=int,help='the output id',required=True) + parser.add_argument('--output_dir',metavar='output folder sequences',type=str,help='the output folder with the sequences.',required=True) + parser.add_argument('--gd_indivs',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input file.',required=True) + #~ + parser.add_argument('--inputCover',metavar='input gd_snp cover file',type=str,help='the input file with the table in gd_snp/gd_genotype cover format.',required=False,default=False) + parser.add_argument('--gd_indivs_cover',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input cover file.',required=False,default=False) + parser.add_argument('--cvrgTreshold',metavar='input coverage threshold',type=int,help='the coverage threshold above which nucleotides are included, else "N".',required=False,default=False) + parser.add_argument('--chrClmnCvrg',metavar='int',type=int,help='the column with the chromosome in the input coverage file.',required=False,default=False) + parser.add_argument('--posClmnCvrg',metavar='int',type=int,help='the column with the SNPs position in the input coverage file.',required=False,default=False) + parser.add_argument('--refClmnCvrg',metavar='int',type=int,help='the column with the reference nucleotide in the input coverage file.',required=False,default=False) + parser.add_argument('--altrClmnCvrg',metavar='int',type=int,help='the column with the derived nucleotide in the input coverage file.',required=False,default=False) + parser.add_argument('--indvlsPrctTrshld',metavar='int',type=float,help='the percentage of individual above which nucleotides are included, else "N".',required=False,default=False) + #~ + parser.add_argument('--sequence',metavar='input fasta file',type=str,help='the input file with the sequence whose SNPs are in the input.',required=False,default=False) + parser.add_argument('--gene_info',metavar='input interval file',type=str,help='the input interval file with the the information on the genes.',required=False,default=False) + parser.add_argument('--fchrClmn',metavar='int',type=int,help='the column with the chromosome in the gene_info file.',required=False,default=False) + parser.add_argument('--txStartClmn',metavar='int',type=int,help='the column with the transcript start column in the gene_info file.',required=False,default=False) + parser.add_argument('--txEndClmn',metavar='int',type=int,help='the column with the transcript end column in the gene_info file.',required=False,default=False) + parser.add_argument('--strandClmn',metavar='int',type=int,help='the column with the strand column in the gene_info file.',required=False,default=False) + parser.add_argument('--geneNameClmn',metavar='int',type=int,help='the column with the gene name column in the gene_info file.',required=False,default=False) + parser.add_argument('--cdsStartClmn',metavar='int',type=int,help='the column with the coding start column in the gene_info file.',required=False,default=False) + parser.add_argument('--cdsEndClmn',metavar='int',type=int,help='the column with the coding end column in the gene_info file.',required=False,default=False) + parser.add_argument('--startExsClmn',metavar='int',type=int,help='the column with the exon start positions column in the gene_info file.',required=False,default=False) + parser.add_argument('--endExsClmn',metavar='int',type=int,help='the column with the exon end positions column in the gene_info file.',required=False,default=False) + + args = parser.parse_args() + + inSNPf = args.input + outfile = args.output + outfile_id = args.output_id + outFastaFold = './out' + files_dir = args.output_dir + gd_indivs = args.gd_indivs + pxchrx = args.chrClmn + pxpos = args.posClmn + pxntA = args.refClmn + pxntB = args.altrClmn + + + inCDSfile = args.sequence + inUCSCfile = args.gene_info + fchrClmn = args.fchrClmn#chromosome column + txStartClmn = args.txStartClmn#transcript start column + txEndClmn = args.txEndClmn#transcript end column + strandClmn = args.strandClmn#strand column + geneNameClmn = args.geneNameClmn#gene name column + cdsStartClmn = args.cdsStartClmn#coding sequence start column + cdsEndClmn = args.cdsEndClmn#coding sequence end column + startExsClmn = args.startExsClmn#exons start column + endExsClmn = args.endExsClmn#exons end column + + inputCover = args.inputCover + gd_indivs_cover = args.gd_indivs_cover + cvrgTreshold = args.cvrgTreshold + pxchrxCov = args.chrClmnCvrg + pxposCov = args.posClmnCvrg + pxntACov = args.refClmnCvrg + pxntBCov = args.altrClmnCvrg + indvlsPrctTrshld = args.indvlsPrctTrshld + + #print inputCover, gd_indivs_cover, cvrgTreshold + + assert ((inputCover and gd_indivs_cover and cvrgTreshold>=0 and indvlsPrctTrshld>=0) or (inCDSfile and inUCSCfile)) + + #~ + dPopsinSNPfPos=dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs).read().splitlines() if x.strip()]) + #~ dPopsinSNPfPos.update({'ref':False}) + #~ + sPopsIntrst=set(dPopsinSNPfPos.keys()) + dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inSNPf,dPopsinSNPfPos,pxchrx,pxpos,pxntA,pxntB)#~ print '1. Getting fixed alleles information...' + #~ dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT=rtrndENSEMBLTseq(inCDSfile,inUCSCfile) + #~ + if inputCover and gd_indivs_cover and cvrgTreshold>=0: + dPopsinSNPfPos_cover=dict([(eachPop,False) for eachPop in dPopsinSNPfPos.keys()]) + dPopsinSNPfPos_cover.update(dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs_cover).read().splitlines() if x.strip()])) + dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inputCover,dPopsinSNPfPos_cover,pxchrxCov,pxposCov,pxntACov,pxntBCov,dChrdPosdPopsAlllsInit,cvrgTreshold,indvlsPrctTrshld) + rvrse=False + dENSEMBLTseqChrStEnEx={'tmp':(seqref,chrx,startExs,endExs,rvrse)} + dChrdStrtEndENSEMBLT={chrx:{(startExs[0],endExs[0]):'tmp'}} + #~ + elif inCDSfile and inUCSCfile: + dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT=rtrndENSEMBLTseq(inCDSfile,inUCSCfile,fchrClmn,txStartClmn,txEndClmn,strandClmn,geneNameClmn,startExsClmn,endExsClmn,cdsStartClmn,cdsEndClmn)#~ print '2. Getting transcripts and exons information...' + #~ + dENSEMBLTChrPosdAlls,dChrPosdPopsAllls=rtrnFxdChrPosinCodReg(dChrdStrtEndENSEMBLT,dChrdPosdPopsAlllsInit)#~ print '3. Getting fixed alleles in exons...' + #~ + dENSEMBLTPopsFasta,lchrStartexEndposAll=rtrnSeqVars(dENSEMBLTseqChrStEnEx,dENSEMBLTChrPosdAlls)#~ print '4. Getting fasta sequences of populations...' + #~ + wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst) + #~ + + + ## get a lit of output files + files = [] + for dirpath, dirnames, filenames in os.walk(outFastaFold): + for file in filenames: + if file.endswith('.phy'): + files.append( os.path.join(dirpath, file) ) + del dirnames[:] + + if len(files) == 0: + with open(outfile, 'w') as ofh: + print >> ofh, 'No output.' + else: + ## the first file becomes the output + file = files.pop(0) + shutil.move(file, outfile) + + ## rename/move the rest of the files + for i, file in enumerate(files): + new_filename = 'primary_{0}_output{1}_visible_txt_?'.format(outfile_id, i+2) + new_pathname = os.path.join(files_dir, new_filename) + shutil.move(file, new_pathname) + + return 0 + +if __name__ == '__main__': + main()