Mercurial > repos > miller-lab > genome_diversity
comparison make_phylip.py @ 31:a631c2f6d913
Update to Miller Lab devshed revision 3c4110ffacc3
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Fri, 20 Sep 2013 13:25:27 -0400 |
parents | |
children | ea52b23f1141 |
comparison
equal
deleted
inserted
replaced
30:4188853b940b | 31:a631c2f6d913 |
---|---|
1 #!/usr/bin/env python | |
2 # -*- coding: utf-8 -*- | |
3 # | |
4 # mkFastas.py | |
5 # | |
6 # Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu> | |
7 # | |
8 # This program is free software; you can redistribute it and/or modify | |
9 # it under the terms of the GNU General Public License as published by | |
10 # the Free Software Foundation; either version 2 of the License, or | |
11 # (at your option) any later version. | |
12 # | |
13 # This program is distributed in the hope that it will be useful, | |
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 # GNU General Public License for more details. | |
17 # | |
18 # You should have received a copy of the GNU General Public License | |
19 # along with this program; if not, write to the Free Software | |
20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
21 # MA 02110-1301, USA. | |
22 | |
23 import argparse | |
24 import errno | |
25 import os | |
26 import shutil | |
27 | |
28 def mkdir_p(path): | |
29 try: | |
30 os.makedirs(path) | |
31 except OSError, e: | |
32 if e.errno <> errno.EEXIST: | |
33 raise | |
34 | |
35 def revseq(seq): | |
36 seq=list(seq) | |
37 seq.reverse() | |
38 seq=''.join(seq) | |
39 return seq | |
40 | |
41 def revComp(allPop): | |
42 dAllCompAll={'A':'T','T':'A','C':'G','G':'C','N':'N','M':'K','K':'M','R':'Y','Y':'R','W':'W','S':'S'} | |
43 allPopsComp=dAllCompAll[allPop] | |
44 return allPopsComp | |
45 | |
46 def rtrnCons(ntA,ntB): | |
47 srtdPairs=''.join(sorted([ntA,ntB])) | |
48 dpairsCons={'AC':'M', 'AG':'R', 'AT':'W', 'CG':'S', 'CT':'Y', 'GT':'K', 'AN':'A', 'CN':'C', 'GN':'G', 'NT':'T'} | |
49 cons=dpairsCons[srtdPairs] | |
50 return cons | |
51 | |
52 def rtrnFxdChrPos(inSNPf,dPopsinSNPfPos,pxchrx,pxpos,pxntA,pxntB,fulldChrdPosdPopsAlllsInit=False,cvrgTreshold=False,indvlsPrctTrshld=False): | |
53 """ | |
54 """ | |
55 dChrdPosdPopsAlllsInit={} | |
56 seqref=[] | |
57 for eachl in open(inSNPf,'r'): | |
58 if eachl.strip() and eachl[0]!='#': | |
59 fllInfoSplt=eachl.splitlines()[0].split('\t') | |
60 chrx=fllInfoSplt[pxchrx] | |
61 pos=int(fllInfoSplt[pxpos]) | |
62 ntA=fllInfoSplt[pxntA] | |
63 ntB=fllInfoSplt[pxntB] | |
64 seqref.append([pos,ntA]) | |
65 dPopsAllls={} | |
66 if fulldChrdPosdPopsAlllsInit: | |
67 #~ | |
68 cntIndv=0 | |
69 # | |
70 try: | |
71 fulldPopsAllls=fulldChrdPosdPopsAlllsInit[chrx][pos] | |
72 except: | |
73 fulldPopsAllls=dict([(echPop,ntA) for echPop in dPopsinSNPfPos]) | |
74 # | |
75 for eachPop in dPopsinSNPfPos: | |
76 clmnCvrg=dPopsinSNPfPos[eachPop] | |
77 if clmnCvrg: | |
78 eachPopCvrg=int(fllInfoSplt[clmnCvrg]) | |
79 else: | |
80 #~ eachPopCvrg=0 | |
81 eachPopCvrg=cvrgTreshold | |
82 if eachPopCvrg>=cvrgTreshold: | |
83 dPopsAllls[eachPop]=fulldPopsAllls[eachPop] | |
84 cntIndv+=1 | |
85 else: | |
86 dPopsAllls[eachPop]='N' | |
87 #~ | |
88 if indvlsPrctTrshld>(cntIndv/float(len(dPopsinSNPfPos))): | |
89 dPopsAllls=dict([(echPop,'N') for echPop in dPopsinSNPfPos]) | |
90 #~ | |
91 else: | |
92 for eachPop in dPopsinSNPfPos: | |
93 if dPopsinSNPfPos[eachPop]: | |
94 eachPopAll=int(fllInfoSplt[dPopsinSNPfPos[eachPop]]) | |
95 if eachPopAll==0: | |
96 dPopsAllls[eachPop]=ntB | |
97 elif eachPopAll==2: | |
98 dPopsAllls[eachPop]=ntA | |
99 elif eachPopAll==1: | |
100 dPopsAllls[eachPop]=rtrnCons(ntA,ntB) | |
101 else: | |
102 dPopsAllls[eachPop]='N' | |
103 else: | |
104 dPopsAllls[eachPop]=ntA | |
105 try: | |
106 dChrdPosdPopsAlllsInit[chrx][pos]=dPopsAllls | |
107 except: | |
108 dChrdPosdPopsAlllsInit[chrx]={pos:dPopsAllls} | |
109 #~ | |
110 seqref.sort() | |
111 startExs=[seqref[0][0]] | |
112 endExs=[seqref[-1][0]+1] | |
113 seqref=''.join(x[1] for x in seqref) | |
114 #~ | |
115 return dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs | |
116 | |
117 | |
118 def rtrndENSEMBLTseq(inCDSfile,inUCSCfile,fchrClmn,txStartClmn,txEndClmn,strandClmn,geneNameClmn,startExsClmn,endExsClmn,cdsStartClmn,cdsEndClmn): | |
119 """ | |
120 """ | |
121 dENSEMBLTchrxStEndEx={} | |
122 dChrdStrtEndENSEMBLT={} | |
123 for eachl in open(inUCSCfile,'r'): | |
124 if eachl.strip(): | |
125 rvrse=False | |
126 allVls=eachl.split('\t') | |
127 txStart=allVls[txStartClmn] | |
128 txEnd=allVls[txEndClmn] | |
129 ENSEMBLT=allVls[geneNameClmn] | |
130 strand=allVls[strandClmn] | |
131 chrx=allVls[fchrClmn] | |
132 if cdsStartClmn and cdsEndClmn: | |
133 cdsStart=allVls[cdsStartClmn] | |
134 cdsEnd=allVls[cdsEndClmn] | |
135 if startExsClmn and endExsClmn: | |
136 startExs=allVls[startExsClmn] | |
137 endExs=allVls[endExsClmn] | |
138 if strand=='-': | |
139 rvrse=True | |
140 try: | |
141 dChrdStrtEndENSEMBLT[chrx][int(txStart),int(txEnd)]=ENSEMBLT | |
142 except: | |
143 try: | |
144 dChrdStrtEndENSEMBLT[chrx]={(int(txStart),int(txEnd)):ENSEMBLT} | |
145 except: | |
146 dChrdStrtEndENSEMBLT={chrx:{(int(txStart),int(txEnd)):ENSEMBLT}} | |
147 #~ | |
148 if cdsStartClmn and cdsEndClmn and startExsClmn and endExsClmn: | |
149 startExs,endExs=rtrnExnStarEndCorrc(startExs,endExs,cdsStart,cdsEnd) | |
150 else: | |
151 startExs,endExs=[int(txStart)],[int(txEnd)] | |
152 dENSEMBLTchrxStEndEx[ENSEMBLT]=(chrx,startExs,endExs,rvrse) | |
153 #~ | |
154 dENSEMBLTseq={} | |
155 ENSEMBLTseqs=[(x.splitlines()[0],''.join(x.splitlines()[1:])) for x in open(inCDSfile).read().split('>') if x.strip()] | |
156 for ENSEMBLT,seq in ENSEMBLTseqs: | |
157 dENSEMBLTseq[ENSEMBLT]=seq | |
158 #~ | |
159 dENSEMBLTseqChrStEnEx={} | |
160 for ENSEMBLT in dENSEMBLTchrxStEndEx: | |
161 chrx,startExs,endExs,rvrse=dENSEMBLTchrxStEndEx[ENSEMBLT] | |
162 addEseqChrStEnEx=True | |
163 try: | |
164 seq=dENSEMBLTseq[ENSEMBLT] | |
165 if rvrse: | |
166 seq=revseq(seq) | |
167 except: | |
168 addEseqChrStEnEx=False | |
169 if addEseqChrStEnEx: | |
170 dENSEMBLTseqChrStEnEx[ENSEMBLT]=(seq,chrx,startExs,endExs,rvrse) | |
171 return dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT | |
172 | |
173 | |
174 def rtrnFxdChrPosinCodReg(dChrdStrtEndENSEMBLT,dChrdPosdPopsAlllsInit): | |
175 """ | |
176 """ | |
177 dENSEMBLTChrPosdAlls={} | |
178 dChrPosdPopsAllls={} | |
179 todel=set(dChrdPosdPopsAlllsInit.keys()).difference(set(dChrdStrtEndENSEMBLT.keys())) | |
180 for x in todel: | |
181 x=dChrdPosdPopsAlllsInit.pop(x) | |
182 #--- | |
183 while len(dChrdPosdPopsAlllsInit)>0: | |
184 chrx=dChrdPosdPopsAlllsInit.keys()[0] | |
185 dStrtEndENSEMBLT=dChrdStrtEndENSEMBLT.pop(chrx) | |
186 dPosdPopsAllls=dChrdPosdPopsAlllsInit.pop(chrx) | |
187 #~ | |
188 srtdStrtEndENSEMBLT=sorted(dStrtEndENSEMBLT.keys()) | |
189 srtdPosdPopsAllls=sorted(dPosdPopsAllls.keys()) | |
190 #~ | |
191 pos=srtdPosdPopsAllls.pop(0) | |
192 strt,end=srtdStrtEndENSEMBLT.pop(0) | |
193 ENSEMBLT=dStrtEndENSEMBLT[strt,end] | |
194 dPopsAllls=dPosdPopsAllls[pos] | |
195 keePloop=True | |
196 #~ | |
197 while keePloop: | |
198 if strt<=pos<=end: | |
199 for tmpstrt,tmpend in [(strt,end)]+srtdStrtEndENSEMBLT: | |
200 if tmpstrt<=pos<=tmpend: | |
201 dPopsAllls=dPosdPopsAllls[pos] | |
202 dChrPosdPopsAllls[chrx,pos]=dPopsAllls | |
203 try: | |
204 dENSEMBLTChrPosdAlls[ENSEMBLT][chrx,pos]=dPopsAllls | |
205 except: | |
206 dENSEMBLTChrPosdAlls[ENSEMBLT]={(chrx,pos):dPopsAllls} | |
207 else: | |
208 continue | |
209 if len(srtdPosdPopsAllls)>0: | |
210 pos=srtdPosdPopsAllls.pop(0) | |
211 dPopsAllls=dPosdPopsAllls[pos] | |
212 else: | |
213 keePloop=False | |
214 #~ | |
215 elif pos<=strt: | |
216 if len(srtdPosdPopsAllls)>0: | |
217 pos=srtdPosdPopsAllls.pop(0) | |
218 dPopsAllls=dPosdPopsAllls[pos] | |
219 else: | |
220 keePloop=False | |
221 else: | |
222 if len(srtdStrtEndENSEMBLT)>0: | |
223 strt,end=srtdStrtEndENSEMBLT.pop(0) | |
224 ENSEMBLT=dStrtEndENSEMBLT[strt,end] | |
225 else: | |
226 keePloop=False | |
227 return dENSEMBLTChrPosdAlls,dChrPosdPopsAllls | |
228 | |
229 def rtrnExnStarEndCorrc(startExs,endExs,cdsStart,cdsEnd): | |
230 """ | |
231 """ | |
232 cdsStart,cdsEnd=int(cdsStart),int(cdsEnd) | |
233 crrctdstartExs=set([int(x) for x in startExs.split(',') if x.strip()]) | |
234 crrctdendExs=set([int(x) for x in endExs.split(',') if x.strip()]) | |
235 crrctdstartExs.add(cdsStart) | |
236 crrctdendExs.add(cdsEnd) | |
237 sStartDel=set() | |
238 sEndDel=set() | |
239 #~ | |
240 for echvl in crrctdstartExs: | |
241 if echvl<cdsStart or echvl>cdsEnd: | |
242 sStartDel.add(echvl) | |
243 #~ | |
244 for echvl in crrctdendExs: | |
245 if echvl<cdsStart or echvl>cdsEnd: | |
246 sEndDel.add(echvl) | |
247 #~ | |
248 return sorted(crrctdstartExs.difference(sStartDel)),sorted(crrctdendExs.difference(sEndDel)) | |
249 | |
250 def rtrndPopsFasta(seq,chrx,startExs,endExs,rvrse,dChrPosdPopsAllls,ENSEMBLT): | |
251 """ | |
252 """ | |
253 exnIntrvl=zip(startExs,endExs) | |
254 CDSinitPos=exnIntrvl[0][0] | |
255 dexnIntrvlSeq={} | |
256 for exStart,exEnd in exnIntrvl: | |
257 lenEx=exEnd-exStart | |
258 dexnIntrvlSeq[exStart,exEnd]=seq[:lenEx] | |
259 seq=seq[lenEx:] | |
260 | |
261 ldexnIntrvlSeq=len(dexnIntrvlSeq) | |
262 #~ | |
263 dPopsFasta={} | |
264 #~ | |
265 strePos=set() | |
266 dStrePosAbsPos={} | |
267 tmpAcmltdPos=0 | |
268 #~ | |
269 exStart,exEnd=sorted(dexnIntrvlSeq.keys())[0] | |
270 seq=dexnIntrvlSeq.pop((exStart,exEnd)) | |
271 chrx,pos=sorted(dChrPosdPopsAllls.keys())[0] | |
272 dPopsAllls=dChrPosdPopsAllls.pop((chrx,pos)) | |
273 tmpdPopsFasta=dict([(x,list(seq)) for x in dPopsAllls]) | |
274 cntExns=0 | |
275 while True: | |
276 if exStart<=pos<=exEnd-1: | |
277 relPos=tmpAcmltdPos+pos-exStart | |
278 strePos.add(relPos) | |
279 dStrePosAbsPos[relPos]=pos | |
280 for echPop in tmpdPopsFasta: | |
281 allPop=dPopsAllls[echPop] | |
282 if rvrse: | |
283 allPop=revComp(allPop) | |
284 tmpdPopsFasta[echPop][pos-exStart]=allPop | |
285 if len(dChrPosdPopsAllls)>0: | |
286 chrx,pos=sorted(dChrPosdPopsAllls.keys())[0] | |
287 dPopsAllls=dChrPosdPopsAllls.pop((chrx,pos)) | |
288 else: | |
289 pos=endExs[-1]+100#max pos of exns | |
290 elif pos<exStart: | |
291 if len(dChrPosdPopsAllls)>0: | |
292 chrx,pos=sorted(dChrPosdPopsAllls.keys())[0] | |
293 dPopsAllls=dChrPosdPopsAllls.pop((chrx,pos)) | |
294 else: | |
295 pos=endExs[-1]+100#max pos of exns | |
296 elif pos>exEnd-1:# or len(dChrPosdPopsAllls)==0: | |
297 for echPop in tmpdPopsFasta: | |
298 try: | |
299 dPopsFasta[echPop]+=''.join(tmpdPopsFasta[echPop]) | |
300 except: | |
301 dPopsFasta[echPop]=''.join(tmpdPopsFasta[echPop]) | |
302 cntExns+=1 | |
303 tmpAcmltdPos+=len(seq) | |
304 if len(dexnIntrvlSeq)>0: | |
305 exStart,exEnd=sorted(dexnIntrvlSeq.keys())[0] | |
306 seq=dexnIntrvlSeq.pop((exStart,exEnd)) | |
307 tmpdPopsFasta=dict([(x,list(seq)) for x in dPopsAllls]) | |
308 else: | |
309 break | |
310 if ldexnIntrvlSeq!=cntExns: | |
311 for echPop in tmpdPopsFasta: | |
312 dPopsFasta[echPop]+=''.join(tmpdPopsFasta[echPop]) | |
313 #~ | |
314 lchrStartexEndpos=[] | |
315 if rvrse: | |
316 dPopsFasta=dict([(echPop,revseq(dPopsFasta[echPop])) for echPop in dPopsFasta])#[echPop]+=''.join(tmpdPopsFasta[echPop]) | |
317 for ePos in strePos: | |
318 lchrStartexEndpos.append('\t'.join([ENSEMBLT,chrx,str(tmpAcmltdPos-ePos-1),str(dStrePosAbsPos[ePos])])) | |
319 else: | |
320 for ePos in strePos: | |
321 lchrStartexEndpos.append('\t'.join([ENSEMBLT,chrx,str(ePos),str(dStrePosAbsPos[ePos])])) | |
322 #~ | |
323 return dPopsFasta,lchrStartexEndpos | |
324 | |
325 def rtrnSeqVars(dENSEMBLTseqChrStEnEx,dENSEMBLTChrPosdAlls): | |
326 """ | |
327 """ | |
328 dENSEMBLTPopsFasta={} | |
329 lchrStartexEndposAll=[] | |
330 #~ | |
331 sENSEMBLTcmmn=set(dENSEMBLTChrPosdAlls.keys()).intersection(set(dENSEMBLTseqChrStEnEx.keys()))#sENSEMBLTcmmn between UCSC and ENSEMBLE | |
332 #~ | |
333 for ENSEMBLT in sENSEMBLTcmmn: | |
334 seq,chrx,startExs,endExs,rvrse=dENSEMBLTseqChrStEnEx[ENSEMBLT] | |
335 dChrPosdPopsAllls=dENSEMBLTChrPosdAlls[ENSEMBLT] | |
336 if len(startExs)>0 and len(endExs)>0: | |
337 dPopsFasta,lchrStartexEndpos=rtrndPopsFasta(seq,chrx,startExs,endExs,rvrse,dChrPosdPopsAllls,ENSEMBLT) | |
338 lchrStartexEndposAll.extend(lchrStartexEndpos) | |
339 if dPopsFasta:#to correct a bug of the input table, in cases in which endExons<startExn (!). See ENSCAFT00000000145 (MC4R) in canFam2 for example. | |
340 dENSEMBLTPopsFasta[ENSEMBLT]=dPopsFasta | |
341 return dENSEMBLTPopsFasta,lchrStartexEndposAll | |
342 | |
343 | |
344 | |
345 def rtrnPhy(dPopsFasta,ENSEMBLT): | |
346 """ | |
347 """ | |
348 dPopsFormPhy={} | |
349 for eachPop in dPopsFasta: | |
350 hader='%s'%eachPop | |
351 #~ hader='>%s'%eachPop | |
352 seq=dPopsFasta[eachPop] | |
353 formtd='\t'.join([hader,seq]) | |
354 #~ formtd='\n'.join([hader,seq]) | |
355 dPopsFormPhy[eachPop]=formtd | |
356 #~ | |
357 return dPopsFormPhy,len(seq) | |
358 | |
359 def wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst): | |
360 """ | |
361 """ | |
362 ENSEMBLTKaKs=[] | |
363 nonHeader=True | |
364 cnt=0 | |
365 lENSEMBLT=len(dENSEMBLTPopsFasta) | |
366 #~ | |
367 for ENSEMBLT in sorted(dENSEMBLTPopsFasta.keys()): | |
368 cnt+=1 | |
369 dPopsFasta=dENSEMBLTPopsFasta[ENSEMBLT] | |
370 dPopsFormPhy,lenseq=rtrnPhy(dPopsFasta,ENSEMBLT) | |
371 #~ | |
372 seqPMLformat=['%s %s'%(len(dPopsFormPhy),lenseq)]#generate new PHYML sequence | |
373 #~ seqPMLformat=[]#generate new PHYML sequence | |
374 for namex in sorted(sPopsIntrst): | |
375 seqPMLformat.append(dPopsFormPhy[namex]) | |
376 #~ | |
377 mkdir_p(outFastaFold) | |
378 outFastaf=os.path.join(outFastaFold,'%s.phy'%ENSEMBLT) | |
379 outFastaf=open(outFastaf,'w') | |
380 outFastaf.write('\n'.join(seqPMLformat)) | |
381 outFastaf.close() | |
382 #~ | |
383 return 0 | |
384 | |
385 def main(): | |
386 #~ | |
387 #~bpython mkPhyl.py --input=colugo_mt_Galaxy_genotypes.txt --chrClmn=0 --posClmn=1 --refClmn=2 --altrClmn=3 --output=out.d --gd_indivs=genotypes.gd_indivs --inputCover=colugo_mt_Galaxy_coverage.txt --gd_indivs_cover=coverage.gd_indivs --cvrgTreshold=0 --chrClmnCvrg=0 --posClmnCvrg=1 --refClmnCvrg=2 --altrClmnCvrg=3 --indvlsPrctTrshld=0 | |
388 parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).') | |
389 parser.add_argument('--input',metavar='input gd_snp file',type=str,help='the input file with the table in gd_snp/gd_genotype format.',required=True) | |
390 parser.add_argument('--chrClmn',metavar='int',type=int,help='the column with the chromosome.',required=True) | |
391 parser.add_argument('--posClmn',metavar='int',type=int,help='the column with the SNPs position.',required=True) | |
392 parser.add_argument('--refClmn',metavar='int',type=int,help='the column with the reference nucleotide.',required=True) | |
393 parser.add_argument('--altrClmn',metavar='int',type=int,help='the column with the derived nucleotide.',required=True) | |
394 parser.add_argument('--output',metavar='output',type=str,help='the output',required=True) | |
395 parser.add_argument('--output_id',metavar='int',type=int,help='the output id',required=True) | |
396 parser.add_argument('--output_dir',metavar='output folder sequences',type=str,help='the output folder with the sequences.',required=True) | |
397 parser.add_argument('--gd_indivs',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input file.',required=True) | |
398 #~ | |
399 parser.add_argument('--inputCover',metavar='input gd_snp cover file',type=str,help='the input file with the table in gd_snp/gd_genotype cover format.',required=False,default=False) | |
400 parser.add_argument('--gd_indivs_cover',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input cover file.',required=False,default=False) | |
401 parser.add_argument('--cvrgTreshold',metavar='input coverage threshold',type=int,help='the coverage threshold above which nucleotides are included, else "N".',required=False,default=False) | |
402 parser.add_argument('--chrClmnCvrg',metavar='int',type=int,help='the column with the chromosome in the input coverage file.',required=False,default=False) | |
403 parser.add_argument('--posClmnCvrg',metavar='int',type=int,help='the column with the SNPs position in the input coverage file.',required=False,default=False) | |
404 parser.add_argument('--refClmnCvrg',metavar='int',type=int,help='the column with the reference nucleotide in the input coverage file.',required=False,default=False) | |
405 parser.add_argument('--altrClmnCvrg',metavar='int',type=int,help='the column with the derived nucleotide in the input coverage file.',required=False,default=False) | |
406 parser.add_argument('--indvlsPrctTrshld',metavar='int',type=float,help='the percentage of individual above which nucleotides are included, else "N".',required=False,default=False) | |
407 #~ | |
408 parser.add_argument('--sequence',metavar='input fasta file',type=str,help='the input file with the sequence whose SNPs are in the input.',required=False,default=False) | |
409 parser.add_argument('--gene_info',metavar='input interval file',type=str,help='the input interval file with the the information on the genes.',required=False,default=False) | |
410 parser.add_argument('--fchrClmn',metavar='int',type=int,help='the column with the chromosome in the gene_info file.',required=False,default=False) | |
411 parser.add_argument('--txStartClmn',metavar='int',type=int,help='the column with the transcript start column in the gene_info file.',required=False,default=False) | |
412 parser.add_argument('--txEndClmn',metavar='int',type=int,help='the column with the transcript end column in the gene_info file.',required=False,default=False) | |
413 parser.add_argument('--strandClmn',metavar='int',type=int,help='the column with the strand column in the gene_info file.',required=False,default=False) | |
414 parser.add_argument('--geneNameClmn',metavar='int',type=int,help='the column with the gene name column in the gene_info file.',required=False,default=False) | |
415 parser.add_argument('--cdsStartClmn',metavar='int',type=int,help='the column with the coding start column in the gene_info file.',required=False,default=False) | |
416 parser.add_argument('--cdsEndClmn',metavar='int',type=int,help='the column with the coding end column in the gene_info file.',required=False,default=False) | |
417 parser.add_argument('--startExsClmn',metavar='int',type=int,help='the column with the exon start positions column in the gene_info file.',required=False,default=False) | |
418 parser.add_argument('--endExsClmn',metavar='int',type=int,help='the column with the exon end positions column in the gene_info file.',required=False,default=False) | |
419 | |
420 args = parser.parse_args() | |
421 | |
422 inSNPf = args.input | |
423 outfile = args.output | |
424 outfile_id = args.output_id | |
425 outFastaFold = './out' | |
426 files_dir = args.output_dir | |
427 gd_indivs = args.gd_indivs | |
428 pxchrx = args.chrClmn | |
429 pxpos = args.posClmn | |
430 pxntA = args.refClmn | |
431 pxntB = args.altrClmn | |
432 | |
433 | |
434 inCDSfile = args.sequence | |
435 inUCSCfile = args.gene_info | |
436 fchrClmn = args.fchrClmn#chromosome column | |
437 txStartClmn = args.txStartClmn#transcript start column | |
438 txEndClmn = args.txEndClmn#transcript end column | |
439 strandClmn = args.strandClmn#strand column | |
440 geneNameClmn = args.geneNameClmn#gene name column | |
441 cdsStartClmn = args.cdsStartClmn#coding sequence start column | |
442 cdsEndClmn = args.cdsEndClmn#coding sequence end column | |
443 startExsClmn = args.startExsClmn#exons start column | |
444 endExsClmn = args.endExsClmn#exons end column | |
445 | |
446 inputCover = args.inputCover | |
447 gd_indivs_cover = args.gd_indivs_cover | |
448 cvrgTreshold = args.cvrgTreshold | |
449 pxchrxCov = args.chrClmnCvrg | |
450 pxposCov = args.posClmnCvrg | |
451 pxntACov = args.refClmnCvrg | |
452 pxntBCov = args.altrClmnCvrg | |
453 indvlsPrctTrshld = args.indvlsPrctTrshld | |
454 | |
455 #print inputCover, gd_indivs_cover, cvrgTreshold | |
456 | |
457 assert ((inputCover and gd_indivs_cover and cvrgTreshold>=0 and indvlsPrctTrshld>=0) or (inCDSfile and inUCSCfile)) | |
458 | |
459 #~ | |
460 dPopsinSNPfPos=dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs).read().splitlines() if x.strip()]) | |
461 #~ dPopsinSNPfPos.update({'ref':False}) | |
462 #~ | |
463 sPopsIntrst=set(dPopsinSNPfPos.keys()) | |
464 dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inSNPf,dPopsinSNPfPos,pxchrx,pxpos,pxntA,pxntB)#~ print '1. Getting fixed alleles information...' | |
465 #~ dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT=rtrndENSEMBLTseq(inCDSfile,inUCSCfile) | |
466 #~ | |
467 if inputCover and gd_indivs_cover and cvrgTreshold>=0: | |
468 dPopsinSNPfPos_cover=dict([(eachPop,False) for eachPop in dPopsinSNPfPos.keys()]) | |
469 dPopsinSNPfPos_cover.update(dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs_cover).read().splitlines() if x.strip()])) | |
470 dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inputCover,dPopsinSNPfPos_cover,pxchrxCov,pxposCov,pxntACov,pxntBCov,dChrdPosdPopsAlllsInit,cvrgTreshold,indvlsPrctTrshld) | |
471 rvrse=False | |
472 dENSEMBLTseqChrStEnEx={'tmp':(seqref,chrx,startExs,endExs,rvrse)} | |
473 dChrdStrtEndENSEMBLT={chrx:{(startExs[0],endExs[0]):'tmp'}} | |
474 #~ | |
475 elif inCDSfile and inUCSCfile: | |
476 dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT=rtrndENSEMBLTseq(inCDSfile,inUCSCfile,fchrClmn,txStartClmn,txEndClmn,strandClmn,geneNameClmn,startExsClmn,endExsClmn,cdsStartClmn,cdsEndClmn)#~ print '2. Getting transcripts and exons information...' | |
477 #~ | |
478 dENSEMBLTChrPosdAlls,dChrPosdPopsAllls=rtrnFxdChrPosinCodReg(dChrdStrtEndENSEMBLT,dChrdPosdPopsAlllsInit)#~ print '3. Getting fixed alleles in exons...' | |
479 #~ | |
480 dENSEMBLTPopsFasta,lchrStartexEndposAll=rtrnSeqVars(dENSEMBLTseqChrStEnEx,dENSEMBLTChrPosdAlls)#~ print '4. Getting fasta sequences of populations...' | |
481 #~ | |
482 wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst) | |
483 #~ | |
484 | |
485 | |
486 ## get a lit of output files | |
487 files = [] | |
488 for dirpath, dirnames, filenames in os.walk(outFastaFold): | |
489 for file in filenames: | |
490 if file.endswith('.phy'): | |
491 files.append( os.path.join(dirpath, file) ) | |
492 del dirnames[:] | |
493 | |
494 if len(files) == 0: | |
495 with open(outfile, 'w') as ofh: | |
496 print >> ofh, 'No output.' | |
497 else: | |
498 ## the first file becomes the output | |
499 file = files.pop(0) | |
500 shutil.move(file, outfile) | |
501 | |
502 ## rename/move the rest of the files | |
503 for i, file in enumerate(files): | |
504 new_filename = 'primary_{0}_output{1}_visible_txt_?'.format(outfile_id, i+2) | |
505 new_pathname = os.path.join(files_dir, new_filename) | |
506 shutil.move(file, new_pathname) | |
507 | |
508 return 0 | |
509 | |
510 if __name__ == '__main__': | |
511 main() |