Mercurial > repos > miller-lab > genome_diversity
comparison make_phylip.py @ 35:ea52b23f1141
Bug fixes for Draw variants, Phylip, and gd_d_tools
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Wed, 20 Nov 2013 13:46:10 -0500 |
parents | a631c2f6d913 |
children | 51cd0307fb70 |
comparison
equal
deleted
inserted
replaced
34:f739a296a339 | 35:ea52b23f1141 |
---|---|
22 | 22 |
23 import argparse | 23 import argparse |
24 import errno | 24 import errno |
25 import os | 25 import os |
26 import shutil | 26 import shutil |
27 from Population import Population | |
27 | 28 |
28 def mkdir_p(path): | 29 def mkdir_p(path): |
29 try: | 30 try: |
30 os.makedirs(path) | 31 os.makedirs(path) |
31 except OSError, e: | 32 except OSError, e: |
380 outFastaf.write('\n'.join(seqPMLformat)) | 381 outFastaf.write('\n'.join(seqPMLformat)) |
381 outFastaf.close() | 382 outFastaf.close() |
382 #~ | 383 #~ |
383 return 0 | 384 return 0 |
384 | 385 |
386 def pos_dict(gd_indivs_file, input_type): | |
387 rv = {} | |
388 | |
389 p = Population() | |
390 p.from_population_file(gd_indivs_file) | |
391 | |
392 for tag in p.tag_list(): | |
393 column, name = tag.split(':') | |
394 column = int(column) - 1 | |
395 | |
396 if input_type == 'gd_genotype': | |
397 column -= 2 | |
398 | |
399 rv[name] = column | |
400 | |
401 return rv | |
402 | |
385 def main(): | 403 def main(): |
386 #~ | 404 #~ |
387 #~bpython mkPhyl.py --input=colugo_mt_Galaxy_genotypes.txt --chrClmn=0 --posClmn=1 --refClmn=2 --altrClmn=3 --output=out.d --gd_indivs=genotypes.gd_indivs --inputCover=colugo_mt_Galaxy_coverage.txt --gd_indivs_cover=coverage.gd_indivs --cvrgTreshold=0 --chrClmnCvrg=0 --posClmnCvrg=1 --refClmnCvrg=2 --altrClmnCvrg=3 --indvlsPrctTrshld=0 | 405 #~bpython mkPhyl.py --input=colugo_mt_Galaxy_genotypes.txt --chrClmn=0 --posClmn=1 --refClmn=2 --altrClmn=3 --output=out.d --gd_indivs=genotypes.gd_indivs --inputCover=colugo_mt_Galaxy_coverage.txt --gd_indivs_cover=coverage.gd_indivs --cvrgTreshold=0 --chrClmnCvrg=0 --posClmnCvrg=1 --refClmnCvrg=2 --altrClmnCvrg=3 --indvlsPrctTrshld=0 |
388 parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).') | 406 parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).') |
389 parser.add_argument('--input',metavar='input gd_snp file',type=str,help='the input file with the table in gd_snp/gd_genotype format.',required=True) | 407 parser.add_argument('--input',metavar='input gd_snp file',type=str,help='the input file with the table in gd_snp/gd_genotype format.',required=True) |
408 parser.add_argument('--input_type',metavar='input type',type=str,help='the input file type (gd_snp or gd_genotype)',required=True) | |
390 parser.add_argument('--chrClmn',metavar='int',type=int,help='the column with the chromosome.',required=True) | 409 parser.add_argument('--chrClmn',metavar='int',type=int,help='the column with the chromosome.',required=True) |
391 parser.add_argument('--posClmn',metavar='int',type=int,help='the column with the SNPs position.',required=True) | 410 parser.add_argument('--posClmn',metavar='int',type=int,help='the column with the SNPs position.',required=True) |
392 parser.add_argument('--refClmn',metavar='int',type=int,help='the column with the reference nucleotide.',required=True) | 411 parser.add_argument('--refClmn',metavar='int',type=int,help='the column with the reference nucleotide.',required=True) |
393 parser.add_argument('--altrClmn',metavar='int',type=int,help='the column with the derived nucleotide.',required=True) | 412 parser.add_argument('--altrClmn',metavar='int',type=int,help='the column with the derived nucleotide.',required=True) |
394 parser.add_argument('--output',metavar='output',type=str,help='the output',required=True) | 413 parser.add_argument('--output',metavar='output',type=str,help='the output',required=True) |
395 parser.add_argument('--output_id',metavar='int',type=int,help='the output id',required=True) | 414 parser.add_argument('--output_id',metavar='int',type=int,help='the output id',required=True) |
396 parser.add_argument('--output_dir',metavar='output folder sequences',type=str,help='the output folder with the sequences.',required=True) | 415 parser.add_argument('--output_dir',metavar='output folder sequences',type=str,help='the output folder with the sequences.',required=True) |
397 parser.add_argument('--gd_indivs',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input file.',required=True) | 416 parser.add_argument('--gd_indivs',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input file.',required=True) |
398 #~ | 417 #~ |
399 parser.add_argument('--inputCover',metavar='input gd_snp cover file',type=str,help='the input file with the table in gd_snp/gd_genotype cover format.',required=False,default=False) | 418 parser.add_argument('--inputCover',metavar='input gd_snp cover file',type=str,help='the input file with the table in gd_snp/gd_genotype cover format.',required=False,default=False) |
419 parser.add_argument('--inputCover_type',metavar='input cover type',type=str,help='the cover input file type (gd_snp or gd_genotype)',required=False,default=False) | |
400 parser.add_argument('--gd_indivs_cover',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input cover file.',required=False,default=False) | 420 parser.add_argument('--gd_indivs_cover',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input cover file.',required=False,default=False) |
401 parser.add_argument('--cvrgTreshold',metavar='input coverage threshold',type=int,help='the coverage threshold above which nucleotides are included, else "N".',required=False,default=False) | 421 parser.add_argument('--cvrgTreshold',metavar='input coverage threshold',type=int,help='the coverage threshold above which nucleotides are included, else "N".',required=False,default=False) |
402 parser.add_argument('--chrClmnCvrg',metavar='int',type=int,help='the column with the chromosome in the input coverage file.',required=False,default=False) | 422 parser.add_argument('--chrClmnCvrg',metavar='int',type=int,help='the column with the chromosome in the input coverage file.',required=False,default=False) |
403 parser.add_argument('--posClmnCvrg',metavar='int',type=int,help='the column with the SNPs position in the input coverage file.',required=False,default=False) | 423 parser.add_argument('--posClmnCvrg',metavar='int',type=int,help='the column with the SNPs position in the input coverage file.',required=False,default=False) |
404 parser.add_argument('--refClmnCvrg',metavar='int',type=int,help='the column with the reference nucleotide in the input coverage file.',required=False,default=False) | 424 parser.add_argument('--refClmnCvrg',metavar='int',type=int,help='the column with the reference nucleotide in the input coverage file.',required=False,default=False) |
414 parser.add_argument('--geneNameClmn',metavar='int',type=int,help='the column with the gene name column in the gene_info file.',required=False,default=False) | 434 parser.add_argument('--geneNameClmn',metavar='int',type=int,help='the column with the gene name column in the gene_info file.',required=False,default=False) |
415 parser.add_argument('--cdsStartClmn',metavar='int',type=int,help='the column with the coding start column in the gene_info file.',required=False,default=False) | 435 parser.add_argument('--cdsStartClmn',metavar='int',type=int,help='the column with the coding start column in the gene_info file.',required=False,default=False) |
416 parser.add_argument('--cdsEndClmn',metavar='int',type=int,help='the column with the coding end column in the gene_info file.',required=False,default=False) | 436 parser.add_argument('--cdsEndClmn',metavar='int',type=int,help='the column with the coding end column in the gene_info file.',required=False,default=False) |
417 parser.add_argument('--startExsClmn',metavar='int',type=int,help='the column with the exon start positions column in the gene_info file.',required=False,default=False) | 437 parser.add_argument('--startExsClmn',metavar='int',type=int,help='the column with the exon start positions column in the gene_info file.',required=False,default=False) |
418 parser.add_argument('--endExsClmn',metavar='int',type=int,help='the column with the exon end positions column in the gene_info file.',required=False,default=False) | 438 parser.add_argument('--endExsClmn',metavar='int',type=int,help='the column with the exon end positions column in the gene_info file.',required=False,default=False) |
419 | 439 |
420 args = parser.parse_args() | 440 args = parser.parse_args() |
421 | 441 |
422 inSNPf = args.input | 442 inSNPf = args.input |
443 inSNPf_type = args.input_type | |
423 outfile = args.output | 444 outfile = args.output |
424 outfile_id = args.output_id | 445 outfile_id = args.output_id |
425 outFastaFold = './out' | 446 outFastaFold = './out' |
426 files_dir = args.output_dir | 447 files_dir = args.output_dir |
427 gd_indivs = args.gd_indivs | 448 gd_indivs = args.gd_indivs |
442 cdsEndClmn = args.cdsEndClmn#coding sequence end column | 463 cdsEndClmn = args.cdsEndClmn#coding sequence end column |
443 startExsClmn = args.startExsClmn#exons start column | 464 startExsClmn = args.startExsClmn#exons start column |
444 endExsClmn = args.endExsClmn#exons end column | 465 endExsClmn = args.endExsClmn#exons end column |
445 | 466 |
446 inputCover = args.inputCover | 467 inputCover = args.inputCover |
468 inputCover_type = args.inputCover_type | |
447 gd_indivs_cover = args.gd_indivs_cover | 469 gd_indivs_cover = args.gd_indivs_cover |
448 cvrgTreshold = args.cvrgTreshold | 470 cvrgTreshold = args.cvrgTreshold |
449 pxchrxCov = args.chrClmnCvrg | 471 pxchrxCov = args.chrClmnCvrg |
450 pxposCov = args.posClmnCvrg | 472 pxposCov = args.posClmnCvrg |
451 pxntACov = args.refClmnCvrg | 473 pxntACov = args.refClmnCvrg |
452 pxntBCov = args.altrClmnCvrg | 474 pxntBCov = args.altrClmnCvrg |
453 indvlsPrctTrshld = args.indvlsPrctTrshld | 475 indvlsPrctTrshld = args.indvlsPrctTrshld |
454 | 476 |
477 | |
455 #print inputCover, gd_indivs_cover, cvrgTreshold | 478 #print inputCover, gd_indivs_cover, cvrgTreshold |
456 | 479 |
457 assert ((inputCover and gd_indivs_cover and cvrgTreshold>=0 and indvlsPrctTrshld>=0) or (inCDSfile and inUCSCfile)) | 480 assert ((inputCover and gd_indivs_cover and cvrgTreshold>=0 and indvlsPrctTrshld>=0) or (inCDSfile and inUCSCfile)) |
458 | 481 |
459 #~ | 482 dPopsinSNPfPos = pos_dict(gd_indivs, inSNPf_type) |
460 dPopsinSNPfPos=dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs).read().splitlines() if x.strip()]) | |
461 #~ dPopsinSNPfPos.update({'ref':False}) | 483 #~ dPopsinSNPfPos.update({'ref':False}) |
462 #~ | 484 #~ |
463 sPopsIntrst=set(dPopsinSNPfPos.keys()) | 485 sPopsIntrst=set(dPopsinSNPfPos.keys()) |
464 dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inSNPf,dPopsinSNPfPos,pxchrx,pxpos,pxntA,pxntB)#~ print '1. Getting fixed alleles information...' | 486 dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inSNPf,dPopsinSNPfPos,pxchrx,pxpos,pxntA,pxntB)#~ print '1. Getting fixed alleles information...' |
465 #~ dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT=rtrndENSEMBLTseq(inCDSfile,inUCSCfile) | 487 #~ dENSEMBLTseqChrStEnEx,dChrdStrtEndENSEMBLT=rtrndENSEMBLTseq(inCDSfile,inUCSCfile) |
466 #~ | 488 #~ |
467 if inputCover and gd_indivs_cover and cvrgTreshold>=0: | 489 if inputCover and gd_indivs_cover and cvrgTreshold>=0: |
468 dPopsinSNPfPos_cover=dict([(eachPop,False) for eachPop in dPopsinSNPfPos.keys()]) | 490 dPopsinSNPfPos_cover=dict([(eachPop,False) for eachPop in dPopsinSNPfPos.keys()]) |
469 dPopsinSNPfPos_cover.update(dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs_cover).read().splitlines() if x.strip()])) | 491 dPopsinSNPfPos_cover.update(pos_dict(gd_indivs_cover, inputCover_type)) |
470 dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inputCover,dPopsinSNPfPos_cover,pxchrxCov,pxposCov,pxntACov,pxntBCov,dChrdPosdPopsAlllsInit,cvrgTreshold,indvlsPrctTrshld) | 492 dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inputCover,dPopsinSNPfPos_cover,pxchrxCov,pxposCov,pxntACov,pxntBCov,dChrdPosdPopsAlllsInit,cvrgTreshold,indvlsPrctTrshld) |
471 rvrse=False | 493 rvrse=False |
472 dENSEMBLTseqChrStEnEx={'tmp':(seqref,chrx,startExs,endExs,rvrse)} | 494 dENSEMBLTseqChrStEnEx={'tmp':(seqref,chrx,startExs,endExs,rvrse)} |
473 dChrdStrtEndENSEMBLT={chrx:{(startExs[0],endExs[0]):'tmp'}} | 495 dChrdStrtEndENSEMBLT={chrx:{(startExs[0],endExs[0]):'tmp'}} |
474 #~ | 496 #~ |
480 dENSEMBLTPopsFasta,lchrStartexEndposAll=rtrnSeqVars(dENSEMBLTseqChrStEnEx,dENSEMBLTChrPosdAlls)#~ print '4. Getting fasta sequences of populations...' | 502 dENSEMBLTPopsFasta,lchrStartexEndposAll=rtrnSeqVars(dENSEMBLTseqChrStEnEx,dENSEMBLTChrPosdAlls)#~ print '4. Getting fasta sequences of populations...' |
481 #~ | 503 #~ |
482 wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst) | 504 wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst) |
483 #~ | 505 #~ |
484 | 506 |
485 | 507 ## get a list of output files |
486 ## get a lit of output files | |
487 files = [] | 508 files = [] |
488 for dirpath, dirnames, filenames in os.walk(outFastaFold): | 509 for dirpath, dirnames, filenames in os.walk(outFastaFold): |
489 for file in filenames: | 510 for file in filenames: |
490 if file.endswith('.phy'): | 511 if file.endswith('.phy'): |
491 files.append( os.path.join(dirpath, file) ) | 512 files.append( os.path.join(dirpath, file) ) |
498 ## the first file becomes the output | 519 ## the first file becomes the output |
499 file = files.pop(0) | 520 file = files.pop(0) |
500 shutil.move(file, outfile) | 521 shutil.move(file, outfile) |
501 | 522 |
502 ## rename/move the rest of the files | 523 ## rename/move the rest of the files |
503 for i, file in enumerate(files): | 524 for file in files: |
504 new_filename = 'primary_{0}_output{1}_visible_txt_?'.format(outfile_id, i+2) | 525 name = os.path.basename(file)[:-4] |
526 name = name.replace('_', '-') | |
527 new_filename = 'primary_{0}_{1}_visible_txt_?'.format(outfile_id, name) | |
505 new_pathname = os.path.join(files_dir, new_filename) | 528 new_pathname = os.path.join(files_dir, new_filename) |
506 shutil.move(file, new_pathname) | 529 shutil.move(file, new_pathname) |
507 | 530 |
508 return 0 | 531 return 0 |
509 | 532 |