Mercurial > repos > miller-lab > genome_diversity
diff make_phylip.py @ 35:ea52b23f1141
Bug fixes for Draw variants, Phylip, and gd_d_tools
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Wed, 20 Nov 2013 13:46:10 -0500 |
parents | a631c2f6d913 |
children | 51cd0307fb70 |
line wrap: on
line diff
--- a/make_phylip.py Mon Sep 23 13:37:19 2013 -0400 +++ b/make_phylip.py Wed Nov 20 13:46:10 2013 -0500 @@ -24,6 +24,7 @@ import errno import os import shutil +from Population import Population def mkdir_p(path): try: @@ -382,11 +383,29 @@ #~ return 0 +def pos_dict(gd_indivs_file, input_type): + rv = {} + + p = Population() + p.from_population_file(gd_indivs_file) + + for tag in p.tag_list(): + column, name = tag.split(':') + column = int(column) - 1 + + if input_type == 'gd_genotype': + column -= 2 + + rv[name] = column + + return rv + def main(): #~ #~bpython mkPhyl.py --input=colugo_mt_Galaxy_genotypes.txt --chrClmn=0 --posClmn=1 --refClmn=2 --altrClmn=3 --output=out.d --gd_indivs=genotypes.gd_indivs --inputCover=colugo_mt_Galaxy_coverage.txt --gd_indivs_cover=coverage.gd_indivs --cvrgTreshold=0 --chrClmnCvrg=0 --posClmnCvrg=1 --refClmnCvrg=2 --altrClmnCvrg=3 --indvlsPrctTrshld=0 parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).') parser.add_argument('--input',metavar='input gd_snp file',type=str,help='the input file with the table in gd_snp/gd_genotype format.',required=True) + parser.add_argument('--input_type',metavar='input type',type=str,help='the input file type (gd_snp or gd_genotype)',required=True) parser.add_argument('--chrClmn',metavar='int',type=int,help='the column with the chromosome.',required=True) parser.add_argument('--posClmn',metavar='int',type=int,help='the column with the SNPs position.',required=True) parser.add_argument('--refClmn',metavar='int',type=int,help='the column with the reference nucleotide.',required=True) @@ -397,6 +416,7 @@ parser.add_argument('--gd_indivs',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input file.',required=True) #~ parser.add_argument('--inputCover',metavar='input gd_snp cover file',type=str,help='the input file with the table in gd_snp/gd_genotype cover format.',required=False,default=False) + parser.add_argument('--inputCover_type',metavar='input cover type',type=str,help='the cover input file type (gd_snp or gd_genotype)',required=False,default=False) parser.add_argument('--gd_indivs_cover',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input cover file.',required=False,default=False) parser.add_argument('--cvrgTreshold',metavar='input coverage threshold',type=int,help='the coverage threshold above which nucleotides are included, else "N".',required=False,default=False) parser.add_argument('--chrClmnCvrg',metavar='int',type=int,help='the column with the chromosome in the input coverage file.',required=False,default=False) @@ -416,10 +436,11 @@ parser.add_argument('--cdsEndClmn',metavar='int',type=int,help='the column with the coding end column in the gene_info file.',required=False,default=False) parser.add_argument('--startExsClmn',metavar='int',type=int,help='the column with the exon start positions column in the gene_info file.',required=False,default=False) parser.add_argument('--endExsClmn',metavar='int',type=int,help='the column with the exon end positions column in the gene_info file.',required=False,default=False) - + args = parser.parse_args() inSNPf = args.input + inSNPf_type = args.input_type outfile = args.output outfile_id = args.output_id outFastaFold = './out' @@ -444,6 +465,7 @@ endExsClmn = args.endExsClmn#exons end column inputCover = args.inputCover + inputCover_type = args.inputCover_type gd_indivs_cover = args.gd_indivs_cover cvrgTreshold = args.cvrgTreshold pxchrxCov = args.chrClmnCvrg @@ -451,13 +473,13 @@ pxntACov = args.refClmnCvrg pxntBCov = args.altrClmnCvrg indvlsPrctTrshld = args.indvlsPrctTrshld - + + #print inputCover, gd_indivs_cover, cvrgTreshold assert ((inputCover and gd_indivs_cover and cvrgTreshold>=0 and indvlsPrctTrshld>=0) or (inCDSfile and inUCSCfile)) - - #~ - dPopsinSNPfPos=dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs).read().splitlines() if x.strip()]) + + dPopsinSNPfPos = pos_dict(gd_indivs, inSNPf_type) #~ dPopsinSNPfPos.update({'ref':False}) #~ sPopsIntrst=set(dPopsinSNPfPos.keys()) @@ -466,7 +488,7 @@ #~ if inputCover and gd_indivs_cover and cvrgTreshold>=0: dPopsinSNPfPos_cover=dict([(eachPop,False) for eachPop in dPopsinSNPfPos.keys()]) - dPopsinSNPfPos_cover.update(dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs_cover).read().splitlines() if x.strip()])) + dPopsinSNPfPos_cover.update(pos_dict(gd_indivs_cover, inputCover_type)) dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inputCover,dPopsinSNPfPos_cover,pxchrxCov,pxposCov,pxntACov,pxntBCov,dChrdPosdPopsAlllsInit,cvrgTreshold,indvlsPrctTrshld) rvrse=False dENSEMBLTseqChrStEnEx={'tmp':(seqref,chrx,startExs,endExs,rvrse)} @@ -482,8 +504,7 @@ wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst) #~ - - ## get a lit of output files + ## get a list of output files files = [] for dirpath, dirnames, filenames in os.walk(outFastaFold): for file in filenames: @@ -500,8 +521,10 @@ shutil.move(file, outfile) ## rename/move the rest of the files - for i, file in enumerate(files): - new_filename = 'primary_{0}_output{1}_visible_txt_?'.format(outfile_id, i+2) + for file in files: + name = os.path.basename(file)[:-4] + name = name.replace('_', '-') + new_filename = 'primary_{0}_{1}_visible_txt_?'.format(outfile_id, name) new_pathname = os.path.join(files_dir, new_filename) shutil.move(file, new_pathname)