Mercurial > repos > miller-lab > genome_diversity
changeset 35:ea52b23f1141
Bug fixes for Draw variants, Phylip, and gd_d_tools
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Wed, 20 Nov 2013 13:46:10 -0500 |
parents | f739a296a339 |
children | 51cd0307fb70 |
files | draw_variants.py make_phylip.py make_phylip.xml make_phylip_hooks.py reorder.py specify.py tool_dependencies.xml |
diffstat | 5 files changed, 95 insertions(+), 19 deletions(-) [+] |
line wrap: on
line diff
--- a/draw_variants.py Mon Sep 23 13:37:19 2013 -0400 +++ b/draw_variants.py Wed Nov 20 13:46:10 2013 -0500 @@ -105,8 +105,12 @@ args = [ prog ] args.append('-density') -args.append(100) +args.append(200) +args.append('-resize') +args.append('140%') args.append('Ji.svg') +args.append('-compress') +args.append('zip') args.append('tiff:{0}'.format(output)) gd_util.run_program(prog, args)
--- a/make_phylip.py Mon Sep 23 13:37:19 2013 -0400 +++ b/make_phylip.py Wed Nov 20 13:46:10 2013 -0500 @@ -24,6 +24,7 @@ import errno import os import shutil +from Population import Population def mkdir_p(path): try: @@ -382,11 +383,29 @@ #~ return 0 +def pos_dict(gd_indivs_file, input_type): + rv = {} + + p = Population() + p.from_population_file(gd_indivs_file) + + for tag in p.tag_list(): + column, name = tag.split(':') + column = int(column) - 1 + + if input_type == 'gd_genotype': + column -= 2 + + rv[name] = column + + return rv + def main(): #~ #~bpython mkPhyl.py --input=colugo_mt_Galaxy_genotypes.txt --chrClmn=0 --posClmn=1 --refClmn=2 --altrClmn=3 --output=out.d --gd_indivs=genotypes.gd_indivs --inputCover=colugo_mt_Galaxy_coverage.txt --gd_indivs_cover=coverage.gd_indivs --cvrgTreshold=0 --chrClmnCvrg=0 --posClmnCvrg=1 --refClmnCvrg=2 --altrClmnCvrg=3 --indvlsPrctTrshld=0 parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).') parser.add_argument('--input',metavar='input gd_snp file',type=str,help='the input file with the table in gd_snp/gd_genotype format.',required=True) + parser.add_argument('--input_type',metavar='input type',type=str,help='the input file type (gd_snp or gd_genotype)',required=True) parser.add_argument('--chrClmn',metavar='int',type=int,help='the column with the chromosome.',required=True) parser.add_argument('--posClmn',metavar='int',type=int,help='the column with the SNPs position.',required=True) parser.add_argument('--refClmn',metavar='int',type=int,help='the column with the reference nucleotide.',required=True) @@ -397,6 +416,7 @@ parser.add_argument('--gd_indivs',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input file.',required=True) #~ parser.add_argument('--inputCover',metavar='input gd_snp cover file',type=str,help='the input file with the table in gd_snp/gd_genotype cover format.',required=False,default=False) + parser.add_argument('--inputCover_type',metavar='input cover type',type=str,help='the cover input file type (gd_snp or gd_genotype)',required=False,default=False) parser.add_argument('--gd_indivs_cover',metavar='input gd_indivs file',type=str,help='the input reference species columns in the input cover file.',required=False,default=False) parser.add_argument('--cvrgTreshold',metavar='input coverage threshold',type=int,help='the coverage threshold above which nucleotides are included, else "N".',required=False,default=False) parser.add_argument('--chrClmnCvrg',metavar='int',type=int,help='the column with the chromosome in the input coverage file.',required=False,default=False) @@ -416,10 +436,11 @@ parser.add_argument('--cdsEndClmn',metavar='int',type=int,help='the column with the coding end column in the gene_info file.',required=False,default=False) parser.add_argument('--startExsClmn',metavar='int',type=int,help='the column with the exon start positions column in the gene_info file.',required=False,default=False) parser.add_argument('--endExsClmn',metavar='int',type=int,help='the column with the exon end positions column in the gene_info file.',required=False,default=False) - + args = parser.parse_args() inSNPf = args.input + inSNPf_type = args.input_type outfile = args.output outfile_id = args.output_id outFastaFold = './out' @@ -444,6 +465,7 @@ endExsClmn = args.endExsClmn#exons end column inputCover = args.inputCover + inputCover_type = args.inputCover_type gd_indivs_cover = args.gd_indivs_cover cvrgTreshold = args.cvrgTreshold pxchrxCov = args.chrClmnCvrg @@ -451,13 +473,13 @@ pxntACov = args.refClmnCvrg pxntBCov = args.altrClmnCvrg indvlsPrctTrshld = args.indvlsPrctTrshld - + + #print inputCover, gd_indivs_cover, cvrgTreshold assert ((inputCover and gd_indivs_cover and cvrgTreshold>=0 and indvlsPrctTrshld>=0) or (inCDSfile and inUCSCfile)) - - #~ - dPopsinSNPfPos=dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs).read().splitlines() if x.strip()]) + + dPopsinSNPfPos = pos_dict(gd_indivs, inSNPf_type) #~ dPopsinSNPfPos.update({'ref':False}) #~ sPopsIntrst=set(dPopsinSNPfPos.keys()) @@ -466,7 +488,7 @@ #~ if inputCover and gd_indivs_cover and cvrgTreshold>=0: dPopsinSNPfPos_cover=dict([(eachPop,False) for eachPop in dPopsinSNPfPos.keys()]) - dPopsinSNPfPos_cover.update(dict([(x.split()[1],int(x.split()[0])-1) for x in open(gd_indivs_cover).read().splitlines() if x.strip()])) + dPopsinSNPfPos_cover.update(pos_dict(gd_indivs_cover, inputCover_type)) dChrdPosdPopsAlllsInit,seqref,chrx,startExs,endExs=rtrnFxdChrPos(inputCover,dPopsinSNPfPos_cover,pxchrxCov,pxposCov,pxntACov,pxntBCov,dChrdPosdPopsAlllsInit,cvrgTreshold,indvlsPrctTrshld) rvrse=False dENSEMBLTseqChrStEnEx={'tmp':(seqref,chrx,startExs,endExs,rvrse)} @@ -482,8 +504,7 @@ wrapSeqsFasta(dENSEMBLTPopsFasta,outFastaFold,sPopsIntrst) #~ - - ## get a lit of output files + ## get a list of output files files = [] for dirpath, dirnames, filenames in os.walk(outFastaFold): for file in filenames: @@ -500,8 +521,10 @@ shutil.move(file, outfile) ## rename/move the rest of the files - for i, file in enumerate(files): - new_filename = 'primary_{0}_output{1}_visible_txt_?'.format(outfile_id, i+2) + for file in files: + name = os.path.basename(file)[:-4] + name = name.replace('_', '-') + new_filename = 'primary_{0}_{1}_visible_txt_?'.format(outfile_id, name) new_pathname = os.path.join(files_dir, new_filename) shutil.move(file, new_pathname)
--- a/make_phylip.xml Mon Sep 23 13:37:19 2013 -0400 +++ b/make_phylip.xml Wed Nov 20 13:46:10 2013 -0500 @@ -1,19 +1,19 @@ -<tool id="gd_make_phylip" name="Phylip" version="1.0.0" force_history_refresh="True"> +<tool id="gd_make_phylip" name="Phylip" version="1.1.0" force_history_refresh="True"> <description>: prepare data for phylogenetic analysis</description> <command interpreter="python"> #set $zero_based = 1 - #set $gen_chrClmn = int($input.metadata.scaffold) - $zero_based - #set $gen_posClmn = int($input.metadata.pos) - $zero_based + #set $gen_chrClmn = int($input.metadata.ref) - $zero_based + #set $gen_posClmn = int($input.metadata.rPos) - $zero_based #set $gen_refClmn = int($input.metadata.pos) - $zero_based + 1 #set $gen_altrClmn = int($input.metadata.pos) - $zero_based + 2 - make_phylip.py '--altrClmn=$gen_altrClmn' '--chrClmn=$gen_chrClmn' '--gd_indivs=$indivs_input' '--input=$input' '--output=$output1' '--output_id=$output1.id' '--output_dir=$__new_file_path__' '--posClmn=$gen_posClmn' '--refClmn=$gen_refClmn' + make_phylip.py '--altrClmn=$gen_altrClmn' '--chrClmn=$gen_chrClmn' '--gd_indivs=$indivs_input' '--input=$input' '--input_type=$input.ext' '--output=$output1' '--output_id=$output1.id' '--output_dir=$__new_file_path__' '--posClmn=$gen_posClmn' '--refClmn=$gen_refClmn' #if $input_type.choice == '0' - #set $cov_chrClmn = int($input_type.coverage_input.metadata.scaffold) - $zero_based - #set $cov_posClmn = int($input_type.coverage_input.metadata.pos) - $zero_based + #set $cov_chrClmn = int($input_type.coverage_input.metadata.ref) - $zero_based + #set $cov_posClmn = int($input_type.coverage_input.metadata.rPos) - $zero_based #set $cov_refClmn = int($input_type.coverage_input.metadata.pos) - $zero_based + 1 #set $cov_altrClmn = int($input_type.coverage_input.metadata.pos) - $zero_based + 2 - '--altrClmnCvrg=$cov_altrClmn' '--chrClmnCvrg=$cov_chrClmn' '--cvrgTreshold=$input_type.coverage_threshold' '--gd_indivs_cover=$indivs_input' '--indvlsPrctTrshld=$input_type.indivs_threshold' '--inputCover=$input_type.coverage_input' '--posClmnCvrg=$cov_posClmn' '--refClmnCvrg=$cov_refClmn' + '--altrClmnCvrg=$cov_altrClmn' '--chrClmnCvrg=$cov_chrClmn' '--cvrgTreshold=$input_type.coverage_threshold' '--gd_indivs_cover=$indivs_input' '--indvlsPrctTrshld=$input_type.indivs_threshold' '--inputCover=$input_type.coverage_input' '--inputCover_type=$input_type.coverage_input.ext' '--posClmnCvrg=$cov_posClmn' '--refClmnCvrg=$cov_refClmn' #else if $input_type.choice == '1' #set $fchrClmn = int($input_type.annotation_input.metadata.chromCol) - $zero_based #set $strandClmn = int($input_type.annotation_input.metadata.strandCol) - $zero_based @@ -28,6 +28,8 @@ #end if </command> + <code file="make_phylip_hooks.py" /> + <inputs> <param name="input" type="data" format="gd_genotype,gd_snp" label="Genotype/SNP dataset"> <validator type="metadata" check="scaffold" message="scaffold missing" />
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/make_phylip_hooks.py Wed Nov 20 13:46:10 2013 -0500 @@ -0,0 +1,47 @@ +#!/usr/bin/env python + +def exec_before_job(app, inp_data=None, out_data=None, tool=None, param_dict=None): + pass + +def exec_after_process(app, inp_data=None, out_data=None, param_dict=None, tool=None, stdout=None, stderr=None): + output_name = 'output1' + + ## check for output + try: + first_output = out_data[output_name] + except: + return + + ## check for collected datasets + try: + collected_dict = param_dict['__collected_datasets__']['primary'][output_name] + except: + return + + if len(collected_dict.keys()) == 0: + return + + ## check for fasta file + try: + fasta_file = inp_data['fasta_input'] + except: + return + + ## find missing fasta header + first_output_name = None + with open(fasta_file.get_file_name()) as fh: + for line in fh: + if line[0] != '>': + continue + name = line[1:] + name = name.strip() + name = name.split()[0] + name = name.replace('_', '-') + if name not in collected_dict: + first_output_name = name + break + + ## fix name + if first_output_name is not None: + first_output.name = '%s (%s)' % (first_output.name, first_output_name) +
--- a/tool_dependencies.xml Mon Sep 23 13:37:19 2013 -0400 +++ b/tool_dependencies.xml Wed Nov 20 13:46:10 2013 -0500 @@ -12,7 +12,7 @@ <repository prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu/" owner="miller-lab" name="package_fisher_0_1_4" changeset_revision="c84c287b81a4" /> </package> <package name="gd_c_tools" version="0.1"> - <repository prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu/" owner="miller-lab" name="package_gd_c_tools_0_1" changeset_revision="7361ee4b5f40" /> + <repository prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu/" owner="miller-lab" name="package_gd_c_tools_0_1" changeset_revision="989859342136" /> </package> <package name="matplotlib" version="1.2.1"> <repository prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu/" owner="iuc" name="package_matplotlib_1_2" changeset_revision="9d164359606b" />