diff gd_snp2vcf.xml @ 31:a631c2f6d913

Update to Miller Lab devshed revision 3c4110ffacc3
author Richard Burhans <burhans@bx.psu.edu>
date Fri, 20 Sep 2013 13:25:27 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gd_snp2vcf.xml	Fri Sep 20 13:25:27 2013 -0400
@@ -0,0 +1,155 @@
+<tool id="gd_gd_snp2vcf" name="gd_snp to VCF" version="1.1.0" force_history_refresh="True">
+  <description>: Convert from gd_snp or gd_genotype to VCF format, for submission to dbSNP</description>
+
+  <command interpreter="perl">
+    gd_snp2vcf.pl "$input" -handle=$hand -batch=$batch -ref=$ref -metaOut=$output2
+    #if $individuals.choice == '0':
+      #set $geno = ''
+      #for $individual_col in $input.dataset.metadata.individual_columns
+        ##need to check to number of cols per individual
+        #if $input.ext == "gd_snp":
+           #set $t = $individual_col + 2
+        #else if $input.ext == "gd_genotype":
+           #set $t = $individual_col
+        #else:
+           #set $t = $individual_col
+        #end if
+        #set $geno += "%d," % ($t)
+      #end for
+      #if $individuals.pall_id != '':
+        -population=$individuals.pall_id
+      #end if
+    #else if $individuals.choice == '1':
+      #set $geno = ''
+      #set $pop = ''
+      #if $input.ext == "gd_snp":
+         -off=2
+      #else if $input.ext == "gd_genotype":
+         -off=0
+      #else:
+         -off=2
+      #end if
+      #for $population in $individuals.populations
+        #set $geno += "%s," % ($population.p1_input)
+        #set $pop += "%s," % ($population.p1_id)
+      #end for
+      -population=$pop
+    #else if $individuals.choice == '2':
+      #set $geno = $individuals.geno
+    #end if
+    -geno=$geno
+    #if $bioproj.value != '':
+       -bioproj=$bioproj
+    #end if
+    #if $biosamp.value != '':
+       -biosamp=$biosamp
+    #end if
+    > $output
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="gd_snp,gd_genotype" label="SNP dataset" />
+    <conditional name="individuals">
+      <param name="choice" type="select" label="Generate dataset for">
+        <option value="0" selected="true">All individuals</option>
+        <option value="1">Individuals in populations</option>
+        <option value="2">A single individual</option>
+      </param>
+      <when value="0">
+        <param name="pall_id" type="text" size="20" label="ID for this population" help="Leaving this blank will omit allele counts from the output" />
+      </when>
+      <when value="1">
+        <repeat name="populations" title="Population" min="1">
+        <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" />
+        <param name="p1_id" type="text" size="20" label="ID for this population" help="Leaving this blank will omit allele counts from the output" />
+        </repeat>
+      </when>
+      <when value="2">
+        <param name="geno" type="data_column" data_ref="input" label="Column containing genotype" value="8" />
+      </when>
+    </conditional> 
+    <param name="hand" type="text" size="20" label="dbSNP handle" help="If you do not have a handle, request one at http://www.ncbi.nlm.nih.gov/projects/SNP/handle.html" />
+    <param name="batch" type="text" size="20" label="Batch ID" help="ID used to tie dbSNP metadata to the VCF submission" />
+    <param name="ref" type="text" size="20" label="Reference sequence ID" help="The RefSeq assembly accession.version on which the SNP positions are based (see http://www.ncbi.nlm.nih.gov/assembly/)" />
+    <param name="bioproj" type="text" size="20" label="Optional: Registered BioProject ID" />
+    <param name="biosamp" type="text" size="20" label="Optional: Comma-separated list of registered BioSample IDs" />
+  </inputs>
+
+  <outputs>
+    <data name="output" format="vcf" />
+    <data name="output2" format="text" />
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="sample.gd_snp" ftype="gd_snp" />
+      <param name="choice" value="2" />
+      <param name="geno" value="11" />
+      <param name="hand" value="MyHandle" />
+      <param name="batch" value="Test1" />
+      <param name="ref" value="pb_000001.1" />
+      <output name="output" file="snpsForSubmission.vcf" ftype="vcf" compare="diff" />
+      <output name="output2" file="snpsForSubmission.text" ftype="text" compare="diff" />
+    </test>
+  </tests>
+
+  <help>
+
+**Dataset formats**
+
+The input dataset is in gd_snp_ or gd_genotype_ format.
+The output consists of two datasets needed for submitting SNPs:
+a VCF_ file in the specific format required by dbSNP, and a partially
+completed text_ file for the associated dbSNP metadata.
+(`Dataset missing?`_)
+
+.. _gd_snp: ./static/formatHelp.html#gd_snp
+.. _gd_genotype: ./static/formatHelp.html#gd_genotype
+.. _VCF: ./static/formatHelp.html#vcf
+.. _text: ./static/formatHelp.html#text
+.. _Dataset missing?: ./static/formatHelp.html
+
+-----
+
+**What it does**
+
+This tool converts a dataset in gd_snp or gd_genotype format to a VCF file formatted
+for submission to the dbSNP database at NCBI.  It also creates a partially
+filled-in template to assist you in preparing the required "metadata" file
+describing the SNP submission.
+
+-----
+
+**Example**
+
+- input::
+
+    #{"column_names":["scaf","pos","A","B","qual","ref","rpos","rnuc","1A","1B","1G","1Q","2A","2B","2G","2Q","3A","3B","3G","3Q","4A","4B","4G","4Q","5A","5B","5G","5Q","6A","6B","6G","6Q","pair","dist",
+    #"prim","rflp"],"dbkey":"canFam2","individuals":[["PB1",9],["PB2",13],["PB3",17],["PB4",21],["PB6",25],["PB8",29]],"pos":2,"rPos":7,"ref":6,"scaffold":1,"species":"bear"}
+    Contig161  115    C      T      73.5    chr1    4641382   C      6      0      2      45     8      0      2      51     15     0      2      72     5      0      2      42     6      0      2      45     10     0      2      57     Y      54     0.323   0
+    Contig48   11     A      G      94.3    chr1    10150264  A      1      0      2      30     1      0      2      30     1      0      2      30     3      0      2      36     1      0      2      30     1      0      2      30     Y      22     +99.    0
+    Contig20   66     C      T      54.0    chr1    21313534  C      4      0      2      39     4      0      2      39     5      0      2      42     4      0      2      39     4      0      2      39     5      0      2      42     N      1      +99.    0
+    etc.
+
+- VCF output (for all individuals, and giving a population ID)::
+
+    #CHROM     POS    ID               REF    ALT    QUAL    FILTER  INFO    FORMAT  PB
+    Contig161  115    Contig161;115    C      T      73.5    .       VRT=6   NA:AC   8:0
+    Contig48   11     Contig48;11      A      G      94.3    .       VRT=6   NA:AC   8:0
+    Contig     66     Contig20;66      C      T      54.0    .       VRT=6   NA:AC   8:0
+    etc.
+
+Note:  This excerpt from the output does not show all of the headers.  Also,
+if the population ID had not been given, then the last two columns would not
+appear in the output.
+
+-----
+
+**Reference**
+
+Sherry ST, Ward MH, Kholodov M, Baker J, Phan L, Smigielski EM, Sirotkin K.
+dbSNP: the NCBI database of genetic variation. Nucleic Acids Res. 2001
+Jan 1;29(1):308-11.
+
+  </help>
+</tool>