comparison make_phylip.xml @ 31:a631c2f6d913

Update to Miller Lab devshed revision 3c4110ffacc3
author Richard Burhans <burhans@bx.psu.edu>
date Fri, 20 Sep 2013 13:25:27 -0400
parents
children ea52b23f1141
comparison
equal deleted inserted replaced
30:4188853b940b 31:a631c2f6d913
1 <tool id="gd_make_phylip" name="Phylip" version="1.0.0" force_history_refresh="True">
2 <description>: prepare data for phylogenetic analysis</description>
3
<!-- Cheetah template that builds the make_phylip.py command line.
     Column numbers read from dataset metadata and from the data_column parameters have
     $zero_based (1) subtracted before being passed to the script (presumably converting
     1-based column numbers to 0-based indices; confirm against make_phylip.py).
     The reference and alternate allele columns are not read from metadata: they are computed
     as the position column plus one and plus two, for both the genotype and coverage datasets.
     input_type.choice '0' (Coverage) appends the coverage options; '1' (Genes) appends the
     gene annotation and FASTA options. -->
4 <command interpreter="python">
5 #set $zero_based = 1
6 #set $gen_chrClmn = int($input.metadata.scaffold) - $zero_based
7 #set $gen_posClmn = int($input.metadata.pos) - $zero_based
8 #set $gen_refClmn = int($input.metadata.pos) - $zero_based + 1
9 #set $gen_altrClmn = int($input.metadata.pos) - $zero_based + 2
10 make_phylip.py '--altrClmn=$gen_altrClmn' '--chrClmn=$gen_chrClmn' '--gd_indivs=$indivs_input' '--input=$input' '--output=$output1' '--output_id=$output1.id' '--output_dir=$__new_file_path__' '--posClmn=$gen_posClmn' '--refClmn=$gen_refClmn'
11 #if $input_type.choice == '0'
12 #set $cov_chrClmn = int($input_type.coverage_input.metadata.scaffold) - $zero_based
13 #set $cov_posClmn = int($input_type.coverage_input.metadata.pos) - $zero_based
14 #set $cov_refClmn = int($input_type.coverage_input.metadata.pos) - $zero_based + 1
15 #set $cov_altrClmn = int($input_type.coverage_input.metadata.pos) - $zero_based + 2
16 '--altrClmnCvrg=$cov_altrClmn' '--chrClmnCvrg=$cov_chrClmn' '--cvrgTreshold=$input_type.coverage_threshold' '--gd_indivs_cover=$indivs_input' '--indvlsPrctTrshld=$input_type.indivs_threshold' '--inputCover=$input_type.coverage_input' '--posClmnCvrg=$cov_posClmn' '--refClmnCvrg=$cov_refClmn'
17 #else if $input_type.choice == '1'
18 #set $fchrClmn = int($input_type.annotation_input.metadata.chromCol) - $zero_based
19 #set $strandClmn = int($input_type.annotation_input.metadata.strandCol) - $zero_based
20 #set $geneNameClmn = int($input_type.annotation_input.metadata.nameCol) - $zero_based
21 #set $txStartClmn = int(str($input_type.tx_start_col)) - $zero_based
22 #set $txEndClmn = int(str($input_type.tx_end_col)) - $zero_based
23 #set $cdsStartClmn = int(str($input_type.cds_start_col)) - $zero_based
24 #set $cdsEndClmn = int(str($input_type.cds_end_col)) - $zero_based
25 #set $startExsClmn = int(str($input_type.exs_start_col)) - $zero_based
26 #set $endExsClmn = int(str($input_type.exs_end_col)) - $zero_based
27 '--cdsEndClmn=$cdsEndClmn' '--cdsStartClmn=$cdsStartClmn' '--endExsClmn=$endExsClmn' '--fchrClmn=$fchrClmn' '--geneNameClmn=$geneNameClmn' '--gene_info=$input_type.annotation_input' '--sequence=$input_type.fasta_input' '--startExsClmn=$startExsClmn' '--strandClmn=$strandClmn' '--txEndClmn=$txEndClmn' '--txStartClmn=$txStartClmn'
28 #end if
29 </command>
30
<!-- Tool inputs: a genotype/SNP dataset and an individuals dataset are always required;
     the input_type conditional then selects either the Coverage inputs or the Genes inputs. -->
31 <inputs>
32 <param name="input" type="data" format="gd_genotype,gd_snp" label="Genotype/SNP dataset">
33 <validator type="metadata" check="scaffold" message="scaffold missing" />
34 <validator type="metadata" check="pos" message="pos missing" />
35 </param>
36 <param name="indivs_input" type="data" format="gd_indivs" label="Individuals dataset" />
37 <conditional name="input_type">
38 <param name="choice" type="select" format="integer" label="Input type">
39 <option value="0" selected="true">Coverage</option>
40 <option value="1">Genes</option>
41 </param>
<!-- Coverage mode: a second genotype/SNP-format dataset holding coverage values, plus two thresholds. -->
42 <when value="0">
43 <param name="coverage_input" type="data" format="gd_genotype,gd_snp" label="Coverage dataset">
44 <validator type="metadata" check="scaffold" message="scaffold missing" />
45 <validator type="metadata" check="pos" message="pos missing" />
46 </param>
47 <param name="coverage_threshold" type="integer" min="1" value="1" label="Coverage threshold" />
48 <param name="indivs_threshold" type="float" value="0.5" min="0.0" max="1.0" label="Individuals genotype percentage threshold" />
49 </when>
<!-- Genes mode: an interval-format gene annotation table plus a FASTA file. -->
50 <when value="1">
51 <param name="annotation_input" type="data" format="interval" label="Genes dataset">
52 <validator type="metadata" check="chromCol" message="chromCol missing" />
53 <validator type="metadata" check="strandCol" message="strandCol missing" />
54 <validator type="metadata" check="nameCol" message="nameCol missing" />
55 </param>
<!-- The six column selectors below choose columns of the Genes dataset, so their data_ref must be
     annotation_input (not the genotype/SNP dataset "input"); otherwise the column choices offered
     to the user are taken from the wrong dataset. -->
56 <param name="tx_start_col" type="data_column" data_ref="annotation_input" label="Genes transcript start column" />
57 <param name="tx_end_col" type="data_column" data_ref="annotation_input" label="Genes transcript end column" />
58 <param name="cds_start_col" type="data_column" data_ref="annotation_input" label="Genes coding sequence start column" />
59 <param name="cds_end_col" type="data_column" data_ref="annotation_input" label="Genes coding sequence end column" />
60 <param name="exs_start_col" type="data_column" data_ref="annotation_input" label="Genes exon starts column" />
61 <param name="exs_end_col" type="data_column" data_ref="annotation_input" label="Genes exon ends column" />
62 <param name="fasta_input" type="data" format="fasta" label="FASTA dataset" />
63 </when>
64 </conditional>
65 </inputs>
66
<!-- Single declared output dataset. The command also hands this dataset's id and
     $__new_file_path__ to the script as its output id and output directory; presumably the
     script writes any additional files there for Galaxy to pick up. TODO confirm against make_phylip.py. -->
67 <outputs>
68 <data name="output1" format="txt" />
69 </outputs>
70
71 <help>
72 **What it does**
73
74 This tool creates phylip formatted files from two different input types:
75 coverage and genes.
76
77 If the coverage option is selected the inputs for the program are:
78
79 1. a gd_indivs table
80 2. a gd_genotype file with the coverage information for individuals in the gd_indivs table
81 3. a gd_genotype file with the genotype information for individuals in the gd_indivs table
82 4. a coverage threshold (optional)
83 5. a percentage of individuals (threshold).
84
85 The program produces a phylip formatted file using the sequence in the
86 genotype file as a template. In each resulting sequence, nucleotides
87 that are below the coverage threshold, and positions with a percentage
88 of individuals below the selected value, are replaced by "N".
89
90 If the gene option is selected the inputs for the program are:
91
92 1. a gd_indivs table
93 2. a gene dataset table with a gene name in the first column
94 3. the column with transcript start in the gene dataset table
95 4. the column with transcript end in the gene dataset table
96 5. the column with coding start in the gene dataset table
97 6. the column with coding end in the gene dataset table
98 7. the column with exon starts (comma-separated) in the gene dataset table
99 8. the column with exon ends (comma-separated) in the gene dataset table
100 9. a FASTA formatted file for all the genes of interest with their names as headers (NOTE: these names should be the same in the input gene dataset table).
101
102 The program produces as output one phylip formatted file for each gene
103 in the gene dataset table.
104
105 -----
106
107 **Example**
108
109 In a case where the coverage option is selected, for the inputs:
110
111 - gd_indivs::
112
113 7 W_Java
114 10 E_Java
115 16 Pen_Ma
116 ...
117
118 - Genotype table::
119
120 chrM 15 T C -1 -1 2 -1 -1 2 -1 -1 -1 -1 -1 2 -1 -1 -1 -1 0 -1 -1
121 chrM 18 G A -1 -1 0 -1 -1 0 -1 -1 -1 -1 -1 0 -1 -1 -1 -1 0 -1 -1
122 chrM 20 C T -1 -1 0 -1 -1 2 -1 -1 -1 -1 -1 0 -1 -1 -1 -1 0 -1 -1
123 ...
124
125 - Coverage table::
126
127 chrM 0 G G 0 0 0 0 0 0 0 0 0 0 0 0 0
128 chrM 1 T T 0 0 3 0 0 50 0 0 0 0 0 2 0
129 chrM 2 T T 0 0 5 0 0 50 0 0 0 0 0 2 0
130 ...
131
132 - Coverage threshold = 0
133
134 - Percentage of individuals = 0.0
135
136 - The output is::
137
138 4 19 15428
139 W_Java GTTCATCATGTTCATCGAAT
140 E_Java GTTCATCATGTTCATCGAAC
141 Pen_Ma GTTCATCATGTTCATCGAAT
142
143 In a case where the gene option is selected, with the inputs:
144
145 - Gene dataset table input::
146
147 1 ENSLAFT00000017123 chrM + 1002 1061 1002 1061 1 1002, 1061, 0 ENSLAFG00000017122 cmpl incmpl 0, BTRC ENSLAFT00000017123 ENSLAFP00000014355
148 1 ENSLAFT00000037164 chrM - 1058 1092 1062 1073 1 1062,1068 1065,1073 0 ENSLAFG00000007680 cmpl cmpl 0, MYOF ENSLAFT00000037164 ENSLAFP00000025175 26509
149 1 ENSLAFT00000008925 chrM + 990 1000 990 1000 1 990, 1000, 0 ENSLAFG00000008924 incmpl incmpl 0, PRKG1 ENSLAFT00000008925 ENSLAFP00000007492
150 ...
151
152 In this table:
153
154 column with transcript start = 5
155 column with transcript end = 6
156 column with coding start = 7
157 column with coding end = 8
158 column with exon starts = 10
159 column with exon ends = 11
160
161 - gd_indivs::
162
163 7 W_Java
164 10 E_Java
165 16 Pen_Ma
166 ...
167
168 - Genotype table::
169
170 chrM 1005 T C -1 -1 2 -1 -1 2 -1 -1 -1 -1 -1 2 -1 -1 -1 -1 0 -1 -1
171 chrM 1060 G A -1 -1 0 -1 -1 0 -1 -1 -1 -1 -1 0 -1 -1 -1 -1 0 -1 -1
172 chrM 991 C T -1 -1 0 -1 -1 2 -1 -1 -1 -1 -1 0 -1 -1 -1 -1 0 -1 -1
173 ...
174
175 The output is one file for each sequence in the input gene dataset
176 table (as long as the sequence is included in the input FASTA file).
177 </help>
178 </tool>