Mercurial > repos > dereeper > sniplay3
comparison VCFToolFilter/VCFToolsFilter.pl @ 12:a03f54c420f1 draft
Uploaded
author | dereeper |
---|---|
date | Fri, 20 Feb 2015 11:16:59 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
11:0b9d3dddafc9 | 12:a03f54c420f1 |
---|---|
1 | |
2 #!/usr/bin/perl | |
3 | |
4 use strict; | |
5 use Switch; | |
6 use Getopt::Long; | |
7 use Bio::SeqIO; | |
8 | |
9 my $usage = qq~Usage:$0 <args> [<opts>] | |
10 | |
11 where <args> are: | |
12 | |
13 -i, --input <VCF input> | |
14 -o, --out <Output basename> | |
15 | |
16 <opts> are: | |
17 | |
18 -s, --samples <Samples to be analyzed. Comma separated list> | |
19 -c, --chromosomes <Chromosomes to be analyzed. Comma separated list> | |
20 -e, --export <Output format (VCF/freq/plink. Default: VCF> | |
21 -f, --frequency <Minimum MAF. Default: 0.001> | |
22 -m, --max_freq <Maximum MAF. Default: 0.5> | |
23 -a, --allow_missing <Allowed missing data proportion per site. Must be comprised between 0 and 1. Default: 0> | |
24 -n, --nb_alleles <Accepted number of alleles (min,max). Default: 2,4> | |
25 -t, --type <Type of polymorphisms to keep (ALL/SNP/INDEL). Default: ALL> | |
26 -b, --bounds <Lower bound and upper bound for a range of sites to be processed (start,end). Default: 1, 100000000> | |
27 ~; | |
28 $usage .= "\n"; | |
29 | |
30 my ($input,$out); | |
31 | |
32 | |
33 #my $indel_size_max = 500; | |
34 #my $indel_size_min = 1; | |
35 my $frequency_max = 0.5; | |
36 my $frequency_min = 0.001; | |
37 my $pos_max = 100000000000; | |
38 my $pos_min = 0; | |
39 my $filter_snp_type = "all"; | |
40 | |
41 my $missing_data = 0; | |
42 my $export = "VCF"; | |
43 my $type = "ALL"; | |
44 my $nb_alleles; | |
45 my $bounds; | |
46 my $samples; | |
47 my $chromosomes; | |
48 | |
49 GetOptions( | |
50 "input=s" => \$input, | |
51 "out=s" => \$out, | |
52 "samples=s" => \$samples, | |
53 "chromosomes=s" => \$chromosomes, | |
54 "frequency=s" => \$frequency_min, | |
55 "max_freq=s" => \$frequency_max, | |
56 "allow_missing=s"=> \$missing_data, | |
57 "export=s" => \$export, | |
58 "type=s" => \$type, | |
59 "nb_alleles=s" => \$nb_alleles, | |
60 "bounds=s" => \$bounds, | |
61 ); | |
62 | |
63 | |
64 die $usage | |
65 if ( !$input || !$out); | |
66 | |
67 if ($samples && $samples =~/^([\w\,]+)\s*$/){ | |
68 $samples = $1; | |
69 } | |
70 elsif ($samples){ | |
71 die "Error: Samples must be a comma separated list of string\n"; | |
72 } | |
73 if ($chromosomes && $chromosomes =~/^([\w\,]+)\s*$/){ | |
74 $chromosomes = $1; | |
75 } | |
76 elsif($chromosomes){ | |
77 die "Error: Chromosomes must be a comma separated list of string\n"; | |
78 } | |
79 if ($bounds && $bounds =~/^([\d\,]+)\s*$/){ | |
80 $bounds = $1; | |
81 } | |
82 elsif($bounds){ | |
83 die "Error: Bounds must be a comma separated list of integers\n"; | |
84 } | |
85 | |
86 if ($frequency_min && $frequency_min =~/^([\d\.]+)\s*$/){ | |
87 $frequency_min = $1; | |
88 } | |
89 elsif ($frequency_min){ | |
90 die "Error: frequency must be an integer\n"; | |
91 } | |
92 if ($frequency_max && $frequency_max =~/^([\d\.]+)\s*$/){ | |
93 $frequency_max = $1; | |
94 } | |
95 elsif($frequency_max){ | |
96 die "Error: frequency must be an integer\n"; | |
97 } | |
98 if ($missing_data && $missing_data =~/^([\d\.]+)\s*$/){ | |
99 $missing_data = $1; | |
100 } | |
101 elsif ($missing_data){ | |
102 die "Error: Missing data must be an integer\n"; | |
103 } | |
104 if ($nb_alleles && $nb_alleles =~/^([\d\.\,]+)\s*$/){ | |
105 $nb_alleles = $1; | |
106 } | |
107 elsif($nb_alleles){ | |
108 die "Error: Nb alleles must be two integers\n"; | |
109 } | |
110 if ($export && $export =~/^([\w]+)\s*$/){ | |
111 $export = $1; | |
112 } | |
113 elsif($export){ | |
114 die "Error: Export must be a string\n"; | |
115 } | |
116 if ($type && $type =~/^([\w]+)\s*$/){ | |
117 $type = $1; | |
118 } | |
119 elsif($type){ | |
120 die "Error: Type must be a string\n"; | |
121 } | |
122 | |
123 | |
124 my @dnasamples; | |
125 if ($samples) | |
126 { | |
127 @dnasamples = split(",",$samples); | |
128 } | |
129 my @nalleles; | |
130 if ($nb_alleles) | |
131 { | |
132 @nalleles = split(",",$nb_alleles); | |
133 } | |
134 my @boundaries; | |
135 if ($bounds) | |
136 { | |
137 @boundaries = split(",",$bounds); | |
138 } | |
139 my @chromosomes_list; | |
140 if ($chromosomes) | |
141 { | |
142 @chromosomes_list = split(",",$chromosomes); | |
143 } | |
144 | |
145 | |
146 my $experiment = "chromosomes"; | |
147 my $table = ""; | |
148 my %genes; | |
149 my @snp_ids; | |
150 my @snp_ids_and_positions; | |
151 my @snp_ids_and_positions_all; | |
152 my $gene; | |
153 my $snp_num = 0; | |
154 my %ref_sequences; | |
155 my %snps_of_gene; | |
156 | |
157 | |
158 | |
159 | |
160 my $indiv_cmd = ""; | |
161 if (@dnasamples) | |
162 { | |
163 $indiv_cmd = "--indv " . join(" --indv ",@dnasamples); | |
164 } | |
165 | |
166 my $chrom_cmd = ""; | |
167 if (@chromosomes_list) | |
168 { | |
169 $chrom_cmd = "--chr " . join(" --chr ",@chromosomes_list); | |
170 } | |
171 | |
172 my $export_cmd = "--recode"; | |
173 if ($export eq "freq") | |
174 { | |
175 $export_cmd = "--freq"; | |
176 } | |
177 if ($export eq "plink") | |
178 { | |
179 $export_cmd = "--plink"; | |
180 } | |
181 | |
182 | |
183 | |
184 my $nb_alleles_cmd = "--min-alleles 1 --max-alleles 4"; | |
185 if (@nalleles) | |
186 { | |
187 $nb_alleles_cmd = "--min-alleles $nalleles[0] --max-alleles $nalleles[1]"; | |
188 } | |
189 my $bounds_cmd = "--from-bp 1 --to-bp 100000000"; | |
190 if (@boundaries) | |
191 { | |
192 $bounds_cmd = "--from-bp $boundaries[0] --to-bp $boundaries[1]"; | |
193 } | |
194 | |
195 | |
196 my $type_cmd = ""; | |
197 if ($type eq "INDEL") | |
198 { | |
199 $type_cmd = "--keep-only-indels"; | |
200 } | |
201 if ($type eq "SNP") | |
202 { | |
203 $type_cmd = "--remove-indels"; | |
204 } | |
205 | |
206 | |
207 system("vcftools --vcf $input --out $out --keep-INFO-all --remove-filtered-all $type_cmd $export_cmd $chrom_cmd $indiv_cmd $nb_alleles_cmd --maf $frequency_min --max-maf $frequency_max --max-missing $missing_data"); | |
208 | |
209 | |
210 | |
211 | |
212 | |
213 | |
214 |