12
|
1
|
|
2 #!/usr/bin/perl
|
|
3
|
|
4 use strict;
|
|
5 use Switch;
|
|
6 use Getopt::Long;
|
|
7 use Bio::SeqIO;
|
|
8
|
|
9 my $usage = qq~Usage:$0 <args> [<opts>]
|
|
10
|
|
11 where <args> are:
|
|
12
|
|
13 -i, --input <VCF input>
|
|
14 -o, --out <Output basename>
|
|
15
|
|
16 <opts> are:
|
|
17
|
|
18 -s, --samples <Samples to be analyzed. Comma separated list>
|
|
19 -c, --chromosomes <Chromosomes to be analyzed. Comma separated list>
|
|
20 -e, --export <Output format (VCF/freq/plink. Default: VCF>
|
|
21 -f, --frequency <Minimum MAF. Default: 0.001>
|
|
22 -m, --max_freq <Maximum MAF. Default: 0.5>
|
|
23 -a, --allow_missing <Allowed missing data proportion per site. Must be comprised between 0 and 1. Default: 0>
|
|
24 -n, --nb_alleles <Accepted number of alleles (min,max). Default: 2,4>
|
|
25 -t, --type <Type of polymorphisms to keep (ALL/SNP/INDEL). Default: ALL>
|
|
26 -b, --bounds <Lower bound and upper bound for a range of sites to be processed (start,end). Default: 1, 100000000>
|
|
27 ~;
|
|
28 $usage .= "\n";
|
|
29
|
|
30 my ($input,$out);
|
|
31
|
|
32
|
|
33 #my $indel_size_max = 500;
|
|
34 #my $indel_size_min = 1;
|
|
35 my $frequency_max = 0.5;
|
|
36 my $frequency_min = 0.001;
|
|
37 my $pos_max = 100000000000;
|
|
38 my $pos_min = 0;
|
|
39 my $filter_snp_type = "all";
|
|
40
|
|
41 my $missing_data = 0;
|
|
42 my $export = "VCF";
|
|
43 my $type = "ALL";
|
|
44 my $nb_alleles;
|
|
45 my $bounds;
|
|
46 my $samples;
|
|
47 my $chromosomes;
|
|
48
|
|
49 GetOptions(
|
|
50 "input=s" => \$input,
|
|
51 "out=s" => \$out,
|
|
52 "samples=s" => \$samples,
|
|
53 "chromosomes=s" => \$chromosomes,
|
|
54 "frequency=s" => \$frequency_min,
|
|
55 "max_freq=s" => \$frequency_max,
|
|
56 "allow_missing=s"=> \$missing_data,
|
|
57 "export=s" => \$export,
|
|
58 "type=s" => \$type,
|
|
59 "nb_alleles=s" => \$nb_alleles,
|
|
60 "bounds=s" => \$bounds,
|
|
61 );
|
|
62
|
|
63
|
|
64 die $usage
|
|
65 if ( !$input || !$out);
|
|
66
|
|
67 if ($samples && $samples =~/^([\w\,]+)\s*$/){
|
|
68 $samples = $1;
|
|
69 }
|
|
70 elsif ($samples){
|
|
71 die "Error: Samples must be a comma separated list of string\n";
|
|
72 }
|
|
73 if ($chromosomes && $chromosomes =~/^([\w\,]+)\s*$/){
|
|
74 $chromosomes = $1;
|
|
75 }
|
|
76 elsif($chromosomes){
|
|
77 die "Error: Chromosomes must be a comma separated list of string\n";
|
|
78 }
|
|
79 if ($bounds && $bounds =~/^([\d\,]+)\s*$/){
|
|
80 $bounds = $1;
|
|
81 }
|
|
82 elsif($bounds){
|
|
83 die "Error: Bounds must be a comma separated list of integers\n";
|
|
84 }
|
|
85
|
|
86 if ($frequency_min && $frequency_min =~/^([\d\.]+)\s*$/){
|
|
87 $frequency_min = $1;
|
|
88 }
|
|
89 elsif ($frequency_min){
|
|
90 die "Error: frequency must be an integer\n";
|
|
91 }
|
|
92 if ($frequency_max && $frequency_max =~/^([\d\.]+)\s*$/){
|
|
93 $frequency_max = $1;
|
|
94 }
|
|
95 elsif($frequency_max){
|
|
96 die "Error: frequency must be an integer\n";
|
|
97 }
|
|
98 if ($missing_data && $missing_data =~/^([\d\.]+)\s*$/){
|
|
99 $missing_data = $1;
|
|
100 }
|
|
101 elsif ($missing_data){
|
|
102 die "Error: Missing data must be an integer\n";
|
|
103 }
|
|
104 if ($nb_alleles && $nb_alleles =~/^([\d\.\,]+)\s*$/){
|
|
105 $nb_alleles = $1;
|
|
106 }
|
|
107 elsif($nb_alleles){
|
|
108 die "Error: Nb alleles must be two integers\n";
|
|
109 }
|
|
110 if ($export && $export =~/^([\w]+)\s*$/){
|
|
111 $export = $1;
|
|
112 }
|
|
113 elsif($export){
|
|
114 die "Error: Export must be a string\n";
|
|
115 }
|
|
116 if ($type && $type =~/^([\w]+)\s*$/){
|
|
117 $type = $1;
|
|
118 }
|
|
119 elsif($type){
|
|
120 die "Error: Type must be a string\n";
|
|
121 }
|
|
122
|
|
123
|
|
124 my @dnasamples;
|
|
125 if ($samples)
|
|
126 {
|
|
127 @dnasamples = split(",",$samples);
|
|
128 }
|
|
129 my @nalleles;
|
|
130 if ($nb_alleles)
|
|
131 {
|
|
132 @nalleles = split(",",$nb_alleles);
|
|
133 }
|
|
134 my @boundaries;
|
|
135 if ($bounds)
|
|
136 {
|
|
137 @boundaries = split(",",$bounds);
|
|
138 }
|
|
139 my @chromosomes_list;
|
|
140 if ($chromosomes)
|
|
141 {
|
|
142 @chromosomes_list = split(",",$chromosomes);
|
|
143 }
|
|
144
|
|
145
|
|
146 my $experiment = "chromosomes";
|
|
147 my $table = "";
|
|
148 my %genes;
|
|
149 my @snp_ids;
|
|
150 my @snp_ids_and_positions;
|
|
151 my @snp_ids_and_positions_all;
|
|
152 my $gene;
|
|
153 my $snp_num = 0;
|
|
154 my %ref_sequences;
|
|
155 my %snps_of_gene;
|
|
156
|
|
157
|
|
158
|
|
159
|
|
160 my $indiv_cmd = "";
|
|
161 if (@dnasamples)
|
|
162 {
|
|
163 $indiv_cmd = "--indv " . join(" --indv ",@dnasamples);
|
|
164 }
|
|
165
|
|
166 my $chrom_cmd = "";
|
|
167 if (@chromosomes_list)
|
|
168 {
|
|
169 $chrom_cmd = "--chr " . join(" --chr ",@chromosomes_list);
|
|
170 }
|
|
171
|
|
172 my $export_cmd = "--recode";
|
|
173 if ($export eq "freq")
|
|
174 {
|
|
175 $export_cmd = "--freq";
|
|
176 }
|
|
177 if ($export eq "plink")
|
|
178 {
|
|
179 $export_cmd = "--plink";
|
|
180 }
|
|
181
|
|
182
|
|
183
|
|
184 my $nb_alleles_cmd = "--min-alleles 1 --max-alleles 4";
|
|
185 if (@nalleles)
|
|
186 {
|
|
187 $nb_alleles_cmd = "--min-alleles $nalleles[0] --max-alleles $nalleles[1]";
|
|
188 }
|
|
189 my $bounds_cmd = "--from-bp 1 --to-bp 100000000";
|
|
190 if (@boundaries)
|
|
191 {
|
|
192 $bounds_cmd = "--from-bp $boundaries[0] --to-bp $boundaries[1]";
|
|
193 }
|
|
194
|
|
195
|
|
196 my $type_cmd = "";
|
|
197 if ($type eq "INDEL")
|
|
198 {
|
|
199 $type_cmd = "--keep-only-indels";
|
|
200 }
|
|
201 if ($type eq "SNP")
|
|
202 {
|
|
203 $type_cmd = "--remove-indels";
|
|
204 }
|
|
205
|
|
206
|
|
207 system("vcftools --vcf $input --out $out --keep-INFO-all --remove-filtered-all $type_cmd $export_cmd $chrom_cmd $indiv_cmd $nb_alleles_cmd --maf $frequency_min --max-maf $frequency_max --max-missing $missing_data");
|
|
208
|
|
209
|
|
210
|
|
211
|
|
212
|
|
213
|
|
214
|