0
|
1
|
|
2 #!/usr/bin/perl
|
|
3
|
|
4 use strict;
|
|
5 use Getopt::Long;
|
|
6 use Bio::SeqIO;
|
|
7
|
|
8 my $usage = qq~Usage:$0 <args> [<opts>]
|
|
9
|
|
10 where <args> are:
|
|
11
|
|
12 -i, --input <VCF input>
|
|
13 -o, --out <Output basename>
|
|
14
|
|
15 <opts> are:
|
|
16
|
|
17 -s, --samples <Samples to be analyzed. Comma separated list>
|
|
18 -c, --chromosomes <List of chromosomes to be analyzed.>
|
|
19 -e, --export <Output format (VCF/freq/plink. Default: VCF>
|
|
20 -f, --frequency <Minimum MAF. Default: 0.001>
|
|
21 -m, --max_freq <Maximum MAF. Default: 0.5>
|
|
22 -a, --allow_missing <Allowed missing data proportion per site. Must be comprised between 0 and 1. Default: 1>
|
|
23 -t, --type <Type of polymorphisms to keep (ALL/SNP). Default: ALL>
|
|
24 -b, --bounds <Lower bound and upper bound for a range of sites to be processed (start,end). Default: 1, 100000000>
|
|
25 -r, --remove_filt <Remove all sites with a FILTER flag other than PASS (true/false). Default: false>
|
|
26 -d, --distance <Thin sites so that no two sites are within the specified distance from one another. Default: 0>
|
|
27 ~;
|
|
28 $usage .= "\n";
|
|
29
|
|
30 my ($input,$out);
|
|
31
|
|
32 my $PLINK_EXE = "plink";
|
|
33
|
|
34
|
|
35 #my $indel_size_max = 500;
|
|
36 #my $indel_size_min = 1;
|
|
37 my $frequency_max = 0.5;
|
|
38 my $frequency_min = 0.001;
|
|
39 my $pos_max = 100000000000;
|
|
40 my $pos_min = 0;
|
|
41 my $filter_snp_type = "all";
|
|
42 my $remove_filt = "False";
|
|
43
|
|
44 my $missing_data = 1;
|
|
45 my $export = "VCF";
|
|
46 my $type = "ALL";
|
|
47 my $bounds;
|
|
48 my $samples;
|
|
49 my $chromosomes;
|
|
50 my $thin;
|
|
51
|
|
52 GetOptions(
|
|
53 "input=s" => \$input,
|
|
54 "out=s" => \$out,
|
|
55 "samples=s" => \$samples,
|
|
56 "chromosomes=s" => \$chromosomes,
|
|
57 "frequency=s" => \$frequency_min,
|
|
58 "max_freq=s" => \$frequency_max,
|
|
59 "allow_missing=s"=> \$missing_data,
|
|
60 "export=s" => \$export,
|
|
61 "type=s" => \$type,
|
|
62 "bounds=s" => \$bounds,
|
|
63 "remove_filt=s" => \$remove_filt,
|
|
64 "distance=s" => \$thin
|
|
65 );
|
|
66
|
|
67
|
|
68 die $usage
|
|
69 if ( !$input || !$out);
|
|
70
|
|
71 if ($samples && $samples =~/^([\w\,\-\.]+)\s*$/){
|
|
72 $samples = $1;
|
|
73 }
|
|
74 elsif ($samples){
|
|
75 die "Error: Samples must be a comma separated list of string\n";
|
|
76 }
|
|
77 if ($bounds && $bounds =~/^([\d\,]+)\s*$/){
|
|
78 $bounds = $1;
|
|
79 }
|
|
80 elsif($bounds){
|
|
81 die "Error: Bounds must be a comma separated list of integers\n";
|
|
82 }
|
|
83
|
|
84 my $minfreq_cmd = "";
|
|
85 if ($frequency_min && $frequency_min > 0 && $frequency_min =~/^([\d\.]+)\s*$/){
|
|
86 $frequency_min = $1;
|
|
87 $minfreq_cmd = "--maf $frequency_min";
|
|
88 }
|
|
89 elsif ($frequency_min == 0){
|
|
90 $minfreq_cmd = "";
|
|
91 }
|
|
92 elsif ($frequency_min){
|
|
93 die "Error: frequency must be an integer\n";
|
|
94 }
|
|
95 if ($thin && $thin =~/^([\d\.]+)\s*$/){
|
|
96 $thin = $1;
|
|
97 }
|
|
98 elsif ($thin){
|
|
99 die "Error: frequency must be an integer\n";
|
|
100 }
|
|
101 my $maxfreq_cmd = "";
|
|
102 if ($frequency_max && $frequency_max =~/^([\d\.]+)\s*$/){
|
|
103 $frequency_max = $1;
|
|
104 if ($frequency_max < 0.5){
|
|
105 $maxfreq_cmd = "--max-maf $frequency_max";
|
|
106 }
|
|
107 }
|
|
108 elsif($frequency_max){
|
|
109 die "Error: frequency must be an integer\n";
|
|
110 }
|
|
111 if ($missing_data =~/^([\d\.]+)\s*$/){
|
|
112 $missing_data = $1;
|
|
113 #$missing_data = 1 - $missing_data;
|
|
114 }
|
|
115 elsif ($missing_data){
|
|
116 die "Error: Missing data must be an integer\n";
|
|
117 }
|
|
118 if ($export && $export =~/^([\w]+)\s*$/){
|
|
119 $export = $1;
|
|
120 }
|
|
121 elsif($export){
|
|
122 die "Error: Export must be a string\n";
|
|
123 }
|
|
124 if ($type && $type =~/^([\w]+)\s*$/){
|
|
125 $type = $1;
|
|
126 }
|
|
127 elsif($type){
|
|
128 die "Error: Type must be a string\n";
|
|
129 }
|
|
130
|
|
131
|
|
132 my @dnasamples;
|
|
133 if ($samples)
|
|
134 {
|
|
135 @dnasamples = split(",",$samples);
|
|
136 }
|
|
137 my @boundaries;
|
|
138 if ($bounds)
|
|
139 {
|
|
140 @boundaries = split(",",$bounds);
|
|
141 }
|
|
142
|
|
143
|
|
144 my $experiment = "chromosomes";
|
|
145 my $table = "";
|
|
146 my %genes;
|
|
147 my @snp_ids;
|
|
148 my @snp_ids_and_positions;
|
|
149 my @snp_ids_and_positions_all;
|
|
150 my $gene;
|
|
151 my $snp_num = 0;
|
|
152 my %ref_sequences;
|
|
153 my %snps_of_gene;
|
|
154
|
|
155 my $indiv_cmd = "";
|
|
156 if (@dnasamples)
|
|
157 {
|
|
158 if (scalar @dnasamples > 1)
|
|
159 {
|
|
160 open(my $S,">$out.samples");
|
|
161 foreach my $samp(@dnasamples){
|
|
162 print $S "$samp $samp\n";
|
|
163 }
|
|
164 close($S);
|
|
165 $indiv_cmd = "--keep $out.samples ";
|
|
166 }
|
|
167 else
|
|
168 {
|
|
169 $indiv_cmd = "--indv " . join(" --indv ",@dnasamples);
|
|
170 }
|
|
171 }
|
|
172
|
|
173 my $chrom_cmd = "";
|
|
174 if ($chromosomes)
|
|
175 {
|
|
176 $chrom_cmd = "--chr ".$chromosomes
|
|
177 }
|
|
178
|
|
179 my $export_cmd = "--recode vcf-iid";
|
|
180 if ($export eq "bcf"){
|
|
181 $export_cmd = "--recode bcf";
|
|
182 }
|
|
183 if ($export eq "freq"){
|
|
184 $export_cmd = "--freq";
|
|
185 }
|
|
186 if ($export eq "plink"){
|
|
187 $export_cmd = "--make-bed";
|
|
188 }
|
|
189 if ($export eq "bed"){
|
|
190 $export_cmd = "--make-bed";
|
|
191 }
|
|
192
|
|
193
|
|
194 my $bounds_cmd = "";
|
|
195 if (@boundaries && $chrom_cmd=~/\w/ && $chrom_cmd !~/,/)
|
|
196 {
|
|
197 $bounds_cmd = "--from-bp $boundaries[0] --to-bp $boundaries[1]";
|
|
198 }
|
|
199
|
|
200
|
|
201
|
|
202 my $type_cmd = "";
|
|
203 if ($type eq "SNP")
|
|
204 {
|
|
205 $type_cmd = "--snps-only";
|
|
206 }
|
|
207
|
|
208 my $filt_cmd = "";
|
|
209 if ($remove_filt eq "true")
|
|
210 {
|
|
211 $filt_cmd = "--remove-filtered-all";
|
|
212 }
|
|
213
|
|
214 my $thin_cmd = "";
|
|
215 if ($thin){
|
|
216 $thin_cmd = "--bp-space $thin";
|
|
217 }
|
|
218
|
|
219 #my $bcf_input = $input;
|
|
220 #$bcf_input =~s/vcf/bcf/g;
|
|
221 my $bcf_input;
|
|
222 my $bed_input = $input;
|
|
223 $bed_input =~s/\.bed//g;
|
|
224
|
|
225 if (-e "$bed_input.bed"){
|
|
226 system("$PLINK_EXE --bfile $bed_input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.plink.stdout 2>$out.plink.stderr");
|
|
227 # for first 1000 SNPs
|
|
228 system("$PLINK_EXE --bfile $bed_input --out $out.recode $type_cmd --recode vcf-fid $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr --thin-count 800 1>$out.2.plink.stdout 2>$out.2.plink.stderr");
|
|
229 }
|
|
230 elsif (-e $bcf_input){
|
|
231 system("$PLINK_EXE --bcf $bcf_input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.plink.stdout 2>$out.plink.stderr");
|
|
232 }
|
|
233 else
|
|
234 {
|
|
235 system("$PLINK_EXE --vcf $input --out $out $type_cmd $export_cmd $chrom_cmd $indiv_cmd $minfreq_cmd $maxfreq_cmd --geno $missing_data $thin_cmd $bounds_cmd --allow-extra-chr 1>$out.3.plink.stdout 2>$out.3.plink.stderr");
|
|
236
|
|
237 }
|
|
238
|
|
239
|
|
240
|
|
241
|
|
242
|