comparison VCFToolsFilter.pl @ 0:9dec9f724a50 draft

Uploaded
author dereeper
date Thu, 12 Feb 2015 15:37:31 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9dec9f724a50
1
2 #!/usr/bin/perl
3
4 use strict;
5 use Switch;
6 use Getopt::Long;
7 use Bio::SeqIO;
8
9 my $usage = qq~Usage:$0 <args> [<opts>]
10
11 where <args> are:
12
13 -i, --input <VCF input>
14 -o, --out <Output basename>
15
16 <opts> are:
17
18 -s, --samples <Samples to be analyzed. Comma separated list>
19 -c, --chromosomes <Chromosomes to be analyzed. Comma separated list>
20 -e, --export <Output format (VCF/freq/plink. Default: VCF>
21 -f, --frequency <Minimum MAF. Default: 0.001>
22 -m, --max_freq <Maximum MAF. Default: 0.5>
23 -a, --allow_missing <Allowed missing data proportion per site. Must be comprised between 0 and 1. Default: 1>
24 -n, --nb_alleles <Accepted number of alleles (min,max). Default: 2,4>
25 -t, --type <Type of polymorphisms to keep (ALL/SNP/INDEL). Default: ALL>
26 -b, --bounds <Lower bound and upper bound for a range of sites to be processed (start,end). Default: 1, 100000000>
27 ~;
28 $usage .= "\n";
29
30 my ($input,$out);
31
32
33 #my $indel_size_max = 500;
34 #my $indel_size_min = 1;
35 my $frequency_max = 0.5;
36 my $frequency_min = 0.001;
37 my $pos_max = 100000000000;
38 my $pos_min = 0;
39 my $filter_snp_type = "all";
40
41 my $missing_data = 1;
42 my $export = "VCF";
43 my $type = "ALL";
44 my $nb_alleles;
45 my $bounds;
46 my $samples;
47 my $chromosomes;
48
49 GetOptions(
50 "input=s" => \$input,
51 "out=s" => \$out,
52 "samples=s" => \$samples,
53 "chromosomes=s" => \$chromosomes,
54 "frequency=s" => \$frequency_min,
55 "max_freq=s" => \$frequency_max,
56 "allow_missing=s"=> \$missing_data,
57 "export=s" => \$export,
58 "type=s" => \$type,
59 "nb_alleles=s" => \$nb_alleles,
60 "bounds=s" => \$bounds,
61 );
62
63
64 die $usage
65 if ( !$input || !$out);
66
67
68 my @dnasamples;
69 if ($samples)
70 {
71 @dnasamples = split(",",$samples);
72 }
73 my @nalleles;
74 if ($nb_alleles)
75 {
76 @nalleles = split(",",$nb_alleles);
77 }
78 my @boundaries;
79 if ($bounds)
80 {
81 @boundaries = split(",",$bounds);
82 }
83 my @chromosomes_list;
84 if ($chromosomes)
85 {
86 @chromosomes_list = split(",",$chromosomes);
87 }
88
89
90 my $experiment = "chromosomes";
91 my $table = "";
92 my %genes;
93 my @snp_ids;
94 my @snp_ids_and_positions;
95 my @snp_ids_and_positions_all;
96 my $gene;
97 my $snp_num = 0;
98 my %ref_sequences;
99 my %snps_of_gene;
100
101
102
103
104 my $indiv_cmd = "";
105 if (@dnasamples)
106 {
107 $indiv_cmd = "--indv " . join(" --indv ",@dnasamples);
108 }
109
110 my $chrom_cmd = "";
111 if (@chromosomes_list)
112 {
113 $chrom_cmd = "--chr " . join(" --chr ",@chromosomes_list);
114 }
115
116 my $export_cmd = "--recode";
117 if ($export eq "freq")
118 {
119 $export_cmd = "--freq";
120 }
121 if ($export eq "plink")
122 {
123 $export_cmd = "--plink";
124 }
125
126
127
128 my $nb_alleles_cmd = "--min-alleles 1 --max-alleles 4";
129 if (@nalleles)
130 {
131 $nb_alleles_cmd = "--min-alleles $nalleles[0] --max-alleles $nalleles[1]";
132 }
133 my $bounds_cmd = "--from-bp 1 --to-bp 100000000";
134 if (@boundaries)
135 {
136 $bounds_cmd = "--from-bp $boundaries[0] --to-bp $boundaries[1]";
137 }
138
139
140 my $type_cmd = "";
141 if ($type eq "INDEL")
142 {
143 $type_cmd = "--keep-only-indels";
144 }
145 if ($type eq "SNP")
146 {
147 $type_cmd = "--remove-indels";
148 }
149
150
151 system("vcftools --vcf $input --out $out --keep-INFO-all --remove-filtered-all $type_cmd $export_cmd $chrom_cmd $indiv_cmd $nb_alleles_cmd --maf $frequency_min --max-maf $frequency_max --max-missing $missing_data >>vcftools.log 2>&1");
152
153
154
155
156
157
158