Mercurial > repos > dereeper > sniplay3
comparison VCFToolFilter/VCFToolsFilter.pl @ 24:21d878747ac6 draft default tip
Uploaded
| author | dereeper |
|---|---|
| date | Mon, 23 Mar 2015 05:53:20 -0400 |
| parents | 1b672da89d00 |
| children |
comparison
equal
deleted
inserted
replaced
| 23:a1ab979f4551 | 24:21d878747ac6 |
|---|---|
| 1 | |
| 2 #!/usr/bin/perl | |
| 3 | |
| 4 use strict; | |
| 5 use Switch; | |
| 6 use Getopt::Long; | |
| 7 use Bio::SeqIO; | |
| 8 | |
| 9 my $usage = qq~Usage:$0 <args> [<opts>] | |
| 10 | |
| 11 where <args> are: | |
| 12 | |
| 13 -i, --input <VCF input> | |
| 14 -o, --out <Output basename> | |
| 15 | |
| 16 <opts> are: | |
| 17 | |
| 18 -s, --samples <Samples to be analyzed. Comma separated list> | |
| 19 -c, --chromosomes <Chromosomes to be analyzed. Comma separated list> | |
| 20 -e, --export <Output format (VCF/freq/plink. Default: VCF> | |
| 21 -f, --frequency <Minimum MAF. Default: 0.001> | |
| 22 -m, --max_freq <Maximum MAF. Default: 0.5> | |
| 23 -a, --allow_missing <Allowed missing data proportion per site. Must be comprised between 0 and 1. Default: 0> | |
| 24 -n, --nb_alleles <Accepted number of alleles (min,max). Default: 2,4> | |
| 25 -t, --type <Type of polymorphisms to keep (ALL/SNP/INDEL). Default: ALL> | |
| 26 -b, --bounds <Lower bound and upper bound for a range of sites to be processed (start,end). Default: 1, 100000000> | |
| 27 ~; | |
| 28 $usage .= "\n"; | |
| 29 | |
| 30 my ($input,$out); | |
| 31 | |
| 32 | |
| 33 #my $indel_size_max = 500; | |
| 34 #my $indel_size_min = 1; | |
| 35 my $frequency_max = 0.5; | |
| 36 my $frequency_min = 0.001; | |
| 37 my $pos_max = 100000000000; | |
| 38 my $pos_min = 0; | |
| 39 my $filter_snp_type = "all"; | |
| 40 | |
| 41 my $missing_data = 0; | |
| 42 my $export = "VCF"; | |
| 43 my $type = "ALL"; | |
| 44 my $nb_alleles; | |
| 45 my $bounds; | |
| 46 my $samples; | |
| 47 my $chromosomes; | |
| 48 | |
| 49 GetOptions( | |
| 50 "input=s" => \$input, | |
| 51 "out=s" => \$out, | |
| 52 "samples=s" => \$samples, | |
| 53 "chromosomes=s" => \$chromosomes, | |
| 54 "frequency=s" => \$frequency_min, | |
| 55 "max_freq=s" => \$frequency_max, | |
| 56 "allow_missing=s"=> \$missing_data, | |
| 57 "export=s" => \$export, | |
| 58 "type=s" => \$type, | |
| 59 "nb_alleles=s" => \$nb_alleles, | |
| 60 "bounds=s" => \$bounds, | |
| 61 ); | |
| 62 | |
| 63 | |
| 64 die $usage | |
| 65 if ( !$input || !$out); | |
| 66 | |
| 67 if ($samples && $samples =~/^([\w\,]+)\s*$/){ | |
| 68 $samples = $1; | |
| 69 } | |
| 70 elsif ($samples){ | |
| 71 die "Error: Samples must be a comma separated list of string\n"; | |
| 72 } | |
| 73 if ($chromosomes && $chromosomes =~/^([\w\,]+)\s*$/){ | |
| 74 $chromosomes = $1; | |
| 75 } | |
| 76 elsif($chromosomes){ | |
| 77 die "Error: Chromosomes must be a comma separated list of string\n"; | |
| 78 } | |
| 79 if ($bounds && $bounds =~/^([\d\,]+)\s*$/){ | |
| 80 $bounds = $1; | |
| 81 } | |
| 82 elsif($bounds){ | |
| 83 die "Error: Bounds must be a comma separated list of integers\n"; | |
| 84 } | |
| 85 | |
| 86 if ($frequency_min && $frequency_min =~/^([\d\.]+)\s*$/){ | |
| 87 $frequency_min = $1; | |
| 88 } | |
| 89 elsif ($frequency_min){ | |
| 90 die "Error: frequency must be an integer\n"; | |
| 91 } | |
| 92 if ($frequency_max && $frequency_max =~/^([\d\.]+)\s*$/){ | |
| 93 $frequency_max = $1; | |
| 94 } | |
| 95 elsif($frequency_max){ | |
| 96 die "Error: frequency must be an integer\n"; | |
| 97 } | |
| 98 if ($missing_data && $missing_data =~/^([\d\.]+)\s*$/){ | |
| 99 $missing_data = $1; | |
| 100 } | |
| 101 elsif ($missing_data){ | |
| 102 die "Error: Missing data must be an integer\n"; | |
| 103 } | |
| 104 if ($nb_alleles && $nb_alleles =~/^([\d\.\,]+)\s*$/){ | |
| 105 $nb_alleles = $1; | |
| 106 } | |
| 107 elsif($nb_alleles){ | |
| 108 die "Error: Nb alleles must be two integers\n"; | |
| 109 } | |
| 110 if ($export && $export =~/^([\w]+)\s*$/){ | |
| 111 $export = $1; | |
| 112 } | |
| 113 elsif($export){ | |
| 114 die "Error: Export must be a string\n"; | |
| 115 } | |
| 116 if ($type && $type =~/^([\w]+)\s*$/){ | |
| 117 $type = $1; | |
| 118 } | |
| 119 elsif($type){ | |
| 120 die "Error: Type must be a string\n"; | |
| 121 } | |
| 122 | |
| 123 | |
| 124 my @dnasamples; | |
| 125 if ($samples) | |
| 126 { | |
| 127 @dnasamples = split(",",$samples); | |
| 128 } | |
| 129 my @nalleles; | |
| 130 if ($nb_alleles) | |
| 131 { | |
| 132 @nalleles = split(",",$nb_alleles); | |
| 133 } | |
| 134 my @boundaries; | |
| 135 if ($bounds) | |
| 136 { | |
| 137 @boundaries = split(",",$bounds); | |
| 138 } | |
| 139 my @chromosomes_list; | |
| 140 if ($chromosomes) | |
| 141 { | |
| 142 @chromosomes_list = split(",",$chromosomes); | |
| 143 } | |
| 144 | |
| 145 | |
| 146 my $experiment = "chromosomes"; | |
| 147 my $table = ""; | |
| 148 my %genes; | |
| 149 my @snp_ids; | |
| 150 my @snp_ids_and_positions; | |
| 151 my @snp_ids_and_positions_all; | |
| 152 my $gene; | |
| 153 my $snp_num = 0; | |
| 154 my %ref_sequences; | |
| 155 my %snps_of_gene; | |
| 156 | |
| 157 | |
| 158 | |
| 159 | |
| 160 my $indiv_cmd = ""; | |
| 161 if (@dnasamples) | |
| 162 { | |
| 163 $indiv_cmd = "--indv " . join(" --indv ",@dnasamples); | |
| 164 } | |
| 165 | |
| 166 my $chrom_cmd = ""; | |
| 167 if (@chromosomes_list) | |
| 168 { | |
| 169 $chrom_cmd = "--chr " . join(" --chr ",@chromosomes_list); | |
| 170 } | |
| 171 | |
| 172 my $export_cmd = "--recode"; | |
| 173 if ($export eq "freq") | |
| 174 { | |
| 175 $export_cmd = "--freq"; | |
| 176 } | |
| 177 if ($export eq "plink") | |
| 178 { | |
| 179 $export_cmd = "--plink"; | |
| 180 } | |
| 181 | |
| 182 | |
| 183 | |
| 184 my $nb_alleles_cmd = "--min-alleles 1 --max-alleles 4"; | |
| 185 if (@nalleles) | |
| 186 { | |
| 187 $nb_alleles_cmd = "--min-alleles $nalleles[0] --max-alleles $nalleles[1]"; | |
| 188 } | |
| 189 my $bounds_cmd = "--from-bp 1 --to-bp 100000000"; | |
| 190 if (@boundaries) | |
| 191 { | |
| 192 $bounds_cmd = "--from-bp $boundaries[0] --to-bp $boundaries[1]"; | |
| 193 } | |
| 194 | |
| 195 | |
| 196 my $type_cmd = ""; | |
| 197 if ($type eq "INDEL") | |
| 198 { | |
| 199 $type_cmd = "--keep-only-indels"; | |
| 200 } | |
| 201 if ($type eq "SNP") | |
| 202 { | |
| 203 $type_cmd = "--remove-indels"; | |
| 204 } | |
| 205 | |
| 206 | |
| 207 system("vcftools --vcf $input --out $out --keep-INFO-all --remove-filtered-all $type_cmd $export_cmd $chrom_cmd $indiv_cmd $nb_alleles_cmd --maf $frequency_min --max-maf $frequency_max --max-missing $missing_data >>vcftools.log 2>&1"); | |
| 208 | |
| 209 | |
| 210 | |
| 211 | |
| 212 | |
| 213 | |
| 214 |
