Mercurial > repos > dereeper > sniplay3
diff VCFToolFilter/VCFToolsFilter.pl @ 7:f11604d28633 draft
Uploaded
author | dereeper |
---|---|
date | Fri, 20 Feb 2015 10:57:52 -0500 |
parents | 8d2d0b6c3521 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VCFToolFilter/VCFToolsFilter.pl Fri Feb 20 10:57:52 2015 -0500 @@ -0,0 +1,214 @@ + +#!/usr/bin/perl + +use strict; +use Switch; +use Getopt::Long; +use Bio::SeqIO; + +my $usage = qq~Usage:$0 <args> [<opts>] + +where <args> are: + + -i, --input <VCF input> + -o, --out <Output basename> + + <opts> are: + + -s, --samples <Samples to be analyzed. Comma separated list> + -c, --chromosomes <Chromosomes to be analyzed. Comma separated list> + -e, --export <Output format (VCF/freq/plink. Default: VCF> + -f, --frequency <Minimum MAF. Default: 0.001> + -m, --max_freq <Maximum MAF. Default: 0.5> + -a, --allow_missing <Allowed missing data proportion per site. Must be comprised between 0 and 1. Default: 0> + -n, --nb_alleles <Accepted number of alleles (min,max). Default: 2,4> + -t, --type <Type of polymorphisms to keep (ALL/SNP/INDEL). Default: ALL> + -b, --bounds <Lower bound and upper bound for a range of sites to be processed (start,end). Default: 1, 100000000> +~; +$usage .= "\n"; + +my ($input,$out); + + +#my $indel_size_max = 500; +#my $indel_size_min = 1; +my $frequency_max = 0.5; +my $frequency_min = 0.001; +my $pos_max = 100000000000; +my $pos_min = 0; +my $filter_snp_type = "all"; + +my $missing_data = 0; +my $export = "VCF"; +my $type = "ALL"; +my $nb_alleles; +my $bounds; +my $samples; +my $chromosomes; + +GetOptions( + "input=s" => \$input, + "out=s" => \$out, + "samples=s" => \$samples, + "chromosomes=s" => \$chromosomes, + "frequency=s" => \$frequency_min, + "max_freq=s" => \$frequency_max, + "allow_missing=s"=> \$missing_data, + "export=s" => \$export, + "type=s" => \$type, + "nb_alleles=s" => \$nb_alleles, + "bounds=s" => \$bounds, +); + + +die $usage + if ( !$input || !$out); + +if ($samples && $samples =~/^([\w\,]+)\s*$/){ + $samples = $1; +} +elsif ($samples){ + die "Error: Samples must be a comma separated list of string\n"; +} +if ($chromosomes && $chromosomes =~/^([\w\,]+)\s*$/){ + $chromosomes = $1; +} +elsif($chromosomes){ + die "Error: Chromosomes must be a comma separated list of string\n"; +} +if ($bounds && $bounds =~/^([\d\,]+)\s*$/){ + $bounds = $1; +} +elsif($bounds){ + die "Error: Bounds must be a comma separated list of integers\n"; +} + +if ($frequency_min && $frequency_min =~/^([\d\.]+)\s*$/){ + $frequency_min = $1; +} +elsif ($frequency_min){ + die "Error: frequency must be an integer\n"; +} +if ($frequency_max && $frequency_max =~/^([\d\.]+)\s*$/){ + $frequency_max = $1; +} +elsif($frequency_max){ + die "Error: frequency must be an integer\n"; +} +if ($missing_data && $missing_data =~/^([\d\.]+)\s*$/){ + $missing_data = $1; +} +elsif ($missing_data){ + die "Error: Missing data must be an integer\n"; +} +if ($nb_alleles && $nb_alleles =~/^([\d\.]+)\s*$/){ + $nb_alleles = $1; +} +elsif($nb_alleles){ + die "Error: Nb alleles must be an integer\n"; +} +if ($export && $export =~/^([\w]+)\s*$/){ + $export = $1; +} +elsif($export){ + die "Error: Export must be a string\n"; +} +if ($type && $type =~/^([\w]+)\s*$/){ + $type = $1; +} +elsif($type){ + die "Error: Type must be a string\n"; +} + + +my @dnasamples; +if ($samples) +{ + @dnasamples = split(",",$samples); +} +my @nalleles; +if ($nb_alleles) +{ + @nalleles = split(",",$nb_alleles); +} +my @boundaries; +if ($bounds) +{ + @boundaries = split(",",$bounds); +} +my @chromosomes_list; +if ($chromosomes) +{ + @chromosomes_list = split(",",$chromosomes); +} + + +my $experiment = "chromosomes"; +my $table = ""; +my %genes; +my @snp_ids; +my @snp_ids_and_positions; +my @snp_ids_and_positions_all; +my $gene; +my $snp_num = 0; +my %ref_sequences; +my %snps_of_gene; + + + + +my $indiv_cmd = ""; +if (@dnasamples) +{ + $indiv_cmd = "--indv " . join(" --indv ",@dnasamples); +} + +my $chrom_cmd = ""; +if (@chromosomes_list) +{ + $chrom_cmd = "--chr " . join(" --chr ",@chromosomes_list); +} + +my $export_cmd = "--recode"; +if ($export eq "freq") +{ + $export_cmd = "--freq"; +} +if ($export eq "plink") +{ + $export_cmd = "--plink"; +} + + + +my $nb_alleles_cmd = "--min-alleles 1 --max-alleles 4"; +if (@nalleles) +{ + $nb_alleles_cmd = "--min-alleles $nalleles[0] --max-alleles $nalleles[1]"; +} +my $bounds_cmd = "--from-bp 1 --to-bp 100000000"; +if (@boundaries) +{ + $bounds_cmd = "--from-bp $boundaries[0] --to-bp $boundaries[1]"; +} + + +my $type_cmd = ""; +if ($type eq "INDEL") +{ + $type_cmd = "--keep-only-indels"; +} +if ($type eq "SNP") +{ + $type_cmd = "--remove-indels"; +} + + +system("vcftools --vcf $input --out $out --keep-INFO-all --remove-filtered-all $type_cmd $export_cmd $chrom_cmd $indiv_cmd $nb_alleles_cmd --maf $frequency_min --max-maf $frequency_max --max-missing $missing_data"); + + + + + + +