diff VCFToolFilter/VCFToolsFilter.pl @ 5:8d2d0b6c3521 draft

Uploaded
author dereeper
date Fri, 20 Feb 2015 10:52:02 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VCFToolFilter/VCFToolsFilter.pl	Fri Feb 20 10:52:02 2015 -0500
@@ -0,0 +1,214 @@
+
+#!/usr/bin/perl
+
+use strict;
+use Switch;
+use Getopt::Long;
+use Bio::SeqIO;
+
+my $usage = qq~Usage:$0 <args> [<opts>]
+
+where <args> are:
+
+    -i, --input          <VCF input>
+    -o, --out            <Output basename>
+      
+      <opts> are:
+
+    -s, --samples        <Samples to be analyzed. Comma separated list>
+    -c, --chromosomes    <Chromosomes to be analyzed. Comma separated list>
+    -e, --export         <Output format (VCF/freq/plink. Default: VCF>
+    -f, --frequency      <Minimum MAF. Default: 0.001>
+    -m, --max_freq       <Maximum MAF. Default: 0.5>
+    -a, --allow_missing  <Allowed missing data proportion per site. Must be comprised between 0 and 1. Default: 0>
+    -n, --nb_alleles     <Accepted number of alleles (min,max). Default: 2,4>
+    -t, --type           <Type of polymorphisms to keep (ALL/SNP/INDEL). Default: ALL>
+    -b, --bounds         <Lower bound and upper bound for a range of sites to be processed (start,end). Default: 1, 100000000>
+~;
+$usage .= "\n";
+
+my ($input,$out);
+
+
+#my $indel_size_max = 500;
+#my $indel_size_min = 1;
+my $frequency_max = 0.5;
+my $frequency_min = 0.001;
+my $pos_max = 100000000000;
+my $pos_min = 0;
+my $filter_snp_type = "all";
+
+my $missing_data = 0;
+my $export = "VCF";
+my $type = "ALL";
+my $nb_alleles;
+my $bounds;
+my $samples;
+my $chromosomes;
+
+GetOptions(
+	"input=s"        => \$input,
+	"out=s"          => \$out,
+	"samples=s"      => \$samples,
+	"chromosomes=s"  => \$chromosomes,
+	"frequency=s"    => \$frequency_min,
+	"max_freq=s"     => \$frequency_max,
+	"allow_missing=s"=> \$missing_data,
+	"export=s"       => \$export,
+	"type=s"         => \$type,
+	"nb_alleles=s"   => \$nb_alleles,
+	"bounds=s"       => \$bounds,
+);
+
+
+die $usage
+  if ( !$input || !$out);
+
+if ($samples && $samples =~/^([\w\,]+)\s*$/){
+        $samples = $1;
+}
+elsif ($samples){
+        die "Error: Samples must be a comma separated list of string\n";
+}
+if ($chromosomes && $chromosomes =~/^([\w\,]+)\s*$/){
+        $chromosomes = $1;
+}
+elsif($chromosomes){
+        die "Error: Chromosomes must be a comma separated list of string\n";
+}
+if ($bounds && $bounds =~/^([\d\,]+)\s*$/){
+        $bounds = $1;
+}
+elsif($bounds){
+        die "Error: Bounds must be a comma separated list of integers\n";
+}
+
+if ($frequency_min && $frequency_min =~/^([\d\.]+)\s*$/){
+        $frequency_min = $1;
+}
+elsif ($frequency_min){
+        die "Error: frequency must be an integer\n";
+}
+if ($frequency_max && $frequency_max =~/^([\d\.]+)\s*$/){
+        $frequency_max = $1;
+}
+elsif($frequency_max){
+        die "Error: frequency must be an integer\n";
+}
+if ($missing_data && $missing_data =~/^([\d\.]+)\s*$/){
+        $missing_data = $1;
+}
+elsif ($missing_data){
+        die "Error: Missing data must be an integer\n";
+}
+if ($nb_alleles && $nb_alleles =~/^([\d\.]+)\s*$/){
+        $nb_alleles = $1;
+}
+elsif($nb_alleles){
+        die "Error: Nb alleles must be an integer\n";
+}
+if ($export && $export =~/^([\w]+)\s*$/){
+        $export = $1;
+}
+elsif($export){
+        die "Error: Export must be a string\n";
+}
+if ($type && $type =~/^([\w]+)\s*$/){
+        $type = $1;
+}
+elsif($type){
+        die "Error: Type must be a string\n";
+}
+
+
+my @dnasamples;
+if ($samples)
+{
+	@dnasamples = split(",",$samples);
+}
+my @nalleles;
+if ($nb_alleles)
+{
+	@nalleles = split(",",$nb_alleles);
+}
+my @boundaries;
+if ($bounds)
+{
+	@boundaries = split(",",$bounds);
+}
+my @chromosomes_list;
+if ($chromosomes)
+{
+	@chromosomes_list = split(",",$chromosomes);
+}
+
+
+my $experiment = "chromosomes";
+my $table = "";
+my %genes;
+my @snp_ids;
+my @snp_ids_and_positions;
+my @snp_ids_and_positions_all;
+my $gene;
+my $snp_num = 0;
+my %ref_sequences;
+my %snps_of_gene;
+
+	
+
+
+my $indiv_cmd = "";
+if (@dnasamples)
+{
+	$indiv_cmd = "--indv " . join(" --indv ",@dnasamples);
+}
+
+my $chrom_cmd = "";
+if (@chromosomes_list)
+{
+	$chrom_cmd = "--chr " . join(" --chr ",@chromosomes_list);
+}
+
+my $export_cmd = "--recode";
+if ($export eq "freq")
+{
+	$export_cmd = "--freq";
+}
+if ($export eq "plink")
+{
+	$export_cmd = "--plink";
+}
+ 
+
+
+my $nb_alleles_cmd = "--min-alleles 1 --max-alleles 4";
+if (@nalleles)
+{
+	$nb_alleles_cmd = "--min-alleles $nalleles[0] --max-alleles $nalleles[1]";
+}
+my $bounds_cmd = "--from-bp 1 --to-bp 100000000";
+if (@boundaries)
+{
+        $bounds_cmd = "--from-bp $boundaries[0] --to-bp $boundaries[1]";
+}
+
+ 
+my $type_cmd = "";
+if ($type eq "INDEL")
+{
+	$type_cmd = "--keep-only-indels";
+}
+if ($type eq "SNP")
+{
+	$type_cmd = "--remove-indels";
+}
+
+	
+system("vcftools --vcf $input --out $out --keep-INFO-all --remove-filtered-all $type_cmd $export_cmd $chrom_cmd $indiv_cmd $nb_alleles_cmd --maf $frequency_min --max-maf $frequency_max --max-missing $missing_data");
+
+
+
+
+
+	
+