view FilterVCFonAnnotations.pl @ 2:15319113c0a5 draft

Uploaded
author dereeper
date Thu, 12 Feb 2015 15:54:24 -0500
parents
children
line wrap: on
line source

#!/usr/bin/perl

use strict;
use Getopt::Long;

my $usage = qq~Usage:$0 <args> [<opts>]

where <args> are:

    -i, --input         <VCF input>
    -o, --out           <VCF output>
      
      <opts> are:

    -g, --genelist      <file listing the genes to be filtered>
    -f, --feature       <genomic feature: Exon,INTRON,UTR_3_PRIME,UTR_5_PRIME,DOWNSTREAM,UPSTREAM or INTERGENIC>
    -s, --syn           <keep only synonymous (s), non-synonymous (n)>
~;
$usage .= "\n";

my ($input,$out);

my $filter_position = "";
my $filter_synonyme = "";

my $genelist;

GetOptions(
	"input=s"      => \$input,
	"out=s"        => \$out,
	"genelist=s"   => \$genelist,
	"feature=s"    => \$filter_position,
	"syn=s"        => \$filter_synonyme,
);


die $usage
  if ( !$input || !$out);


open(my $O1,">$out.header");
my $header = "";
open(my $VCF,$input);
while(<$VCF>)
{
	my $line = $_;
	chomp($line);
	my @infos = split(/\t/,$line);

	if (scalar @infos > 9)
        {
		if (!/^#/)
		{
			last;
		}		
	}
	print $O1 $_;
}
close($O1);

if ($filter_synonyme =~/\w/)
{
	if ($filter_synonyme eq "s")
	{
		system("grep ',SYNONYMOUS_' $input >$out.f1");
		system("grep '=SYNONYMOUS_' $input >$out.f2");
	}
	if ($filter_synonyme eq "n")
        {
		system("grep ',NON_SYNONYMOUS_' $input >$out.f1");
		system("grep '=NON_SYNONYMOUS_' $input >$out.f2");
	}	
	system("cat $out.header >$out.r1");
	system("cat $out.f1 >>$out.r1");
	system("cat $out.f2 >>$out.r1");
}
else
{
	system("cp -rf $input $out.r1");
}
if ($filter_position =~/\w+/)
{
	system("grep $filter_position $out.r1 >$out.f3");
	system("cat $out.header >>$out.r2");
	system("cat $out.f3 >>$out.r2");
}
else
{
	system("cp -rf $out.r1 $out.r2");
}

if ($genelist)
{
	my %genes_to_be_analyzed;
	open(my $F,$genelist);
	while(<$F>)
	{
		my $line = $_;
		$line =~s/\n//g;
		$line =~s/\r//g;
		$genes_to_be_analyzed{$line} = 1;
	}
	close($F);

	my $num_line = 0;
	open(my $OUT,">$out");
	open(my $VCF,"$out.r2");
	while(<$VCF>)
	{
		my $line = $_;
		chomp($line);
		my @infos = split(/\t/,$line);
		
		if (scalar @infos > 9)
		{
			if (!/^#/)
			{
				$num_line++;
				
				my $chromosome = $infos[0];
				my $chromosome_position = $infos[1];
				my $ref_allele = $infos[3];
				my $alt_allele = $infos[4];
				my $quality = $infos[6];
				my $info = $infos[7];
				my $is_in_exon = "#";
				my $is_synonyme = "#";
				my $gene = "intergenic";
				my $modif_codon = "#";
				my $modif_aa = "#";
				my $geneposition;
				if ($info =~/EFF=(.*)/)
				{
					my @annotations = split(",",$1);
					foreach my $annotation(@annotations)
					{
						my ($syn, $additional) = split(/\(/,$annotation);
	

						$is_in_exon = "intron";
						if ($syn =~/UTR/)
						{
							$is_in_exon = $syn;
						}
						else
						{
							$is_synonyme = $syn;
						}
						my @infos_additional = split(/\|/,$additional);
						$gene = $infos_additional[4];
					}
				}

				if (!$genes_to_be_analyzed{$gene}){next;}	
			}
		}
		print $OUT $_;
	}
	close($VCF);
	close($OUT);
}
else
{
	system("cp $out.r2 $out");
}
unlink("$out.f1");
unlink("$out.f2");	
unlink("$out.f3");
unlink("$out.r1");
unlink("$out.r2");
unlink("$out.header");