diff FilterVCFonAnnotations.pl @ 2:15319113c0a5 draft

Uploaded
author dereeper
date Thu, 12 Feb 2015 15:54:24 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/FilterVCFonAnnotations.pl	Thu Feb 12 15:54:24 2015 -0500
@@ -0,0 +1,171 @@
+#!/usr/bin/perl
+
+use strict;
+use Getopt::Long;
+
+my $usage = qq~Usage:$0 <args> [<opts>]
+
+where <args> are:
+
+    -i, --input         <VCF input>
+    -o, --out           <VCF output>
+      
+      <opts> are:
+
+    -g, --genelist      <file listing the genes to be filtered>
+    -f, --feature       <genomic feature: Exon,INTRON,UTR_3_PRIME,UTR_5_PRIME,DOWNSTREAM,UPSTREAM or INTERGENIC>
+    -s, --syn           <keep only synonymous (s), non-synonymous (n)>
+~;
+$usage .= "\n";
+
+my ($input,$out);
+
+my $filter_position = "";
+my $filter_synonyme = "";
+
+my $genelist;
+
+GetOptions(
+	"input=s"      => \$input,
+	"out=s"        => \$out,
+	"genelist=s"   => \$genelist,
+	"feature=s"    => \$filter_position,
+	"syn=s"        => \$filter_synonyme,
+);
+
+
+die $usage
+  if ( !$input || !$out);
+
+
+open(my $O1,">$out.header");
+my $header = "";
+open(my $VCF,$input);
+while(<$VCF>)
+{
+	my $line = $_;
+	chomp($line);
+	my @infos = split(/\t/,$line);
+
+	if (scalar @infos > 9)
+        {
+		if (!/^#/)
+		{
+			last;
+		}		
+	}
+	print $O1 $_;
+}
+close($O1);
+
+if ($filter_synonyme =~/\w/)
+{
+	if ($filter_synonyme eq "s")
+	{
+		system("grep ',SYNONYMOUS_' $input >$out.f1");
+		system("grep '=SYNONYMOUS_' $input >$out.f2");
+	}
+	if ($filter_synonyme eq "n")
+        {
+		system("grep ',NON_SYNONYMOUS_' $input >$out.f1");
+		system("grep '=NON_SYNONYMOUS_' $input >$out.f2");
+	}	
+	system("cat $out.header >$out.r1");
+	system("cat $out.f1 >>$out.r1");
+	system("cat $out.f2 >>$out.r1");
+}
+else
+{
+	system("cp -rf $input $out.r1");
+}
+if ($filter_position =~/\w+/)
+{
+	system("grep $filter_position $out.r1 >$out.f3");
+	system("cat $out.header >>$out.r2");
+	system("cat $out.f3 >>$out.r2");
+}
+else
+{
+	system("cp -rf $out.r1 $out.r2");
+}
+
+if ($genelist)
+{
+	my %genes_to_be_analyzed;
+	open(my $F,$genelist);
+	while(<$F>)
+	{
+		my $line = $_;
+		$line =~s/\n//g;
+		$line =~s/\r//g;
+		$genes_to_be_analyzed{$line} = 1;
+	}
+	close($F);
+
+	my $num_line = 0;
+	open(my $OUT,">$out");
+	open(my $VCF,"$out.r2");
+	while(<$VCF>)
+	{
+		my $line = $_;
+		chomp($line);
+		my @infos = split(/\t/,$line);
+		
+		if (scalar @infos > 9)
+		{
+			if (!/^#/)
+			{
+				$num_line++;
+				
+				my $chromosome = $infos[0];
+				my $chromosome_position = $infos[1];
+				my $ref_allele = $infos[3];
+				my $alt_allele = $infos[4];
+				my $quality = $infos[6];
+				my $info = $infos[7];
+				my $is_in_exon = "#";
+				my $is_synonyme = "#";
+				my $gene = "intergenic";
+				my $modif_codon = "#";
+				my $modif_aa = "#";
+				my $geneposition;
+				if ($info =~/EFF=(.*)/)
+				{
+					my @annotations = split(",",$1);
+					foreach my $annotation(@annotations)
+					{
+						my ($syn, $additional) = split(/\(/,$annotation);
+	
+
+						$is_in_exon = "intron";
+						if ($syn =~/UTR/)
+						{
+							$is_in_exon = $syn;
+						}
+						else
+						{
+							$is_synonyme = $syn;
+						}
+						my @infos_additional = split(/\|/,$additional);
+						$gene = $infos_additional[4];
+					}
+				}
+
+				if (!$genes_to_be_analyzed{$gene}){next;}	
+			}
+		}
+		print $OUT $_;
+	}
+	close($VCF);
+	close($OUT);
+}
+else
+{
+	system("cp $out.r2 $out");
+}
+unlink("$out.f1");
+unlink("$out.f2");	
+unlink("$out.f3");
+unlink("$out.r1");
+unlink("$out.r2");
+unlink("$out.header");