annotate FilterVCFonAnnotations.pl @ 2:15319113c0a5 draft

Uploaded
author dereeper
date Thu, 12 Feb 2015 15:54:24 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
2
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
3 use strict;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
4 use Getopt::Long;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
5
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
6 my $usage = qq~Usage:$0 <args> [<opts>]
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
7
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
8 where <args> are:
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
9
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
10 -i, --input <VCF input>
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
11 -o, --out <VCF output>
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
12
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
13 <opts> are:
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
14
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
15 -g, --genelist <file listing the genes to be filtered>
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
16 -f, --feature <genomic feature: Exon,INTRON,UTR_3_PRIME,UTR_5_PRIME,DOWNSTREAM,UPSTREAM or INTERGENIC>
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
17 -s, --syn <keep only synonymous (s), non-synonymous (n)>
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
18 ~;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
19 $usage .= "\n";
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
20
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
21 my ($input,$out);
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
22
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
23 my $filter_position = "";
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
24 my $filter_synonyme = "";
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
25
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
26 my $genelist;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
27
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
28 GetOptions(
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
29 "input=s" => \$input,
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
30 "out=s" => \$out,
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
31 "genelist=s" => \$genelist,
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
32 "feature=s" => \$filter_position,
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
33 "syn=s" => \$filter_synonyme,
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
34 );
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
35
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
36
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
37 die $usage
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
38 if ( !$input || !$out);
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
39
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
40
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
41 open(my $O1,">$out.header");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
42 my $header = "";
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
43 open(my $VCF,$input);
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
44 while(<$VCF>)
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
45 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
46 my $line = $_;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
47 chomp($line);
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
48 my @infos = split(/\t/,$line);
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
49
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
50 if (scalar @infos > 9)
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
51 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
52 if (!/^#/)
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
53 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
54 last;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
55 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
56 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
57 print $O1 $_;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
58 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
59 close($O1);
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
60
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
61 if ($filter_synonyme =~/\w/)
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
62 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
63 if ($filter_synonyme eq "s")
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
64 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
65 system("grep ',SYNONYMOUS_' $input >$out.f1");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
66 system("grep '=SYNONYMOUS_' $input >$out.f2");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
67 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
68 if ($filter_synonyme eq "n")
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
69 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
70 system("grep ',NON_SYNONYMOUS_' $input >$out.f1");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
71 system("grep '=NON_SYNONYMOUS_' $input >$out.f2");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
72 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
73 system("cat $out.header >$out.r1");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
74 system("cat $out.f1 >>$out.r1");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
75 system("cat $out.f2 >>$out.r1");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
76 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
77 else
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
78 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
79 system("cp -rf $input $out.r1");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
80 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
81 if ($filter_position =~/\w+/)
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
82 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
83 system("grep $filter_position $out.r1 >$out.f3");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
84 system("cat $out.header >>$out.r2");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
85 system("cat $out.f3 >>$out.r2");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
86 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
87 else
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
88 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
89 system("cp -rf $out.r1 $out.r2");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
90 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
91
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
92 if ($genelist)
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
93 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
94 my %genes_to_be_analyzed;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
95 open(my $F,$genelist);
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
96 while(<$F>)
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
97 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
98 my $line = $_;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
99 $line =~s/\n//g;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
100 $line =~s/\r//g;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
101 $genes_to_be_analyzed{$line} = 1;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
102 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
103 close($F);
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
104
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
105 my $num_line = 0;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
106 open(my $OUT,">$out");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
107 open(my $VCF,"$out.r2");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
108 while(<$VCF>)
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
109 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
110 my $line = $_;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
111 chomp($line);
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
112 my @infos = split(/\t/,$line);
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
113
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
114 if (scalar @infos > 9)
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
115 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
116 if (!/^#/)
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
117 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
118 $num_line++;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
119
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
120 my $chromosome = $infos[0];
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
121 my $chromosome_position = $infos[1];
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
122 my $ref_allele = $infos[3];
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
123 my $alt_allele = $infos[4];
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
124 my $quality = $infos[6];
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
125 my $info = $infos[7];
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
126 my $is_in_exon = "#";
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
127 my $is_synonyme = "#";
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
128 my $gene = "intergenic";
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
129 my $modif_codon = "#";
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
130 my $modif_aa = "#";
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
131 my $geneposition;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
132 if ($info =~/EFF=(.*)/)
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
133 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
134 my @annotations = split(",",$1);
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
135 foreach my $annotation(@annotations)
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
136 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
137 my ($syn, $additional) = split(/\(/,$annotation);
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
138
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
139
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
140 $is_in_exon = "intron";
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
141 if ($syn =~/UTR/)
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
142 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
143 $is_in_exon = $syn;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
144 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
145 else
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
146 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
147 $is_synonyme = $syn;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
148 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
149 my @infos_additional = split(/\|/,$additional);
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
150 $gene = $infos_additional[4];
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
151 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
152 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
153
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
154 if (!$genes_to_be_analyzed{$gene}){next;}
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
155 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
156 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
157 print $OUT $_;
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
158 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
159 close($VCF);
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
160 close($OUT);
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
161 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
162 else
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
163 {
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
164 system("cp $out.r2 $out");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
165 }
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
166 unlink("$out.f1");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
167 unlink("$out.f2");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
168 unlink("$out.f3");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
169 unlink("$out.r1");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
170 unlink("$out.r2");
15319113c0a5 Uploaded
dereeper
parents:
diff changeset
171 unlink("$out.header");