2
|
1 #!/usr/bin/perl
|
|
2
|
|
3 use strict;
|
|
4 use Getopt::Long;
|
|
5
|
|
6 my $usage = qq~Usage:$0 <args> [<opts>]
|
|
7
|
|
8 where <args> are:
|
|
9
|
|
10 -i, --input <VCF input>
|
|
11 -o, --out <VCF output>
|
|
12
|
|
13 <opts> are:
|
|
14
|
|
15 -g, --genelist <file listing the genes to be filtered>
|
|
16 -f, --feature <genomic feature: Exon,INTRON,UTR_3_PRIME,UTR_5_PRIME,DOWNSTREAM,UPSTREAM or INTERGENIC>
|
|
17 -s, --syn <keep only synonymous (s), non-synonymous (n)>
|
|
18 ~;
|
|
19 $usage .= "\n";
|
|
20
|
|
21 my ($input,$out);
|
|
22
|
|
23 my $filter_position = "";
|
|
24 my $filter_synonyme = "";
|
|
25
|
|
26 my $genelist;
|
|
27
|
|
28 GetOptions(
|
|
29 "input=s" => \$input,
|
|
30 "out=s" => \$out,
|
|
31 "genelist=s" => \$genelist,
|
|
32 "feature=s" => \$filter_position,
|
|
33 "syn=s" => \$filter_synonyme,
|
|
34 );
|
|
35
|
|
36
|
|
37 die $usage
|
|
38 if ( !$input || !$out);
|
|
39
|
|
40
|
|
41 open(my $O1,">$out.header");
|
|
42 my $header = "";
|
|
43 open(my $VCF,$input);
|
|
44 while(<$VCF>)
|
|
45 {
|
|
46 my $line = $_;
|
|
47 chomp($line);
|
|
48 my @infos = split(/\t/,$line);
|
|
49
|
|
50 if (scalar @infos > 9)
|
|
51 {
|
|
52 if (!/^#/)
|
|
53 {
|
|
54 last;
|
|
55 }
|
|
56 }
|
|
57 print $O1 $_;
|
|
58 }
|
|
59 close($O1);
|
|
60
|
|
61 if ($filter_synonyme =~/\w/)
|
|
62 {
|
|
63 if ($filter_synonyme eq "s")
|
|
64 {
|
|
65 system("grep ',SYNONYMOUS_' $input >$out.f1");
|
|
66 system("grep '=SYNONYMOUS_' $input >$out.f2");
|
|
67 }
|
|
68 if ($filter_synonyme eq "n")
|
|
69 {
|
|
70 system("grep ',NON_SYNONYMOUS_' $input >$out.f1");
|
|
71 system("grep '=NON_SYNONYMOUS_' $input >$out.f2");
|
|
72 }
|
|
73 system("cat $out.header >$out.r1");
|
|
74 system("cat $out.f1 >>$out.r1");
|
|
75 system("cat $out.f2 >>$out.r1");
|
|
76 }
|
|
77 else
|
|
78 {
|
|
79 system("cp -rf $input $out.r1");
|
|
80 }
|
|
81 if ($filter_position =~/\w+/)
|
|
82 {
|
|
83 system("grep $filter_position $out.r1 >$out.f3");
|
|
84 system("cat $out.header >>$out.r2");
|
|
85 system("cat $out.f3 >>$out.r2");
|
|
86 }
|
|
87 else
|
|
88 {
|
|
89 system("cp -rf $out.r1 $out.r2");
|
|
90 }
|
|
91
|
|
92 if ($genelist)
|
|
93 {
|
|
94 my %genes_to_be_analyzed;
|
|
95 open(my $F,$genelist);
|
|
96 while(<$F>)
|
|
97 {
|
|
98 my $line = $_;
|
|
99 $line =~s/\n//g;
|
|
100 $line =~s/\r//g;
|
|
101 $genes_to_be_analyzed{$line} = 1;
|
|
102 }
|
|
103 close($F);
|
|
104
|
|
105 my $num_line = 0;
|
|
106 open(my $OUT,">$out");
|
|
107 open(my $VCF,"$out.r2");
|
|
108 while(<$VCF>)
|
|
109 {
|
|
110 my $line = $_;
|
|
111 chomp($line);
|
|
112 my @infos = split(/\t/,$line);
|
|
113
|
|
114 if (scalar @infos > 9)
|
|
115 {
|
|
116 if (!/^#/)
|
|
117 {
|
|
118 $num_line++;
|
|
119
|
|
120 my $chromosome = $infos[0];
|
|
121 my $chromosome_position = $infos[1];
|
|
122 my $ref_allele = $infos[3];
|
|
123 my $alt_allele = $infos[4];
|
|
124 my $quality = $infos[6];
|
|
125 my $info = $infos[7];
|
|
126 my $is_in_exon = "#";
|
|
127 my $is_synonyme = "#";
|
|
128 my $gene = "intergenic";
|
|
129 my $modif_codon = "#";
|
|
130 my $modif_aa = "#";
|
|
131 my $geneposition;
|
|
132 if ($info =~/EFF=(.*)/)
|
|
133 {
|
|
134 my @annotations = split(",",$1);
|
|
135 foreach my $annotation(@annotations)
|
|
136 {
|
|
137 my ($syn, $additional) = split(/\(/,$annotation);
|
|
138
|
|
139
|
|
140 $is_in_exon = "intron";
|
|
141 if ($syn =~/UTR/)
|
|
142 {
|
|
143 $is_in_exon = $syn;
|
|
144 }
|
|
145 else
|
|
146 {
|
|
147 $is_synonyme = $syn;
|
|
148 }
|
|
149 my @infos_additional = split(/\|/,$additional);
|
|
150 $gene = $infos_additional[4];
|
|
151 }
|
|
152 }
|
|
153
|
|
154 if (!$genes_to_be_analyzed{$gene}){next;}
|
|
155 }
|
|
156 }
|
|
157 print $OUT $_;
|
|
158 }
|
|
159 close($VCF);
|
|
160 close($OUT);
|
|
161 }
|
|
162 else
|
|
163 {
|
|
164 system("cp $out.r2 $out");
|
|
165 }
|
|
166 unlink("$out.f1");
|
|
167 unlink("$out.f2");
|
|
168 unlink("$out.f3");
|
|
169 unlink("$out.r1");
|
|
170 unlink("$out.r2");
|
|
171 unlink("$out.header");
|