comparison VCF2Hapmap/VCF2FastaAndHapmap.pl @ 3:345f88a8f483 draft

Uploaded
author dereeper
date Fri, 10 Jul 2015 10:38:43 -0400
parents 420b57c3c185
children
comparison
equal deleted inserted replaced
2:feb40a9a8eae 3:345f88a8f483
1
2 #!/usr/bin/perl
3
4 use strict;
5 use Getopt::Long;
6
7 my $usage = qq~Usage:$0 <args> [<opts>]
8
9 where <args> are:
10
11 -v, --vcf <VCF input>
12 -o, --out <Output basename>
13
14 <opts> are:
15
16 -r, --reference <Reference fasta file>
17 -g, --gff <GFF input file to create alignments of genes>
18 ~;
19 $usage .= "\n";
20
21 my ($input,$out,$reference,$gff);
22
23
24
25 GetOptions(
26 "vcf=s" => \$input,
27 "out=s" => \$out,
28 "reference=s" => \$reference,
29 "gff=s" => \$gff
30 );
31
32
33 die $usage
34 if ( !$input || !$out);
35
36 if ($gff && !$reference)
37 {
38 die "You must provide a Fasta reference file when providing GFF annotation\n";
39 }
40
41
42 my %ref_sequences;
43 if ($reference)
44 {
45 my $id;
46 my $sequence = "";
47 open(my $R,$reference) or die "cannot open file: $reference";
48 while(<$R>)
49 {
50 my $line =$_;
51 $line =~s/\n//g;
52 $line =~s/\r//g;
53 if ($line =~ />([^\s]+)/){
54 $ref_sequences{$id} = $sequence;
55 $id=$1;$sequence="";
56 }
57 else
58 {
59 $sequence .= $line;
60 }
61 }
62 close($R);
63 $ref_sequences{$id} = $sequence;
64 }
65
66
67 my %chr_of_gene;
68 my %ann;
69 if ($gff)
70 {
71 open(my $G,$gff) or die "cannot open file: $gff";
72 while(<$G>)
73 {
74 my $line =$_;
75 $line =~s/\n//g;
76 $line =~s/\r//g;
77 my @i = split(/\t/,$line);
78 my $chr = $i[0];
79 my $feature = $i[2];
80 my $strand = $i[6];
81 my $start = $i[3];
82 my $stop = $i[4];
83 my $inf = $i[8];
84 if ($feature eq 'gene')
85 {
86 if ($inf =~/Name=([\w\-\.]+)[;\s]*/){$inf = $1;}
87 $ann{$inf}{"start"}=$start;
88 $ann{$inf}{"stop"}=$stop;
89 $ann{$inf}{"strand"}=$strand;
90 $chr_of_gene{$inf} = $chr;
91 }
92 }
93 close($G);
94 }
95
96
97
98 my %IUPAC =
99 (
100 '[A/G]'=> "R",
101 '[G/A]'=> "R",
102 '[C/T]'=> "Y",
103 '[T/C]'=> "Y",
104 '[T/G]'=> "K",
105 '[G/T]'=> "K",
106 '[C/G]'=> "S",
107 '[G/C]'=> "S",
108 '[A/T]'=> "W",
109 '[T/A]'=> "W",
110 '[A/C]'=> "M",
111 '[C/A]'=> "M",
112 '[C/A/T]'=> "H",
113 '[A/T/C]'=> "H",
114 '[A/C/T]'=> "H",
115 '[C/T/A]'=> "H",
116 '[T/C/A]'=> "H",
117 '[T/A/C]'=> "H",
118 '[C/A/G]'=> "V",
119 '[A/G/C]'=> "V",
120 '[A/C/G]'=> "V",
121 '[C/G/A]'=> "V",
122 '[G/C/A]'=> "V",
123 '[G/A/C]'=> "V",
124 '[C/T/G]'=> "B",
125 '[T/G/C]'=> "B",
126 '[T/C/G]'=> "B",
127 '[C/G/T]'=> "B",
128 '[G/C/T]'=> "B",
129 '[G/T/C]'=> "B",
130 '[T/A/G]'=> "D",
131 '[A/G/T]'=> "D",
132 '[A/T/G]'=> "D",
133 '[T/G/A]'=> "D",
134 '[G/T/A]'=> "D",
135 '[G/A/T]'=> "D",
136 );
137
138 my %snps_of_gene;
139 my %snps_of_gene2;
140 my %indiv_order;
141 my $indiv_list;
142 my %genotyping_infos;
143 my $num_line = 0;
144 my $genename_rank_in_snpeff = 4;
145
146 my $find_annotations = `grep -c 'EFF=' $input`;
147
148 open(my $HAPMAP,">$out.hapmap");
149 print $HAPMAP "rs# alleles chrom pos gene feature effect codon_change amino_acid_change MAF missing_data";
150 open(my $VCF,$input);
151 while(<$VCF>)
152 {
153 my $line = $_;
154 chomp($line);
155 my @infos = split(/\t/,$line);
156
157 if (/^##INFO=\<ID=EFF/ && /Amino_Acid_length \| Gene_Name \| Transcript_BioType \| Gene_Coding/)
158 {
159 $genename_rank_in_snpeff = 8;
160 }
161
162 if (scalar @infos > 9)
163 {
164 if (/#CHROM/)
165 {
166 for (my $j=9;$j<=$#infos;$j++)
167 {
168 my $individu = $infos[$j];
169 $indiv_list .= " $individu";
170 $indiv_order{$j} = $individu;
171 }
172 print $HAPMAP "$indiv_list\n";
173 }
174 elsif (!/^#/)
175 {
176 $num_line++;
177
178 my $chromosome = $infos[0];
179 my $chromosome_position = $infos[1];
180 my $ref_allele = $infos[3];
181 my $alt_allele = $infos[4];
182
183 if ($ref_allele =~/\w\w+/)
184 {
185 $ref_allele = "A";
186 $alt_allele = "T";
187 }
188 elsif ($alt_allele =~/\w\w+/)
189 {
190 $ref_allele = "T";
191 $alt_allele = "A";
192 }
193
194 my $info = $infos[7];
195 my $is_in_exon = "#";
196 my $is_synonyme = "#";
197 my $gene;
198 if ($find_annotations > 1)
199 {
200 $gene = "intergenic";
201 }
202 else
203 {
204 $gene = $chromosome;
205 }
206 my $modif_codon = "#";
207 my $modif_aa = "#";
208 my $geneposition;
209 if ($info =~/EFF=(.*)/)
210 {
211 my @annotations = split(",",$1);
212 foreach my $annotation(@annotations)
213 {
214 my ($syn, $additional) = split(/\(/,$annotation);
215
216 if ($syn =~/STREAM/){next;}
217
218 $is_in_exon = "exon";
219 if ($syn =~/UTR/)
220 {
221 $is_in_exon = $syn;
222 }
223 else
224 {
225 $is_synonyme = $syn;
226 }
227 my @infos_additional = split(/\|/,$additional);
228 $gene = $infos_additional[$genename_rank_in_snpeff];
229 $modif_codon = $infos_additional[2];
230 $modif_aa = $infos_additional[3];
231
232 if ($syn =~/INTERGENIC/)
233 {
234 $is_synonyme = "#";
235 $gene = "intergenic";
236 $is_in_exon = "#";
237 }
238 elsif ($syn =~/SYNONYM/)
239 {
240 $is_in_exon = "exon";
241 }
242 elsif ($syn =~/INTRON/ or $syn =~/SPLICE_SITE_DONOR/)
243 {
244 $is_in_exon = "intron";
245 $is_synonyme = "#";
246 }
247
248
249 if ($modif_aa =~/(\w)(\d+)$/)
250 {
251 $modif_aa = "$1/$1";
252 }
253 elsif ($modif_aa =~/(\w)(\d+)(\w)/)
254 {
255 $modif_aa = "$1/$3";
256 }
257 if ($infos_additional[8] =~/Exon/)
258 {
259 $is_in_exon = "exon";
260 }
261
262 if (!$modif_aa){$modif_aa="#";}
263 if (!$modif_codon){$modif_codon="#";}
264 }
265 }
266 $gene =~s/\.\d//g;
267 if ($ann{$gene}{"start"})
268 {
269 my $strand = $ann{$gene}{"strand"};
270 if ($strand eq '-')
271 {
272 $geneposition = $ann{$gene}{"stop"} - $chromosome_position;
273 }
274 else
275 {
276 $geneposition = $chromosome_position - $ann{$gene}{"start"};
277 }
278 }
279 #if ($info =~/GenePos=(\d+);/)
280 #{
281 # $geneposition = $1;
282 #}
283 my $ratio_missing_data;
284 my $snp_frequency;
285 my $genotyping = "";
286
287 if (2 > 1)
288 {
289
290 $genotyping_infos{"ref"} = "$ref_allele$ref_allele";
291
292 my %alleles_found;
293 my $nb_readable_ind = 0;
294 for (my $i = 9;$i <= $#infos;$i++)
295 {
296 my $dnasample = $indiv_order{$i};
297 my @infos_alleles = split(":",$infos[$i]);
298 my $genotype = $infos_alleles[0];
299 $genotype =~s/0/$ref_allele/g;
300
301 if ($alt_allele =~/,/)
302 {
303 my @alt_alleles = split(",",$alt_allele);
304 my $num_all = 1;
305 foreach my $alt_al(@alt_alleles)
306 {
307 $genotype =~s/$num_all/$alt_al/g;
308 $num_all++;
309 }
310 }
311 else
312 {
313 $genotype =~s/1/$alt_allele/g;
314 }
315 if ($genotype eq '.'){$genotype = "./.";}
316 $genotype =~s/\./N/g;
317 if ($genotype !~/N\/N/)
318 {
319 $nb_readable_ind++;
320 }
321 my @alleles;
322 if ($genotype =~/\//)
323 {
324 @alleles = split(/\//,$genotype);
325 }
326 else
327 {
328 @alleles = split(/\|/,$genotype);
329 }
330 $genotyping .= join("",@alleles) . " ";
331 $genotyping_infos{$dnasample} = join("",@alleles);
332
333 foreach my $al(@alleles)
334 {
335 if ($al ne 'N'){$alleles_found{$al}++;}
336 }
337 }
338 chop($genotyping);
339
340 $snp_frequency = 0;
341
342 my $max = 0;
343 my $min = 10000000;
344 my $total = 0;
345 foreach my $al(keys(%alleles_found))
346 {
347 my $nb = $alleles_found{$al};
348 $total+= $nb;
349 if ($nb > $max)
350 {
351 $max = $nb;
352 }
353 if ($nb < $min)
354 {
355 $min = $nb;
356 }
357 }
358 if ($total > 0)
359 {
360 $snp_frequency = sprintf("%.1f",($min/$total)*100);
361 }
362
363 $ratio_missing_data = 100 - ($nb_readable_ind / ($#infos - 8)) * 100;
364 $ratio_missing_data = sprintf("%.1f",$ratio_missing_data);
365
366 foreach my $dna(keys(%genotyping_infos))
367 {
368 $snps_of_gene{$gene}{$geneposition}{$dna} = $genotyping_infos{$dna};
369 }
370 }
371 my $snp_type = "[$ref_allele/$alt_allele]";
372 $snps_of_gene2{$chromosome}{$chromosome_position} = $snp_type;
373 #print $HAPMAP "$chromosome:$chromosome_position\t$snp_type\t$chromosome\t$chromosome_position\t$gene:$geneposition\t$is_in_exon\t$is_synonyme\t$modif_codon\t$modif_aa\t$snp_frequency%\t$nb_readable_ind\t$genotyping\n";
374 print $HAPMAP "$chromosome:$chromosome_position\t$ref_allele/$alt_allele\t$chromosome\t$chromosome_position\t$gene:$geneposition\t$is_in_exon\t$is_synonyme\t$modif_codon\t$modif_aa\t$snp_frequency%\t$ratio_missing_data%\t$genotyping\n";
375 }
376 }
377 }
378 close($VCF);
379 close($HAPMAP);
380
381
382 if (!$reference){exit;}
383
384 #################################################################
385 # generate flanking sequences for Illumina VeraCode technology
386 #################################################################
387 open(my $FLANKING,">$out.flanking.txt");
388 foreach my $seq(keys(%ref_sequences))
389 {
390 if ($snps_of_gene2{$seq})
391 {
392 my $refhash = $snps_of_gene2{$seq};
393 my %hashreal = %$refhash;
394
395 # create consensus
396 my $refseq = $ref_sequences{$seq};
397 my $consensus = "";
398 my $previous = 0;
399 foreach my $pos(sort {$a<=>$b} keys(%hashreal))
400 {
401 my $length = $pos - $previous - 1;
402 $consensus .= substr($refseq,$previous,$length);
403 my $iupac_code = $IUPAC{$snps_of_gene2{$seq}{$pos}};
404 $consensus .= $iupac_code;
405 $previous = $pos;
406 }
407 my $length = length($refseq) - $previous;
408 $consensus .= substr($refseq,$previous,$length);
409
410 foreach my $pos(sort {$a<=>$b} keys(%hashreal))
411 {
412 my $snp_name = "$seq-$pos";
413 my $flanking_length = 60;
414 my $length = $flanking_length;
415 my $start = $pos - $flanking_length - 1;
416 if ($pos <= $flanking_length)
417 {
418 $length = $pos - 1;
419 $start = 0;
420 }
421 my $sequence = substr($consensus,$start,$length);
422
423 $sequence .= $snps_of_gene2{$seq}{$pos};
424 $sequence .= substr($consensus,$pos,$flanking_length);
425 print $FLANKING "$snp_name,$sequence,0,0,0,Project_name,0,diploid,Other,Forward\n";
426 }
427 }
428 }
429
430 close($FLANKING);
431
432 if (!$gff){exit;}
433
434 my @individuals_list = split(/\t/,$indiv_list);
435 if ((scalar @individuals_list * scalar keys(%snps_of_gene)) > 800000)
436 {
437 print "Sorry, too many sequences to manage ...\n";
438 exit;
439 }
440
441 open(my $ALIGN_EGGLIB,">$out.gene_alignment.fas");
442 my %alignments_ind;
443 foreach my $seq(keys(%snps_of_gene))
444 {
445 if ($snps_of_gene{$seq})
446 {
447 my $refhash = $snps_of_gene{$seq};
448 my %hashreal = %$refhash;
449
450 # get flanking sequences
451 my %flanking5;
452 my $start = $ann{$seq}{"start"};
453 my $stop = $ann{$seq}{"stop"};
454 my $strand = $ann{$seq}{"strand"};
455 my $genelength = $stop - $start+1;
456 my $chr = $chr_of_gene{$seq};
457 my $refseq = substr($ref_sequences{$chr},$start-1,$genelength);
458 if ($strand eq '-')
459 {
460 $refseq =~ tr /atcgATCG/tagcTAGC/; $refseq = reverse($refseq);
461 }
462 #print "$seq $chr $start $stop $refseq \n";
463 my $previous = 0;
464 foreach my $pos(sort {$a<=>$b} keys(%hashreal))
465 {
466 my $length = $pos - $previous - 1;
467 $flanking5{$pos} = substr($refseq,$previous,$length);
468 $previous = $pos;
469 }
470 my $length = length($refseq) - $previous;
471 my $flanking3 = substr($refseq,$previous,$length);
472 foreach my $ind(@individuals_list)
473 {
474 my $nb_missing_data_for_this_individual = 0;
475 if ($ind)
476 {
477 my $alignment_for_ind = "";
478 my $seq_without_underscore = $seq;
479 $seq_without_underscore =~s/_//g;
480 $alignment_for_ind .= ">$seq_without_underscore" . "_$ind" . "_1\n";
481 foreach my $pos(sort {$a<=>$b} keys(%hashreal))
482 {
483 $alignment_for_ind .= $flanking5{$pos};
484 my $geno = $snps_of_gene{$seq}{$pos}{$ind};
485 $geno =~s/N/?/g;
486 if ($geno =~/\?/){$nb_missing_data_for_this_individual++;}
487 my @alleles = split("",$geno);
488 $alignment_for_ind .= $alleles[0];
489 if ($alleles[0] eq $alleles[1])
490 {
491 $alignments_ind{$ind} .= $alleles[1];
492 }
493 else
494 {
495 my $snp_type = "[" . $alleles[0] . "/" . $alleles[1] . "]";
496 $alignments_ind{$ind} .= $IUPAC{$snp_type};
497 }
498 }
499 $alignment_for_ind .= $flanking3;
500 $alignment_for_ind .= "\n";
501
502
503 $alignment_for_ind .= ">$seq_without_underscore" . "_$ind" . "_2\n";
504 foreach my $pos(sort {$a<=>$b} keys(%hashreal))
505 {
506 $alignment_for_ind .= $flanking5{$pos};
507 my $geno = $snps_of_gene{$seq}{$pos}{$ind};
508 $geno =~s/N/?/g;
509 my @alleles = split("",$geno);
510 $alignment_for_ind .= $alleles[1];
511 }
512 $alignment_for_ind .= $flanking3;
513 $alignment_for_ind .= "\n";
514 if (keys(%hashreal) != $nb_missing_data_for_this_individual)
515 {
516 print $ALIGN_EGGLIB $alignment_for_ind;
517 }
518 }
519 }
520 }
521 }
522 close($ALIGN_EGGLIB);
523
524
525