6
|
1 #!/usr/bin/perl
|
|
2
|
|
3 use strict;
|
|
4
|
|
5 my $vcf = $ARGV[0];
|
|
6 my $out = $ARGV[1];
|
|
7
|
|
8 open(O1,">$out.haplo.fas");
|
|
9 open(O2,">$out.distinct_haplotypes.txt");
|
|
10
|
|
11 my %indiv;
|
|
12 my %genes;
|
|
13 my $nb_cols = 0;
|
|
14 my %phasing;
|
|
15 open(my $V,$vcf);
|
|
16 while(<$V>){
|
|
17 my $line = $_;
|
|
18 $line =~s/\n//g;
|
|
19 $line =~s/\r//g;
|
|
20 my @infos = split(/\t/,$line);
|
|
21 if (/#CHROM/){
|
|
22 for (my $i = 9; $i <= $#infos; $i++){
|
|
23 $indiv{$i} = $infos[$i];
|
|
24 }
|
|
25 }
|
|
26 else{
|
|
27 my $gene = $infos[0];
|
|
28 my $ref = $infos[3];
|
|
29 my $alt = $infos[4];
|
|
30 $nb_cols = $#infos;
|
|
31 for (my $i = 9; $i <= $#infos; $i++){
|
|
32 my ($alleles,$depth) = split(":",$infos[$i]);
|
|
33 $alleles =~s/0/$ref/g;
|
|
34 $alleles =~s/1/$alt/g;
|
|
35 my $ind = $indiv{$i};
|
|
36 #if ($alleles =~/\// && $genes{$gene}){next;}
|
|
37 $phasing{$gene}{$i}.= $alleles ." ";
|
|
38 }
|
|
39 $genes{$gene} = 1;
|
|
40 }
|
|
41 }
|
|
42 close($V);
|
|
43
|
|
44 my %haplos;
|
|
45 my $gene_ok = 0;
|
|
46 my $gene_shared = 0;
|
|
47 my $gene_all_shared = 0;
|
|
48 my $nb_gene = 0;
|
|
49 my %haplotypes2;
|
|
50 foreach my $gene(keys(%phasing))
|
|
51 {
|
|
52 my $list = "";
|
|
53 my $ok = 0;
|
|
54 my %haplotypes;
|
|
55 for (my $i=9;$i <= $nb_cols;$i++){
|
|
56
|
|
57 my $alleles = $phasing{$gene}{$i};
|
|
58 if ($alleles eq "" or $alleles =~ /\./ or $alleles =~/ \w+\//){next;}
|
|
59 my @snps = split(" ",$alleles);
|
|
60 if (scalar @snps < 2){next;}
|
|
61 $alleles =~s/\//\|/g;
|
|
62 my @al = split(" ",$alleles);
|
|
63 my $haplo1 = "";
|
|
64 my $haplo2 = "";
|
|
65 foreach my $a(@al){
|
|
66 my ($a1,$a2) = split(/\|/,$a);
|
|
67 $haplo1.= $a1;
|
|
68 $haplo2.= $a2;
|
|
69 }
|
|
70 $haplotypes{$haplo1}++;
|
|
71 $haplotypes{$haplo2}++;
|
|
72 my $ind = $indiv{$i};
|
|
73 $haplotypes2{$gene}{$haplo1}.=$ind."_1,";
|
|
74 $haplotypes2{$gene}{$haplo2}.=$ind."_2,";
|
|
75 $haplos{$gene}{$haplo1}++;
|
|
76 $haplos{$gene}{$haplo2}++;
|
|
77 $list .= ">".$gene."_".$ind."_1\n$haplo1\n";
|
|
78 $list .= ">".$gene."_".$ind."_2\n$haplo2\n";
|
|
79 $ok++;
|
|
80 }
|
|
81 #print "$gene $nb_cols $ok\n";
|
|
82 #if ($ok > 13 && $found_M1_M2 == 2){
|
|
83 if ($ok == ($nb_cols - 8)){
|
|
84 $nb_gene++;
|
|
85 print O1 $list;
|
|
86 }
|
|
87 }
|
|
88 foreach my $gene(sort {$a<=>$b} keys(%haplos)){
|
|
89 print O2 "===$gene===\n";
|
|
90 my $ref_hash = $haplos{$gene};
|
|
91 my %hash = %$ref_hash;
|
|
92 my $num_haplo = 0;
|
|
93 foreach my $haplo(keys(%hash)){
|
|
94 $num_haplo++;
|
|
95 my $haplo_name = "haplo".$num_haplo;
|
|
96 my $nb = $haplos{$gene}{$haplo};
|
|
97 my $ind = $haplotypes2{$gene}{$haplo};
|
|
98 print O2 $haplo_name.":$nb:".$haplotypes2{$gene}{$haplo}."\n".$haplo."\n";
|
|
99 if ($nb > 1){
|
|
100 #print "$nb \n";
|
|
101 }
|
|
102 }
|
|
103 }
|
|
104
|
|
105 close(O1);
|
|
106 close(O1);
|
|
107 #print scalar keys(%haplos);
|
|
108
|