annotate GetHaplotypesFromPhasedVCF/GetHaplotypesFromPhasedVCF.pl @ 13:734a3572c1d6 draft

Uploaded
author dereeper
date Tue, 08 Jan 2019 08:45:34 -0500
parents 88748d846a20
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
1 #!/usr/bin/perl
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
2
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
3 use strict;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
4
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
5 my $vcf = $ARGV[0];
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
6 my $out = $ARGV[1];
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
7
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
8 open(O1,">$out.haplo.fas");
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
9 open(O2,">$out.distinct_haplotypes.txt");
12
88748d846a20 planemo upload commit 11382afe87364aaafb19973470d5066229a6e34f
dereeper
parents: 6
diff changeset
10 open(O3,">$out.distinct_haplotypes.fa");
6
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
11
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
12 my %indiv;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
13 my %genes;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
14 my $nb_cols = 0;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
15 my %phasing;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
16 open(my $V,$vcf);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
17 while(<$V>){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
18 my $line = $_;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
19 $line =~s/\n//g;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
20 $line =~s/\r//g;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
21 my @infos = split(/\t/,$line);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
22 if (/#CHROM/){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
23 for (my $i = 9; $i <= $#infos; $i++){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
24 $indiv{$i} = $infos[$i];
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
25 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
26 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
27 else{
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
28 my $gene = $infos[0];
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
29 my $ref = $infos[3];
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
30 my $alt = $infos[4];
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
31 $nb_cols = $#infos;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
32 for (my $i = 9; $i <= $#infos; $i++){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
33 my ($alleles,$depth) = split(":",$infos[$i]);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
34 $alleles =~s/0/$ref/g;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
35 $alleles =~s/1/$alt/g;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
36 my $ind = $indiv{$i};
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
37 #if ($alleles =~/\// && $genes{$gene}){next;}
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
38 $phasing{$gene}{$i}.= $alleles ." ";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
39 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
40 $genes{$gene} = 1;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
41 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
42 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
43 close($V);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
44
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
45 my %haplos;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
46 my $gene_ok = 0;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
47 my $gene_shared = 0;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
48 my $gene_all_shared = 0;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
49 my $nb_gene = 0;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
50 my %haplotypes2;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
51 foreach my $gene(keys(%phasing))
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
52 {
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
53 my $list = "";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
54 my $ok = 0;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
55 my %haplotypes;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
56 for (my $i=9;$i <= $nb_cols;$i++){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
57
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
58 my $alleles = $phasing{$gene}{$i};
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
59 if ($alleles eq "" or $alleles =~ /\./ or $alleles =~/ \w+\//){next;}
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
60 my @snps = split(" ",$alleles);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
61 if (scalar @snps < 2){next;}
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
62 $alleles =~s/\//\|/g;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
63 my @al = split(" ",$alleles);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
64 my $haplo1 = "";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
65 my $haplo2 = "";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
66 foreach my $a(@al){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
67 my ($a1,$a2) = split(/\|/,$a);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
68 $haplo1.= $a1;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
69 $haplo2.= $a2;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
70 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
71 $haplotypes{$haplo1}++;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
72 $haplotypes{$haplo2}++;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
73 my $ind = $indiv{$i};
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
74 $haplotypes2{$gene}{$haplo1}.=$ind."_1,";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
75 $haplotypes2{$gene}{$haplo2}.=$ind."_2,";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
76 $haplos{$gene}{$haplo1}++;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
77 $haplos{$gene}{$haplo2}++;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
78 $list .= ">".$gene."_".$ind."_1\n$haplo1\n";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
79 $list .= ">".$gene."_".$ind."_2\n$haplo2\n";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
80 $ok++;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
81 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
82 #print "$gene $nb_cols $ok\n";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
83 #if ($ok > 13 && $found_M1_M2 == 2){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
84 if ($ok == ($nb_cols - 8)){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
85 $nb_gene++;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
86 print O1 $list;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
87 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
88 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
89 foreach my $gene(sort {$a<=>$b} keys(%haplos)){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
90 print O2 "===$gene===\n";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
91 my $ref_hash = $haplos{$gene};
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
92 my %hash = %$ref_hash;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
93 my $num_haplo = 0;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
94 foreach my $haplo(keys(%hash)){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
95 $num_haplo++;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
96 my $haplo_name = "haplo".$num_haplo;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
97 my $nb = $haplos{$gene}{$haplo};
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
98 my $ind = $haplotypes2{$gene}{$haplo};
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
99 print O2 $haplo_name.":$nb:".$haplotypes2{$gene}{$haplo}."\n".$haplo."\n";
12
88748d846a20 planemo upload commit 11382afe87364aaafb19973470d5066229a6e34f
dereeper
parents: 6
diff changeset
100 if ($nb >= 1){
88748d846a20 planemo upload commit 11382afe87364aaafb19973470d5066229a6e34f
dereeper
parents: 6
diff changeset
101 print O3 ">".$haplo_name."|$nb\n";
88748d846a20 planemo upload commit 11382afe87364aaafb19973470d5066229a6e34f
dereeper
parents: 6
diff changeset
102 print O3 $haplo."\n";
6
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
103 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
104 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
105 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
106
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
107 close(O1);
12
88748d846a20 planemo upload commit 11382afe87364aaafb19973470d5066229a6e34f
dereeper
parents: 6
diff changeset
108 close(O2);
88748d846a20 planemo upload commit 11382afe87364aaafb19973470d5066229a6e34f
dereeper
parents: 6
diff changeset
109 close(O3);
6
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
110 #print scalar keys(%haplos);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
111