diff GetHaplotypesFromPhasedVCF/GetHaplotypesFromPhasedVCF.pl @ 6:ebb0ac9b6fa9 draft

planemo upload
author gandres
date Mon, 23 May 2016 17:49:17 -0400
parents
children 88748d846a20
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GetHaplotypesFromPhasedVCF/GetHaplotypesFromPhasedVCF.pl	Mon May 23 17:49:17 2016 -0400
@@ -0,0 +1,108 @@
+#!/usr/bin/perl
+
+use strict;
+
+my $vcf = $ARGV[0];
+my $out = $ARGV[1];
+
+open(O1,">$out.haplo.fas");
+open(O2,">$out.distinct_haplotypes.txt");
+
+my %indiv;
+my %genes;
+my $nb_cols = 0;
+my %phasing;
+open(my $V,$vcf);
+while(<$V>){
+	my $line = $_;
+	$line =~s/\n//g;
+	$line =~s/\r//g;
+	my @infos = split(/\t/,$line);
+	if (/#CHROM/){
+		for (my $i = 9; $i <= $#infos; $i++){
+			$indiv{$i} = $infos[$i];		
+		} 
+	}
+	else{
+		my $gene = $infos[0];
+		my $ref = $infos[3];
+		my $alt = $infos[4];
+		$nb_cols = $#infos;
+		for (my $i = 9; $i <= $#infos; $i++){
+			my ($alleles,$depth) = split(":",$infos[$i]);
+			$alleles =~s/0/$ref/g;
+			$alleles =~s/1/$alt/g;
+			my $ind = $indiv{$i};
+			#if ($alleles =~/\// && $genes{$gene}){next;}
+			$phasing{$gene}{$i}.= $alleles ." ";
+		}
+		$genes{$gene} = 1;
+	}
+}
+close($V);
+
+my %haplos;
+my $gene_ok = 0;
+my $gene_shared = 0;
+my $gene_all_shared = 0;
+my $nb_gene = 0;
+my %haplotypes2;
+foreach my $gene(keys(%phasing))
+{
+	my $list = "";
+	my $ok = 0;
+	my %haplotypes;
+	for (my $i=9;$i <= $nb_cols;$i++){
+		
+		my $alleles = $phasing{$gene}{$i};
+		if ($alleles eq "" or $alleles =~ /\./ or $alleles =~/ \w+\//){next;}
+		my @snps = split(" ",$alleles);
+		if (scalar @snps < 2){next;}
+		$alleles =~s/\//\|/g;	
+		my @al = split(" ",$alleles);
+		my $haplo1 = "";
+		my $haplo2 = "";
+		foreach my $a(@al){
+			my ($a1,$a2) = split(/\|/,$a);
+			$haplo1.= $a1;
+			$haplo2.= $a2;
+		}
+		$haplotypes{$haplo1}++;
+		$haplotypes{$haplo2}++;
+		my $ind = $indiv{$i};
+		$haplotypes2{$gene}{$haplo1}.=$ind."_1,";
+		$haplotypes2{$gene}{$haplo2}.=$ind."_2,";
+		$haplos{$gene}{$haplo1}++;
+		$haplos{$gene}{$haplo2}++;
+		$list .= ">".$gene."_".$ind."_1\n$haplo1\n";
+		$list .= ">".$gene."_".$ind."_2\n$haplo2\n";
+		$ok++;
+	}
+	#print "$gene $nb_cols $ok\n";
+	#if ($ok > 13 && $found_M1_M2 == 2){
+	if ($ok == ($nb_cols - 8)){
+		$nb_gene++;
+		print O1 $list;
+	}
+}
+foreach my $gene(sort {$a<=>$b} keys(%haplos)){
+	print O2 "===$gene===\n";
+	my $ref_hash = $haplos{$gene};
+	my %hash = %$ref_hash;
+	my $num_haplo = 0;
+	foreach my $haplo(keys(%hash)){
+		$num_haplo++;
+		my $haplo_name = "haplo".$num_haplo;
+		my $nb = $haplos{$gene}{$haplo};
+		my $ind = $haplotypes2{$gene}{$haplo};
+		print O2 $haplo_name.":$nb:".$haplotypes2{$gene}{$haplo}."\n".$haplo."\n"; 
+		if ($nb > 1){
+			#print "$nb \n";
+		}
+	}
+}
+
+close(O1);
+close(O1);
+#print scalar keys(%haplos);
+