view VCF2Hapmap/VCF2FastaAndHapmap.pl @ 12:88748d846a20 draft

planemo upload commit 11382afe87364aaafb19973470d5066229a6e34f
author dereeper
date Tue, 14 Aug 2018 08:21:55 -0400
parents 420b57c3c185
children
line wrap: on
line source


#!/usr/bin/perl

use strict;
use Getopt::Long;

my $usage = qq~Usage:$0 <args> [<opts>]

where <args> are:

    -v, --vcf          <VCF input>
    -o, --out          <Output basename>
      
<opts> are:

    -r, --reference    <Reference fasta file>
    -g, --gff          <GFF input file to create alignments of genes>
~;
$usage .= "\n";

my ($input,$out,$reference,$gff);



GetOptions(
	"vcf=s"          => \$input,
	"out=s"          => \$out,
	"reference=s"    => \$reference,
	"gff=s"          => \$gff
);


die $usage
  if ( !$input || !$out);
 
if ($gff && !$reference)
{
	die "You must provide a Fasta reference file when providing GFF annotation\n";
}

 
my %ref_sequences;  
if ($reference)
{
	my $id;
	my $sequence = "";
	open(my $R,$reference) or die "cannot open file: $reference";
	while(<$R>)
	{
		my $line =$_;
		$line =~s/\n//g;
		$line =~s/\r//g;
		if ($line =~ />([^\s]+)/){
			$ref_sequences{$id} = $sequence;
			$id=$1;$sequence="";
		}
		else
		{
			$sequence .= $line;
		}
	}
	close($R);
	$ref_sequences{$id} = $sequence;
}


my %chr_of_gene;
my %ann;
if ($gff)
{
	open(my $G,$gff) or die "cannot open file: $gff";
	while(<$G>)
	{
		my $line =$_;
		$line =~s/\n//g;
		$line =~s/\r//g;
		my @i = split(/\t/,$line);
		my $chr = $i[0];
		my $feature = $i[2];
		my $strand = $i[6];
		my $start = $i[3];
		my $stop = $i[4];
		my $inf = $i[8];
		if ($feature eq 'gene')
		{
			 if ($inf =~/Name=([\w\-\.]+)[;\s]*/){$inf = $1;}
			$ann{$inf}{"start"}=$start;
			$ann{$inf}{"stop"}=$stop;
			$ann{$inf}{"strand"}=$strand;
			$chr_of_gene{$inf} = $chr;
		}
	}
	close($G);
}



my %IUPAC =
(
        '[A/G]'=> "R",
        '[G/A]'=> "R",
        '[C/T]'=> "Y",
        '[T/C]'=> "Y",
        '[T/G]'=> "K",
        '[G/T]'=> "K",
        '[C/G]'=> "S",
        '[G/C]'=> "S",
        '[A/T]'=> "W",
        '[T/A]'=> "W",
        '[A/C]'=> "M",
        '[C/A]'=> "M",
        '[C/A/T]'=> "H",
        '[A/T/C]'=> "H",
        '[A/C/T]'=> "H",
        '[C/T/A]'=> "H",
        '[T/C/A]'=> "H",
        '[T/A/C]'=> "H",
        '[C/A/G]'=> "V",
        '[A/G/C]'=> "V",
        '[A/C/G]'=> "V",
        '[C/G/A]'=> "V",
        '[G/C/A]'=> "V",
        '[G/A/C]'=> "V",
        '[C/T/G]'=> "B",
        '[T/G/C]'=> "B",
        '[T/C/G]'=> "B",
        '[C/G/T]'=> "B",
        '[G/C/T]'=> "B",
        '[G/T/C]'=> "B",
        '[T/A/G]'=> "D",
        '[A/G/T]'=> "D",
        '[A/T/G]'=> "D",
        '[T/G/A]'=> "D",
        '[G/T/A]'=> "D",
        '[G/A/T]'=> "D",
);

my %snps_of_gene;
my %snps_of_gene2;
my %indiv_order;
my $indiv_list;
my %genotyping_infos;
my $num_line = 0;
my $genename_rank_in_snpeff = 4;

my $find_annotations = `grep -c 'EFF=' $input`;

open(my $HAPMAP,">$out.hapmap");
print $HAPMAP "rs#	alleles	chrom	pos	gene	feature	effect	codon_change	amino_acid_change	MAF	missing_data";
open(my $VCF,$input);
while(<$VCF>)
{
	my $line = $_;
	chomp($line);
	my @infos = split(/\t/,$line);
	
	if (/^##INFO=\<ID=EFF/ && /Amino_Acid_length \| Gene_Name \| Transcript_BioType \| Gene_Coding/)
	{
		$genename_rank_in_snpeff = 8;
	}

	if (scalar @infos > 9)
	{
		if (/#CHROM/)
		{
			for (my $j=9;$j<=$#infos;$j++)
			{
				my $individu = $infos[$j];
				$indiv_list .= "	$individu";
				$indiv_order{$j} = $individu;
			}
			print $HAPMAP "$indiv_list\n";
		}
		elsif (!/^#/)
		{
			$num_line++;

			my $chromosome = $infos[0];
			my $chromosome_position = $infos[1];
			my $ref_allele = $infos[3];
			my $alt_allele = $infos[4];
                    	
			if ($ref_allele =~/\w\w+/)
			{
				$ref_allele = "A";
				$alt_allele = "T";
			}
			elsif ($alt_allele =~/\w\w+/)
			{
				$ref_allele = "T";
				$alt_allele = "A";
			}
			
			my $info = $infos[7];
			my $is_in_exon = "#";
			my $is_synonyme = "#";
			my $gene;
			if ($find_annotations > 1)
			{
				$gene = "intergenic";
			}
			else
			{
				$gene = $chromosome;
			}
			my $modif_codon = "#";
			my $modif_aa = "#";
			my $geneposition;
			if ($info =~/EFF=(.*)/)
			{
					my @annotations = split(",",$1);
					foreach my $annotation(@annotations)
					{
						my ($syn, $additional) = split(/\(/,$annotation);

						if ($syn =~/STREAM/){next;}

						$is_in_exon = "exon";
						if ($syn =~/UTR/)
						{
							$is_in_exon = $syn;
						}
						else
						{
							$is_synonyme = $syn;
						}
						my @infos_additional = split(/\|/,$additional);
						$gene = $infos_additional[$genename_rank_in_snpeff];
						$modif_codon = $infos_additional[2];
						$modif_aa = $infos_additional[3];
						
						if ($syn =~/INTERGENIC/)
						{
							$is_synonyme = "#";
							$gene = "intergenic";
							$is_in_exon = "#";
						}
						elsif ($syn =~/SYNONYM/)
						{
							$is_in_exon = "exon";
						}
						elsif ($syn =~/INTRON/ or $syn =~/SPLICE_SITE_DONOR/)
						{
							$is_in_exon = "intron";
							$is_synonyme = "#";
						}
						
						
						if ($modif_aa =~/(\w)(\d+)$/)
						{
							$modif_aa = "$1/$1";
						}
						elsif ($modif_aa =~/(\w)(\d+)(\w)/)
						{
							$modif_aa = "$1/$3";
						}
						if ($infos_additional[8] =~/Exon/)
						{
							$is_in_exon = "exon";
						}
						
						if (!$modif_aa){$modif_aa="#";}
						if (!$modif_codon){$modif_codon="#";}
					}
			}
			$gene =~s/\.\d//g;
			if ($ann{$gene}{"start"})
			{
				my $strand = $ann{$gene}{"strand"};
				if ($strand eq '-')
				{
					$geneposition = $ann{$gene}{"stop"} - $chromosome_position;
				}
				else
				{
					$geneposition = $chromosome_position - $ann{$gene}{"start"};
				}
			}
			#if ($info =~/GenePos=(\d+);/)
			#{
			#	$geneposition = $1;
			#}
			my $ratio_missing_data;
			my $snp_frequency;
			my $genotyping = "";
			
			if (2 > 1)
			{
				
				$genotyping_infos{"ref"} = "$ref_allele$ref_allele";
				
				my %alleles_found;
				my $nb_readable_ind = 0;
				for (my $i = 9;$i <= $#infos;$i++)
				{
					my $dnasample = $indiv_order{$i};
					my @infos_alleles = split(":",$infos[$i]);
					my $genotype = $infos_alleles[0];
					$genotype =~s/0/$ref_allele/g;

					if ($alt_allele =~/,/)
					{
						my @alt_alleles = split(",",$alt_allele);
						my $num_all = 1;
						foreach my $alt_al(@alt_alleles)
						{
							$genotype =~s/$num_all/$alt_al/g;
							$num_all++;
						}
					}
					else
					{
						$genotype =~s/1/$alt_allele/g;
					}
					if ($genotype eq '.'){$genotype = "./.";}
					$genotype =~s/\./N/g;
					if ($genotype !~/N\/N/)
					{
						$nb_readable_ind++;
					}
					my @alleles;
					if ($genotype =~/\//)
					{
						@alleles = split(/\//,$genotype);
					}
					else
					{
						@alleles = split(/\|/,$genotype);
					}
					$genotyping .= join("",@alleles) . "	";
					$genotyping_infos{$dnasample} = join("",@alleles);
					
					foreach my $al(@alleles)
					{
						if ($al ne 'N'){$alleles_found{$al}++;}
					}	
				}
				chop($genotyping);
				
				$snp_frequency = 0;
					
				my $max = 0;
				my $min = 10000000;
				my $total = 0;
				foreach my $al(keys(%alleles_found))
				{
					my $nb = $alleles_found{$al};
					$total+= $nb;
					if ($nb > $max)
					{
						$max = $nb;
					}
					if ($nb < $min)
					{
						$min = $nb;
					}
				}
				if ($total > 0)
				{
					$snp_frequency = sprintf("%.1f",($min/$total)*100);
				}
				
				$ratio_missing_data = 100 - ($nb_readable_ind / ($#infos - 8)) * 100;
				$ratio_missing_data = sprintf("%.1f",$ratio_missing_data);

				foreach my $dna(keys(%genotyping_infos))
				{
					$snps_of_gene{$gene}{$geneposition}{$dna} = $genotyping_infos{$dna};
				}
			}
			my $snp_type = "[$ref_allele/$alt_allele]";
			$snps_of_gene2{$chromosome}{$chromosome_position} = $snp_type;
			#print $HAPMAP "$chromosome:$chromosome_position\t$snp_type\t$chromosome\t$chromosome_position\t$gene:$geneposition\t$is_in_exon\t$is_synonyme\t$modif_codon\t$modif_aa\t$snp_frequency%\t$nb_readable_ind\t$genotyping\n";
			print $HAPMAP "$chromosome:$chromosome_position\t$ref_allele/$alt_allele\t$chromosome\t$chromosome_position\t$gene:$geneposition\t$is_in_exon\t$is_synonyme\t$modif_codon\t$modif_aa\t$snp_frequency%\t$ratio_missing_data%\t$genotyping\n";
		}
	}
}
close($VCF);
close($HAPMAP);


if (!$reference){exit;}

#################################################################
# generate flanking sequences for Illumina VeraCode technology
#################################################################
open(my $FLANKING,">$out.flanking.txt");
foreach my $seq(keys(%ref_sequences))
{
                        if ($snps_of_gene2{$seq})
                        {
                                my $refhash = $snps_of_gene2{$seq};
                                my %hashreal = %$refhash;

                                # create consensus
                                my $refseq = $ref_sequences{$seq};
                                my $consensus = "";
                                my $previous = 0;
                                foreach my $pos(sort {$a<=>$b} keys(%hashreal))
                                {
                                        my $length = $pos - $previous - 1;
                                        $consensus .= substr($refseq,$previous,$length);
                                        my $iupac_code = $IUPAC{$snps_of_gene2{$seq}{$pos}};
                                        $consensus .= $iupac_code;
                                        $previous = $pos;
                                }
                                my $length = length($refseq) - $previous;
                                $consensus .= substr($refseq,$previous,$length);

                                foreach my $pos(sort {$a<=>$b} keys(%hashreal))
                                {
                                        my $snp_name = "$seq-$pos";
                                        my $flanking_length = 60;
                                        my $length = $flanking_length;
                                        my $start = $pos - $flanking_length - 1;
                                        if ($pos <= $flanking_length)
                                        {
                                                $length = $pos - 1;
                                                $start = 0;
                                        }
                                        my $sequence = substr($consensus,$start,$length);
				
                                        $sequence .= $snps_of_gene2{$seq}{$pos};
                                        $sequence .= substr($consensus,$pos,$flanking_length);
                                        print $FLANKING "$snp_name,$sequence,0,0,0,Project_name,0,diploid,Other,Forward\n";
                                }
                        }
	}

	close($FLANKING);

	if (!$gff){exit;}

	my @individuals_list = split(/\t/,$indiv_list);
	if ((scalar @individuals_list * scalar keys(%snps_of_gene)) > 800000)
	{
		print "Sorry, too many sequences to manage ...\n";
		exit;
	}

	open(my $ALIGN_EGGLIB,">$out.gene_alignment.fas");
	my %alignments_ind;
	foreach my $seq(keys(%snps_of_gene))
	{
		if ($snps_of_gene{$seq})
		{
			my $refhash = $snps_of_gene{$seq};
			my %hashreal = %$refhash;

			# get flanking sequences
			my %flanking5;
			my $start = $ann{$seq}{"start"};
			my $stop = $ann{$seq}{"stop"};
			my $strand = $ann{$seq}{"strand"};
			my $genelength = $stop - $start+1;
			my $chr = $chr_of_gene{$seq};
		my $refseq = substr($ref_sequences{$chr},$start-1,$genelength);
		if ($strand eq '-')
		{
			$refseq =~ tr /atcgATCG/tagcTAGC/; $refseq = reverse($refseq);
		}	
		#print "$seq $chr $start $stop $refseq \n";
		my $previous = 0;
		foreach my $pos(sort {$a<=>$b} keys(%hashreal))
		{
			my $length = $pos - $previous - 1;
			$flanking5{$pos} = substr($refseq,$previous,$length);
			$previous = $pos;
		}
		my $length = length($refseq) - $previous;
		my $flanking3 = substr($refseq,$previous,$length);
		foreach my $ind(@individuals_list)
		{
			my $nb_missing_data_for_this_individual = 0;
			if ($ind)
			{
                                                my $alignment_for_ind = "";
                                                my $seq_without_underscore = $seq;
                                                $seq_without_underscore =~s/_//g;
                                                $alignment_for_ind .= ">$seq_without_underscore" . "_$ind" . "_1\n";
                                                foreach my $pos(sort {$a<=>$b} keys(%hashreal))
                                                {
                                                        $alignment_for_ind .= $flanking5{$pos};
                                                        my $geno = $snps_of_gene{$seq}{$pos}{$ind};
                                                        $geno =~s/N/?/g;
                                                        if ($geno =~/\?/){$nb_missing_data_for_this_individual++;}
                                                        my @alleles = split("",$geno);
                                                        $alignment_for_ind .= $alleles[0];
                                                        if ($alleles[0] eq $alleles[1])
                                                        {
                                                                $alignments_ind{$ind} .= $alleles[1];
                                                        }
                                                        else
                                                        {
                                                                my $snp_type = "[" . $alleles[0] . "/" . $alleles[1] . "]";
                                                                $alignments_ind{$ind} .= $IUPAC{$snp_type};
                                                        }
                                                }
                                                $alignment_for_ind .= $flanking3;
						$alignment_for_ind .= "\n";
			
			
                                                $alignment_for_ind .= ">$seq_without_underscore" . "_$ind" . "_2\n";
                                                foreach my $pos(sort {$a<=>$b} keys(%hashreal))
                                                {
                                                        $alignment_for_ind .= $flanking5{$pos};
                                                        my $geno = $snps_of_gene{$seq}{$pos}{$ind};
                                                        $geno =~s/N/?/g;
                                                        my @alleles = split("",$geno);
                                                        $alignment_for_ind .= $alleles[1];
                                                }
                                                $alignment_for_ind .= $flanking3;
						$alignment_for_ind .= "\n";
                                                if (keys(%hashreal) != $nb_missing_data_for_this_individual)
                                                {
                                                        print $ALIGN_EGGLIB $alignment_for_ind;
                                                }
			}
		}
	}
}
close($ALIGN_EGGLIB);