Mercurial > repos > dereeper > pangenome_explorer
comparison PanExplorer_workflow/Perl/Naegleria/mergeSNP.pl @ 1:032f6b3806a3 draft
Uploaded
| author | dereeper |
|---|---|
| date | Thu, 30 May 2024 11:16:08 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:3cbb01081cde | 1:032f6b3806a3 |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 | |
| 3 use strict; | |
| 4 | |
| 5 my $nb_file_found = 0; | |
| 6 open(LS,"ls *Nfowleri.snp |"); | |
| 7 while(<LS>){ | |
| 8 $nb_file_found++; | |
| 9 } | |
| 10 close(LS); | |
| 11 if ($nb_file_found < 1){ | |
| 12 print "No file found with extension \".Nfowleri.snp\". This script allows to merge multiple SNP files (named as follows \"strainname.Nfowleri.snp\") obtained with VarScan into a global VCF file.\n"; | |
| 13 exit; | |
| 14 } | |
| 15 | |
| 16 | |
| 17 print "##fileformat=VCFv4.1\n"; | |
| 18 print "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT "; | |
| 19 | |
| 20 my %corr; | |
| 21 open(F,"/media/results_datacalcul/Alexis/Naegleria/raw/SRR.info.xls"); | |
| 22 while(<F>){ | |
| 23 my $line = $_; | |
| 24 $line =~s/\n//g;$line =~s/\r//g; | |
| 25 my ($srr,$name) = split(/\t/,$line); | |
| 26 #$corr{$srr} = $name; | |
| 27 } | |
| 28 close(F); | |
| 29 | |
| 30 my %str; | |
| 31 my @strains; | |
| 32 my %hash; | |
| 33 my %genotypes; | |
| 34 my %alternates; | |
| 35 my %reference_bases; | |
| 36 open(LS,"ls *Nfowleri.snp |"); | |
| 37 #open(LS,"ls *Nlovaniensis.snp |"); | |
| 38 #open(LS,"ls ../../tuberculosis/VCF_comparaison_avec_reference/*vcf |"); | |
| 39 while(<LS>){ | |
| 40 my $file = $_; | |
| 41 $file =~s/\n//g;$file =~s/\r//g; | |
| 42 my $strain; | |
| 43 if ($file =~/^(.*).Nfowleri.snp/){ | |
| 44 #if ($file =~/^(.*).Nlovaniensis.snp/){ | |
| 45 #if ($file =~/\/([^\/]*)\.vcf/){ | |
| 46 $strain = $1; | |
| 47 $strain =~s/_/-/g; | |
| 48 #if ($strain =~/SRR12781212/ or $strain =~/SRR12781213/ or $strain =~/SRR12781214/ or $strain =~/SRR12781215/ or $strain =~/SRR12781216/){next;} | |
| 49 #if ($strain !~/NF/){next;} | |
| 50 push(@strains,$strain); | |
| 51 $str{$strain} = 1; | |
| 52 #open(F,"$strain.Klebsiella_pneumoniae_subsp._pneumoniae_HS11286.snp"); | |
| 53 open(F,$file); | |
| 54 <F>; | |
| 55 while(<F>){ | |
| 56 my $line = $_; | |
| 57 $line =~s/\n//g;$line =~s/\r//g; | |
| 58 if (/^#/){next;} | |
| 59 my ($chr,$pos,$ref,$alt,$info) = split("\t",$line); | |
| 60 my ($Cons,$Cov,$Reads1,$Reads2,$Freq) = split(":",$info); | |
| 61 $Freq =~s/%//g;$Freq =~s/,/./g; | |
| 62 $chr =~s/\.1//g; | |
| 63 $alternates{$chr}{$pos}{$alt}=1; | |
| 64 $reference_bases{$chr}{$pos} = $ref; | |
| 65 if ($Cov > 6 && $Freq > 70){ | |
| 66 $hash{$chr}{$pos}{$strain} = "1/1"; | |
| 67 } | |
| 68 elsif ($Cov > 6 && $Freq > 34){ | |
| 69 $hash{$chr}{$pos}{$strain} = "0/1"; | |
| 70 } | |
| 71 else{ | |
| 72 #$hash{$chr}{$pos}{$strain} = "0/0"; | |
| 73 } | |
| 74 } | |
| 75 close(F); | |
| 76 } | |
| 77 } | |
| 78 close(LS); | |
| 79 | |
| 80 my %depths; | |
| 81 open(LS,"ls *.depth |"); | |
| 82 while(<LS>){ | |
| 83 my $file = $_; | |
| 84 $file =~s/\n//g;$file =~s/\r//g; | |
| 85 my $strain; | |
| 86 if ($file =~/^(.*).Nfowleri.depth/){ | |
| 87 #if ($file =~/^(.*).Nlovaniensis.depth/){ | |
| 88 $strain = $1; | |
| 89 $strain =~s/_/-/g; | |
| 90 if (!$str{$strain}){next;} | |
| 91 #if ($strain !~/NF/){next;} | |
| 92 open(F,$file); | |
| 93 while(<F>){ | |
| 94 my $line = $_; | |
| 95 $line =~s/\n//g;$line =~s/\r//g; | |
| 96 my ($chr,$pos,$depth) = split("\t",$line); | |
| 97 $chr =~s/\.1//g; | |
| 98 if ($hash{$chr}{$pos}){ | |
| 99 $depths{$chr}{$pos}{$strain}=1; | |
| 100 } | |
| 101 } | |
| 102 close(F); | |
| 103 } | |
| 104 } | |
| 105 close(LS); | |
| 106 | |
| 107 foreach my $strain(@strains){ | |
| 108 if ($corr{$strain}){ | |
| 109 $strain= $corr{$strain}; | |
| 110 } | |
| 111 print "$strain\t"; | |
| 112 } | |
| 113 print "Reference\n"; | |
| 114 foreach my $chr(sort {$a<=>$b} keys(%hash)){ | |
| 115 my $refsubhash = $hash{$chr}; | |
| 116 my %subhash2 = %$refsubhash; | |
| 117 foreach my $pos(sort {$a<=>$b} keys(%subhash2)){ | |
| 118 my $refbase = $reference_bases{$chr}{$pos}; | |
| 119 my $ref_alt = $alternates{$chr}{$pos}; | |
| 120 my %subhash_alt = %$ref_alt; | |
| 121 my $alternate; | |
| 122 | |
| 123 # discard multiallelic | |
| 124 if (scalar keys(%subhash_alt) > 1){ | |
| 125 next; | |
| 126 } | |
| 127 else{ | |
| 128 foreach my $alt(keys(%subhash_alt)){ | |
| 129 $alternate = $alt; | |
| 130 } | |
| 131 } | |
| 132 | |
| 133 print "$chr $pos . $refbase $alternate 186.96 . PASS GT:AD:DP:GQ:PL"; | |
| 134 foreach my $strain(@strains){ | |
| 135 my $val = $hash{$chr}{$pos}{$strain}; | |
| 136 if ($val){ | |
| 137 print " $val"; | |
| 138 } | |
| 139 else{ | |
| 140 if ($depths{$chr}{$pos}{$strain}){ | |
| 141 print " 0/0"; | |
| 142 } | |
| 143 else{ | |
| 144 print " ./."; | |
| 145 } | |
| 146 } | |
| 147 | |
| 148 } | |
| 149 print " 0/0\n"; | |
| 150 } | |
| 151 } | |
| 152 | |
| 153 |
