Mercurial > repos > dereeper > pangenome_explorer
comparison PanExplorer_workflow/Perl/GenerateHeatmapFromPAV.pl @ 1:032f6b3806a3 draft
Uploaded
| author | dereeper |
|---|---|
| date | Thu, 30 May 2024 11:16:08 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:3cbb01081cde | 1:032f6b3806a3 |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 | |
| 3 use strict; | |
| 4 use File::Basename; | |
| 5 my $dirname = dirname(__FILE__); | |
| 6 | |
| 7 my $infile = $ARGV[0]; | |
| 8 my $heatmap = $ARGV[1]; | |
| 9 | |
| 10 my %core_cluster; | |
| 11 my %samples; | |
| 12 my %sequences; | |
| 13 my %matrix; | |
| 14 my %hash; | |
| 15 my $cl_num = 0; | |
| 16 my $nb_strains = 1; | |
| 17 open(M,">$heatmap.accessory_01matrix.txt"); | |
| 18 open(PAN,">$heatmap.pangenes_01matrix.txt"); | |
| 19 open(F,"$infile"); | |
| 20 my $firstline = <F>; | |
| 21 $firstline =~s/\n//g;$firstline =~s/\r//g; | |
| 22 #my @infos = split(/","/,$firstline); | |
| 23 my @infos = split(/\t/,$firstline); | |
| 24 print M "Gene"; | |
| 25 print PAN "Gene"; | |
| 26 #for (my $j=14; $j <= $#infos; $j++){ | |
| 27 for (my $j=1; $j <= $#infos; $j++){ | |
| 28 my $gbfile = $infos[$j]; | |
| 29 $gbfile =~s/\"//g; | |
| 30 $gbfile =~s/\.gb\.filt//g; | |
| 31 | |
| 32 my $strain = $gbfile; | |
| 33 my @words = split(/_/,$strain); | |
| 34 my $genus = $words[0]; | |
| 35 my $species = $words[1]; | |
| 36 my $shortname = substr($genus,0,3) . "_". substr($species,0,2); | |
| 37 for (my $j = 2; $j <= $#words; $j++){ | |
| 38 $shortname.="_".$words[$j]; | |
| 39 } | |
| 40 #$shortname = substr($shortname,0,25); | |
| 41 | |
| 42 print M "\t".$strain; | |
| 43 print PAN "\t".$strain; | |
| 44 $samples{$j} = $strain; | |
| 45 $nb_strains++; | |
| 46 } | |
| 47 print M "\n"; | |
| 48 print PAN "\n"; | |
| 49 while(<F>){ | |
| 50 $cl_num++; | |
| 51 my $line = $_; | |
| 52 $line =~s/\n//g;$line =~s/\r//g; | |
| 53 #my @infos = split(/","/,$line); | |
| 54 my @infos = split(/\t/,$line); | |
| 55 my $concat_accessory = ""; | |
| 56 #for (my $i = 14; $i <= $#infos; $i++){ | |
| 57 for (my $i = 1; $i <= $#infos; $i++){ | |
| 58 my $val = $infos[$i]; | |
| 59 my $sample = $samples{$i}; | |
| 60 $matrix{$sample}{$cl_num} = $val; | |
| 61 $val =~s/\"//g; | |
| 62 if ($val =~/\w+/){ | |
| 63 $concat_accessory .= "\t1"; | |
| 64 $hash{$sample}{$cl_num} = 1; | |
| 65 $sequences{$sample}.= "T"; | |
| 66 } | |
| 67 else{ | |
| 68 $concat_accessory .= "\t0"; | |
| 69 $hash{$sample}{$cl_num} = 0; | |
| 70 $sequences{$sample}.= "A"; | |
| 71 } | |
| 72 } | |
| 73 if ($concat_accessory =~/0/){ | |
| 74 print M $cl_num.$concat_accessory."\n"; | |
| 75 } | |
| 76 else{ | |
| 77 $core_cluster{$cl_num}=1; | |
| 78 } | |
| 79 print PAN $cl_num.$concat_accessory."\n"; | |
| 80 #if ($cl_num > 1000){last;} | |
| 81 } | |
| 82 close(F); | |
| 83 close(M); | |
| 84 close(PAN); | |
| 85 | |
| 86 | |
| 87 system("ulimit -s 163840;Rscript $dirname/../R/heatmap.R -f $heatmap.accessory_01matrix.txt -o $heatmap.complete.pdf"); | |
| 88 #system("ulimit -s 163840;Rscript $dirname/../R/heatmaply.R -f $heatmap.accessory_01matrix.txt -o $heatmap.complete2.svg"); | |
| 89 system("pdf2svg $heatmap.complete.pdf $heatmap.complete.svg"); | |
| 90 | |
| 91 | |
| 92 open(A,">$heatmap.alignment.fa"); | |
| 93 foreach my $sample(keys(%sequences)){ | |
| 94 my $seq = $sequences{$sample}; | |
| 95 print A ">$sample\n"; | |
| 96 print A "$seq\n"; | |
| 97 } | |
| 98 close(A); | |
| 99 | |
| 100 | |
| 101 | |
| 102 my $min_y = 10000000; | |
| 103 my $min_x = 10000000; | |
| 104 my $max_y = 0; | |
| 105 my $max_x = 0; | |
| 106 my $step_x; | |
| 107 my $step_y; | |
| 108 open(SVG,"$heatmap.complete.svg"); | |
| 109 open(SVGNEW,">$heatmap.complete.new.svg"); | |
| 110 while(<SVG>){ | |
| 111 if (/rgb\(100%,0%,0%\)/){ | |
| 112 $_ =~s/rgb\(100%,0%,0%\)/rgb\(100%,100%,100%\)/g; | |
| 113 } | |
| 114 if (/rgb\(100%,63.529968%,0%\)/){ | |
| 115 $_ =~s/rgb\(100%,63.529968%,0%\)/rgb\(41%,72%,64%\)/g; | |
| 116 } | |
| 117 print SVGNEW $_; | |
| 118 } | |
| 119 close(SVG); | |
| 120 close(SVGNEW); | |
| 121 | |
| 122 | |
| 123 | |
| 124 my @clusters; | |
| 125 open(F,"$heatmap.complete.pdf.cols.csv"); | |
| 126 <F>; | |
| 127 while(<F>){ | |
| 128 my $line = $_; | |
| 129 $line =~s/\n//g;$line =~s/\r//g; | |
| 130 push(@clusters,$line); | |
| 131 print $line; | |
| 132 } | |
| 133 close(F); | |
| 134 | |
| 135 my @strains; | |
| 136 open(F,"$heatmap.complete.pdf.rows.csv"); | |
| 137 <F>; | |
| 138 while(<F>){ | |
| 139 my $line = $_; | |
| 140 $line =~s/\n//g;$line =~s/\r//g; | |
| 141 push(@strains,$line); | |
| 142 print $line; | |
| 143 } | |
| 144 close(F); | |
| 145 | |
| 146 | |
| 147 open(O,">$infile.sorted"); | |
| 148 open(HP,">$infile.sorted.for_heatmap_plotly.txt"); | |
| 149 print O "ClutserID"."\t".join("\t",@strains)."\n"; | |
| 150 print HP "ClutserID"."\t".join("\t",@strains)."\n"; | |
| 151 foreach my $cl(reverse(@clusters)){ | |
| 152 print O $cl; | |
| 153 my $clnb = $cl; | |
| 154 if (length($clnb) == 1){$clnb = "000".$clnb;} | |
| 155 elsif (length($clnb) == 2){$clnb = "00".$clnb;} | |
| 156 elsif (length($clnb) == 3){$clnb = "0".$clnb;} | |
| 157 my $name = "CLUSTER".$clnb; | |
| 158 | |
| 159 print HP $name; | |
| 160 foreach my $strain(@strains){ | |
| 161 my $val = $matrix{$strain}{$cl}; | |
| 162 print O "\t$val"; | |
| 163 if ($val =~/\w/){print HP "\t1";} | |
| 164 else{print HP "\t0";} | |
| 165 } | |
| 166 print O "\n"; | |
| 167 print HP "\n"; | |
| 168 } | |
| 169 foreach my $cl(keys(%core_cluster)){ | |
| 170 print O $cl; | |
| 171 my $clnb = $cl; | |
| 172 if (length($clnb) == 1){$clnb = "000".$clnb;} | |
| 173 elsif (length($clnb) == 2){$clnb = "00".$clnb;} | |
| 174 elsif (length($clnb) == 3){$clnb = "0".$clnb;} | |
| 175 my $name = "CLUSTER".$clnb; | |
| 176 print HP $name; | |
| 177 foreach my $strain(@strains){ | |
| 178 my $val = $matrix{$strain}{$cl}; | |
| 179 print O "\t$val"; | |
| 180 if ($val =~/\w/){print HP "\t1";} | |
| 181 else{print HP "\t0";} | |
| 182 | |
| 183 } | |
| 184 print O "\n"; | |
| 185 print HP "\n"; | |
| 186 } | |
| 187 close(O); | |
| 188 close(HP); | |
| 189 | |
| 190 my $svg_section = ""; | |
| 191 | |
| 192 if (scalar @clusters == 0){ | |
| 193 system("touch $heatmap.upsetr.svg"); | |
| 194 if (!-e "$heatmap"){ | |
| 195 system("touch $heatmap"); | |
| 196 } | |
| 197 exit; | |
| 198 } | |
| 199 system("cp -rf $infile $infile.bkp"); | |
| 200 system("cp -rf $infile.sorted $infile"); | |
| 201 | |
| 202 my %combinaisons; | |
| 203 my $x = $min_x; | |
| 204 my $length_matrix = ($max_x-$min_x); | |
| 205 my $height_matrix = ($max_y-$min_y); | |
| 206 my $step_x = $length_matrix / scalar @clusters; | |
| 207 #my $step_y = $height_matrix / scalar @strains; | |
| 208 | |
| 209 print "$height_matrix $length_matrix $step_x $step_y\n"; | |
| 210 my $previous_combinaison = ""; | |
| 211 my $combinaison_id = 0; | |
| 212 my %combinaison_ids; | |
| 213 open(M,">$heatmap.upsetr.txt"); | |
| 214 print M "ClutserID"; | |
| 215 foreach my $strain(@strains){ | |
| 216 my @words = split(/_/,$strain); | |
| 217 my $genus = $words[0]; | |
| 218 my $species = $words[1]; | |
| 219 my $shortname = substr($genus,0,3) . "_". substr($species,0,2); | |
| 220 for (my $j = 2; $j <= $#words; $j++){ | |
| 221 $shortname.="_".$words[$j]; | |
| 222 } | |
| 223 $shortname = substr($shortname,0,25); | |
| 224 print M "\t".$shortname; | |
| 225 } | |
| 226 print M "\n"; | |
| 227 foreach my $cluster(@clusters){ | |
| 228 $x = $x+$step_x; | |
| 229 my $combinaison = ""; | |
| 230 my $numstrain; | |
| 231 print M "$cluster"; | |
| 232 foreach my $strain(@strains){ | |
| 233 $numstrain++; | |
| 234 my $val = $hash{$strain}{$cluster}; | |
| 235 print M "\t".$val; | |
| 236 if ($val == 1){ | |
| 237 $combinaison .= ",$numstrain"; | |
| 238 } | |
| 239 } | |
| 240 if ($combinaison ne $previous_combinaison){ | |
| 241 $combinaison_id++; | |
| 242 } | |
| 243 print M "\n"; | |
| 244 $previous_combinaison = $combinaison; | |
| 245 $combinaison_ids{$combinaison_id}=$combinaison; | |
| 246 #print "Cluster:$cluster Combinaison: $combinaison X:$x $combinaison_id\n"; | |
| 247 $combinaisons{$combinaison_id} .= "|".$x; | |
| 248 | |
| 249 } | |
| 250 close(M); | |
| 251 | |
| 252 | |
| 253 my $nb_strains = scalar @strains; | |
| 254 system("Rscript $dirname/../R/upsetr.R $heatmap.upsetr.txt $heatmap.upsetr.pdf $nb_strains >>$heatmap.upsetr.log 2>&1"); | |
| 255 system("pdf2svg $heatmap.upsetr.pdf $heatmap.upsetr.svg 2"); | |
| 256 | |
| 257 system("python3 $dirname/../Python/Heatmap.py -i $infile.sorted.for_heatmap_plotly.txt -o $heatmap.heatmap_plotly.html >>$heatmap.heatmap_plotly.log 2>&1"); | |
| 258 | |
| 259 if (!-e "$heatmap.upsetr.svg"){ | |
| 260 system("touch $heatmap.upsetr.svg"); | |
| 261 } | |
| 262 system("perl $dirname/../Perl/reformatHeatmapSVG.pl $heatmap.complete.svg $heatmap.complete.new.svg $infile.sorted.for_heatmap_plotly.txt"); | |
| 263 system("cp -rf $heatmap.complete.new.svg $heatmap"); | |
| 264 system("gzip $heatmap"); |
