Mercurial > repos > shellac > sam_consensus_v3
comparison call_consensus_from_sam_3.pl @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
| author | shellac |
|---|---|
| date | Mon, 22 Mar 2021 18:12:50 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4f3585e2f14b |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 # This calls a consensus based on the sam file and does not use qual scores etc it simply counts up each event and calls a simple most observations consensus | |
| 3 # the sam file must be sorted in order to cope with indels. | |
| 4 # usgae is ./call_consensus_from_sam_3.pl aligned.sam genome.fasta 10 | |
| 5 # the number 10 is the minimum mapq you want to consider | |
| 6 | |
| 7 | |
| 8 | |
| 9 open (INFILEFASTA, "$ARGV[1]"); # opens files | |
| 10 | |
| 11 open (INFILESAM, "$ARGV[0]"); | |
| 12 $full_len_cut = 1000; | |
| 13 $min_mapq = $ARGV[2]; | |
| 14 $full_len_count = 0; | |
| 15 open (OUT, ">$ARGV[0].Observed_differences_list.txt"); | |
| 16 open (OUTINDELS, ">$ARGV[0].Indels_applied_list.txt"); | |
| 17 open (OUTMINOR, ">$ARGV[0].minor_variants.txt"); | |
| 18 print OUTMINOR "Genome\tPosition\tA\tC\tG\tT\tDepth\n"; | |
| 19 open (OUTMINORB, ">$ARGV[0].dominant_minor_variant.txt"); | |
| 20 print OUTMINORB "Genome\tPosition\tA\tC\tG\tT\tDominat nucleotide\tFrequency\tTotal Depth\n"; | |
| 21 print OUT "Genome\tPosition\tReference nucleotide\tGATC Depth count\tG count\tA count\tT count\tC count\tConsensus (. means no change from reference)\tDeletions\tTotal insertions recorded\tThis insertion most popular\tThis many recorded most popular insertions\n"; | |
| 22 open (OUTNEWG, ">$ARGV[0].corrected_genome_snps_only.txt"); | |
| 23 open (OUTNEWGINDEL, ">$ARGV[0].corrected_genome_snps_and_indels.txt"); | |
| 24 open (OUTINDELKEY, ">$ARGV[0].key_indels.txt"); | |
| 25 open (OUTINDELKEYSIG, ">$ARGV[0].Significant_key_indels.txt"); | |
| 26 open (OUTFULLSAM, ">$ARGV[0].full_len_sam.sam"); | |
| 27 $start_time = time; | |
| 28 | |
| 29 %for_translation = (TTT=>"F", TTC=>"F", TCT=>"S", TCC=>"S", TAT=>"Y", TAC=>"Y", TGT=>"C", TGC=>"C", TTA=>"L", TCA=>"S", TAA=>"*", TGA=>"*", TTG=>"L", TCG=>"S", TAG=>"*", TGG=>"W", CTT=>"L", CTC=>"L", CCT=>"P", CCC=>"P", CAT=>"H", CAC=>"H", CGT=>"R", CGC=>"R", CTA=>"L", CTG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", ATT=>"I", ATC=>"I", ACT=>"T", ACC=>"T", AAT=>"N", AAC=>"N", AGT=>"S", AGC=>"S", ATA=>"I", ACA=>"T", AAA=>"K", AGA=>"R", ATG=>"M", ACG=>"T", AAG=>"K", AGG=>"R", GTT=>"V", GTC=>"V", GCT=>"A", GCC=>"A", GAT=>"D", GAC=>"D", GGT=>"G", GGC=>"G", GTA=>"V", GTG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G"); | |
| 30 %rev_translation = (GGC=>"A", ACT=>"S", TCA=>"*", ACA=>"C", TCG=>"R", GAT=>"I", GTT=>"N", GCT=>"S", GTA=>"Y", TGT=>"T", CGA=>"S", CGG=>"P", CAG=>"L", TGC=>"A", CAC=>"V", CTT=>"K", AAC=>"V", GTG=>"H", TCT=>"R", GGT=>"T", TGG=>"P", CCA=>"W", GAG=>"L", GCG=>"R", CAA=>"L", TTA=>"*", CTG=>"Q", CGT=>"T", CAT=>"M", TTT=>"K", TAC=>"V", CTA=>"*", AAG=>"L", TCC=>"G", GAC=>"V", GCA=>"C", TGA=>"S", AAT=>"I", ATA=>"Y", ATT=>"N", AGT=>"T", TTG=>"Q", GTC=>"D", ACC=>"G", GGA=>"S", AAA=>"F", CCT=>"R", ACG=>"R", CCG=>"R", ATG=>"H", TAT=>"I", GGG=>"P", CCC=>"G", TAA=>"L", CTC=>"E", TAG=>"L", ATC=>"D", AGA=>"S", GAA=>"F", CGC=>"A", GCC=>"G", AGC=>"A", TTC=>"E", AGG=>"P"); | |
| 31 %base_pair = (G=>"A", A=>"T", T=>"A", C=>"G"); | |
| 32 | |
| 33 print OUTINDELS "Chromosome\tType\tLocation\tIndel_count\tIndel_size\tRead_depth\tIndel size\tIndel proportion\n"; | |
| 34 | |
| 35 | |
| 36 %transcripts = (); | |
| 37 %orf_hash =(); | |
| 38 %peptides_pep_score = (); | |
| 39 %unique_fastas = (); | |
| 40 %peptides_and_utn = (); | |
| 41 %utn_and_peptides = (); | |
| 42 %fasta_head_and_peptides = (); | |
| 43 %indel_tab = (); | |
| 44 | |
| 45 $fasta_out =""; | |
| 46 $fasta_header = "Not_started_yet"; | |
| 47 while ($fasta_line = <INFILEFASTA>) | |
| 48 { | |
| 49 chomp $fasta_line; | |
| 50 print "\n$fasta_line\n"; | |
| 51 if (substr($fasta_line,0,1) eq ">") { | |
| 52 | |
| 53 if ($fasta_header ne "Not_started_yet") { | |
| 54 #print "\n\nFasta header $fasta_header\n\n"; | |
| 55 @temp = split (/\s+/, $fasta_header); | |
| 56 $fasta_head_no_sp = $temp[0]; | |
| 57 #print "\n\nAfter split $fasta_head_no_sp\n\n"; | |
| 58 substr($fasta_head_no_sp,0,1) = ""; | |
| 59 $transcripts {$fasta_head_no_sp} = $fasta_out; | |
| 60 | |
| 61 #print "\n\n$fasta_head_no_sp\n\n"; | |
| 62 } | |
| 63 | |
| 64 | |
| 65 $fasta_header = $fasta_line; | |
| 66 | |
| 67 | |
| 68 $fasta_out =""; | |
| 69 } | |
| 70 if (substr($fasta_line,0,1) ne ">") {$fasta_out = $fasta_out.$fasta_line;} | |
| 71 } | |
| 72 ($fasta_head_no_sp, $restofit) = split (/ /, $fasta_header); | |
| 73 substr($fasta_head_no_sp,0,1,""); | |
| 74 $transcripts {$fasta_head_no_sp} = $fasta_out; | |
| 75 | |
| 76 #print "\n$fasta_header\n\n$fasta_head_no_sp\n"; | |
| 77 | |
| 78 $faster_header = ""; | |
| 79 $chrom_proc = 0; | |
| 80 $chromosome = "notstartedyet"; | |
| 81 $old_chromosome = ""; | |
| 82 #exit; | |
| 83 | |
| 84 | |
| 85 $sam_count = 0; | |
| 86 | |
| 87 $sam_header= ""; | |
| 88 $transcript_count = 0; | |
| 89 | |
| 90 open (INFILESAM, "$ARGV[0]"); | |
| 91 | |
| 92 while ($sam_line = <INFILESAM>) | |
| 93 { | |
| 94 if (substr($sam_line,0,1) eq "@") {next;} | |
| 95 #if (($sam_count % 10000) == 0) {print "$sam_count entries processed\n";} | |
| 96 #print "$sam_count\n"; | |
| 97 $sam_count ++; | |
| 98 chomp $sam_line; | |
| 99 @sam_cells = split (/\t/, $sam_line); | |
| 100 if ($sam_cells[2] eq "*") {next;} | |
| 101 if (($sam_cells[2] ne $chromosome) and ($chromosome ne "notstartedyet")) { | |
| 102 &process_data; | |
| 103 $chromosome = $sam_cells[2]; | |
| 104 $genome = $transcripts{$chromosome}; | |
| 105 $len_gen = length $genome; | |
| 106 delete $transcripts{$chromosome}; | |
| 107 $chrom_proc ++; | |
| 108 print "hi $chromosome is being processed this is chromosome number $chrom_proc\n"; | |
| 109 %g = (); | |
| 110 %a = (); | |
| 111 %t = (); | |
| 112 %c = (); | |
| 113 %ins = (); | |
| 114 %del = (); | |
| 115 %depth = (); | |
| 116 | |
| 117 } | |
| 118 # Data processed | |
| 119 if ($chromosome eq "notstartedyet") { | |
| 120 | |
| 121 $chromosome = $sam_cells[2]; | |
| 122 $genome = $transcripts{$chromosome}; | |
| 123 $len_gen = length $genome; | |
| 124 delete $transcripts{$chromosome}; | |
| 125 $chrom_proc ++; | |
| 126 print "hi $chromosome is being processed this is chromosome number $chrom_proc\n"; | |
| 127 %g = (); | |
| 128 %a = (); | |
| 129 %t = (); | |
| 130 %c = (); | |
| 131 %ins = (); | |
| 132 %del = (); | |
| 133 %depth = (); | |
| 134 | |
| 135 | |
| 136 } | |
| 137 | |
| 138 | |
| 139 | |
| 140 $utn = $sam_cells[0]; | |
| 141 $mapq = $sam_cells[4]; | |
| 142 $sequence = uc $sam_cells[9]; | |
| 143 $seq_len = length $sequence; | |
| 144 $sequence =~ tr/Uu/TT/; | |
| 145 #print "$sam_cells[5]\n"; | |
| 146 if ($min_mapq > $mapq) {next;} | |
| 147 #print "OK"; | |
| 148 $flag = $sam_cells [1]; | |
| 149 $genome_position = $sam_cells[3] - 1; | |
| 150 $fl_pos = $genome_position; | |
| 151 if ($genome_position < $full_len_cut) {$full_len_flag = "S"; | |
| 152 #if (length $sequence > 28000) {print "possible full len\n";} | |
| 153 | |
| 154 } else {$full_len_flag = "I";} | |
| 155 $sam_position = 0; | |
| 156 @cigar = split(/(M|I|D|N|S|H|P|X|=)/, $sam_cells[5]); | |
| 157 $array_cigar = 1; | |
| 158 $temp_len = 0; | |
| 159 while (length($cigar[$array_cigar]) >=1){ | |
| 160 $cigar_value = $cigar[$array_cigar - 1]; | |
| 161 if (($cigar[$array_cigar] eq "M") or ($cigar[$array_cigar] eq "I") or ($cigar[$array_cigar] eq "S")){$temp_len = $temp_len + $cigar_value;} | |
| 162 $array_cigar = $array_cigar + 2; | |
| 163 } | |
| 164 $tran_len = length $sequence; | |
| 165 if ($tran_len ne $temp_len) {print "cigar fail\n";next;} | |
| 166 $array_cigar = 1; | |
| 167 while (length($cigar[$array_cigar]) >=1){ | |
| 168 #print "cigar entry is $cigar[$array_cigar]\n"; | |
| 169 $cigar_value = $cigar[$array_cigar - 1]; | |
| 170 if ($cigar[$array_cigar] =~ /[H]/){ | |
| 171 #nothing to do | |
| 172 } | |
| 173 if (($cigar[$array_cigar] =~ /[S]/) and (($array_cigar + 1) < scalar (@cigar))) | |
| 174 { | |
| 175 $soft_clip = $cigar_value; | |
| 176 $sam_position = $sam_position + $cigar_value; | |
| 177 $seq_len = $seq_len - $cigar_value; | |
| 178 } | |
| 179 | |
| 180 | |
| 181 if ($cigar[$array_cigar] =~ /[M]/){ | |
| 182 $temp_count = 1; | |
| 183 #print "M"; | |
| 184 while ($temp_count <= $cigar_value) | |
| 185 { | |
| 186 $depth{$genome_position} ++; | |
| 187 $alt = substr($sequence, $sam_position, 1); | |
| 188 #print "alternative $alt\n"; | |
| 189 if ($alt eq "G") {$g{$genome_position} ++;} | |
| 190 if ($alt eq "A") {$a{$genome_position} ++;} | |
| 191 if ($alt eq "T") {$t{$genome_position} ++;} | |
| 192 if ($alt eq "C") {$c{$genome_position} ++;} | |
| 193 $temp_count ++; | |
| 194 $genome_position ++; | |
| 195 $sam_position ++; | |
| 196 $fl_pos ++; | |
| 197 } | |
| 198 | |
| 199 } | |
| 200 | |
| 201 #if (($cigar[$array_cigar] =~ /[D]/) and ($cigar_value >4)) {$cigar[$array_cigar] = "N";} | |
| 202 | |
| 203 if ($cigar[$array_cigar] =~ /[D]/){ | |
| 204 $temp_cv = $cigar_value - 1; | |
| 205 $tmp_name = "$chromosome\tDeletion\t$genome_position\t$cigar_value\t "; | |
| 206 if (exists $indel_tab{$tmp_name}){$indel_tab{$tmp_name} ++;} else {$indel_tab{$tmp_name} = 1;} | |
| 207 while ($temp_cv >= 0){ | |
| 208 if (exists $del{$genome_position + $temp_cv}) {$del{$genome_position + $temp_cv} ++;} else {$del{$genome_position + $temp_cv} = 1;} | |
| 209 $temp_cv --; | |
| 210 } | |
| 211 $genome_position = $genome_position + $cigar_value; | |
| 212 $fl_pos = $fl_pos + $cigar_value; | |
| 213 } | |
| 214 | |
| 215 if ($cigar[$array_cigar] =~ /[I]/){ | |
| 216 | |
| 217 $insertion = substr ($sequence, $sam_position, $cigar_value); | |
| 218 $tmp_name = "$chromosome\tInsertion\t$genome_position\t$cigar_value\t$insertion"; | |
| 219 if (exists $indel_tab{$tmp_name}){$indel_tab{$tmp_name} ++;} else {$indel_tab{$tmp_name} = 1;} | |
| 220 if (exists $ins{$genome_position}) {$ins{$genome_position} = $ins{$genome_position}."\t$insertion";} else {$ins{$genome_position} = $insertion;} | |
| 221 | |
| 222 $sam_position = $sam_position + $cigar_value; | |
| 223 | |
| 224 } | |
| 225 | |
| 226 if ($cigar[$array_cigar] =~ /[N]/){ | |
| 227 | |
| 228 $genome_position = $genome_position + $cigar_value; | |
| 229 | |
| 230 } | |
| 231 $array_cigar = $array_cigar +2; | |
| 232 | |
| 233 } | |
| 234 #if ($fl_pos > ($len_gen - 30)) {print "Stops\n";} | |
| 235 #if (($fl_pos > ($len_gen - $full_len_cut)) and ($full_len_flag eq "S")) {print OUTFULLSAM "$sam_line\n"; print "Full length found\n"; $full_len_count ++;} | |
| 236 if ($seq_len > ($len_gen - $full_len_cut)) {print OUTFULLSAM "$sam_line\n"; print "Full length found $seq_len X $len_gen\n"; $full_len_count ++;} | |
| 237 | |
| 238 } | |
| 239 | |
| 240 | |
| 241 | |
| 242 | |
| 243 #all done just need to process the last chromosome... | |
| 244 &process_data; | |
| 245 | |
| 246 foreach $keys (keys %transcripts){ | |
| 247 $genome = $transcripts{$keys}; | |
| 248 print OUTNEWGINDEL ">No_mapped_reads_to_".$keys."_genome_so_no_corrections\n$genome\n"; | |
| 249 print OUTNEWG ">No_mapped_reads_to_".$keys."_genome_so_no_corrections\n$genome\n"; | |
| 250 } | |
| 251 | |
| 252 | |
| 253 | |
| 254 $time_elapsed = time - $start_time; | |
| 255 | |
| 256 print "Processed $sam_count entries for $chrom_proc chromosomes in $time_elapsed seconds\nFull length entries is $full_len_count\n"; | |
| 257 | |
| 258 exit; | |
| 259 | |
| 260 | |
| 261 sub process_data { #now to process all the data on this chromosome before moving on to the next one | |
| 262 $genome_position = 0; | |
| 263 $len_genome = length($genome); | |
| 264 $old_genome = $genome; | |
| 265 open (TEMP, ">temp.txt"); | |
| 266 | |
| 267 | |
| 268 while ($genome_position <= $len_genome){ | |
| 269 #print "at $genome_position\n"; | |
| 270 %ins_hash = (); | |
| 271 %del_hash = (); | |
| 272 $ref_nucleotide = substr ($genome, $genome_position, 1); | |
| 273 $sam_con = $ref_nucleotide; | |
| 274 $temp_pos = $genome_position + 1; | |
| 275 | |
| 276 if ($depth{$genome_position} > 0) { | |
| 277 $A = $a{$genome_position}/$depth{$genome_position}; | |
| 278 $C = $c{$genome_position}/$depth{$genome_position}; | |
| 279 $G = $g{$genome_position}/$depth{$genome_position}; | |
| 280 $T = $t{$genome_position}/$depth{$genome_position}; | |
| 281 $test = -1; | |
| 282 $consensus = "N"; | |
| 283 if ($A > $test){$test = $A; $consensus = "A";} | |
| 284 if ($C > $test){$test = $C; $consensus = "C";} | |
| 285 if ($G > $test){$test = $G; $consensus = "G";} | |
| 286 if ($T > $test){$test = $T; $consensus = "T";} | |
| 287 if ($consensus eq "A") {$A = $A * -1; $test = $A; print OUTMINOR "$chromosome\t$temp_pos\t$A\t$C\t$G\t$T\t$depth{$genome_position}\n";} | |
| 288 if ($consensus eq "C") {$C = $C * -1; $test = $C; print OUTMINOR "$chromosome\t$temp_pos\t$A\t$C\t$G\t$T\t$depth{$genome_position}\n";} | |
| 289 if ($consensus eq "G") {$G = $G * -1; $test = $G; print OUTMINOR "$chromosome\t$temp_pos\t$A\t$C\t$G\t$T\t$depth{$genome_position}\n";} | |
| 290 if ($consensus eq "T") {$T = $T * -1; $test = $T; print OUTMINOR "$chromosome\t$temp_pos\t$A\t$C\t$G\t$T\t$depth{$genome_position}\n";} | |
| 291 if ($consensus eq "N") {print OUTMINOR "$chromosome\t$temp_pos\t0\t0\t0\t0\n";} | |
| 292 $major = $test * -1; | |
| 293 if ($A > $test){$test = $A; $secconsensus = "A";} | |
| 294 if ($C > $test){$test = $C; $secconsensus = "C";} | |
| 295 if ($G > $test){$test = $G; $secconsensus = "G";} | |
| 296 if ($T > $test){$test = $T; $secconsensus = "T";} | |
| 297 | |
| 298 if ($secconsensus eq "A") {print OUTMINORB "$chromosome\t$temp_pos\t$A\t0\t0\t0\t$consensus\t$major\t$depth{$genome_position}\n";} | |
| 299 if ($secconsensus eq "C") {print OUTMINORB "$chromosome\t$temp_pos\t0\t$C\t0\t0\t$consensus\t$major\t$depth{$genome_position}\n";} | |
| 300 if ($secconsensus eq "G") {print OUTMINORB "$chromosome\t$temp_pos\t0\t0\t$G\t0\t$consensus\t$major\t$depth{$genome_position}\n";} | |
| 301 if ($secconsensus eq "T") {print OUTMINORB "$chromosome\t$temp_pos\t0\t0\t0\t$T\t$consensus\t$major\t$depth{$genome_position}\n";} | |
| 302 if ($secconsensus eq "N") {print OUTMINORB "$chromosome\t$temp_pos\t0\t0\t0\t0\t$consensus\t$major\t$depth{$genome_position}\n";} | |
| 303 | |
| 304 } else {print OUTMINOR "$chromosome\t$temp_pos\t0\t0\t0\t0\n"; print OUTMINORB "$chromosome\t$temp_pos\t0\t0\t0\t0\n";} | |
| 305 | |
| 306 if ($depth{$genome_position} < 3) { | |
| 307 if ($len_genome <100000){ | |
| 308 $sam_con = "."; | |
| 309 print OUT "$chromosome\t$temp_pos\t$ref_nucleotide\t$depth{$genome_position}\t$g{$genome_position}\t$a{$genome_position} \t$t{$genome_position} \t$c{$genome_position}\t$sam_con\t$del{$genome_position}\t$ins_count\t$temp_most_ins\t$temp_most\n"; | |
| 310 } | |
| 311 $genome_position ++; | |
| 312 next; | |
| 313 } | |
| 314 | |
| 315 $temp_most = 0; | |
| 316 $temp_most_ins = ""; | |
| 317 $ins_count = 0; | |
| 318 if (exists $ins{$genome_position}){ | |
| 319 $temp_ins = $ins{$genome_position}; | |
| 320 @insertions = split (/\t/, $temp_ins); | |
| 321 $ins_count = scalar @insertions; | |
| 322 foreach $key (@insertions) | |
| 323 { | |
| 324 $ins_hash{$key} ++; | |
| 325 # print "found $key\n"; | |
| 326 } | |
| 327 $temp_most = 0; | |
| 328 $temp_most_ins = ""; | |
| 329 foreach $key (keys %ins_hash){ | |
| 330 # print "$key $ins_hash{$key}\n"; | |
| 331 if ($ins_hash{$key} > $temp_most) {$temp_most = $ins_hash{$key}; $temp_most_ins = $key;} | |
| 332 } | |
| 333 | |
| 334 | |
| 335 | |
| 336 #if ($ins_count > $depth{$genome_position}) {print OUT "POSSIBLE $ins_count INSERTION AT $genome_position\t $ins{$genome_position}\n";} | |
| 337 if ($temp_most > ($depth{$genome_position}) ) { | |
| 338 print OUT "Chromosome $chromosome TOTAL $ins_count INSERTION with $depth{$genome_position} no insertions AT $temp_pos most abundant is $temp_most_ins with $temp_most instertions\t $ins{$genome_position}\n"; | |
| 339 | |
| 340 print TEMP "Chromosome $chromosome TOTAL $ins_count INSERTION with $depth{$genome_position} no insertions AT $temp_pos most abundant is $temp_most_ins with $temp_most instertions\t$temp_most_ins\t$genome_position\t$ins{$genome_position}\n"; | |
| 341 } | |
| 342 if ($temp_most > (($depth{$genome_position})/10)) { | |
| 343 $indel_proportion = $temp_most/$depth{$genome_position}; | |
| 344 #print OUTINDELS "Chromosome\tType\tLocation\tIndel_count\tIndel_size\tRead_depth\tIndel size\tIndel proportion\n"; | |
| 345 print OUTINDELS "$chromosome\tInsertion\t$genome_position\t$ins_count\t$temp_most_ins\t$depth{$genome_position}\t$temp_most_ins\t$indel_proportion\n"; | |
| 346 } | |
| 347 | |
| 348 | |
| 349 } | |
| 350 | |
| 351 if (exists $del{$genome_position}){ | |
| 352 | |
| 353 if ($del{$genome_position} > $depth{$genome_position}) { | |
| 354 print OUT "Chromosome $chromosome POSSIBLE $del{$genome_position} deletion against $depth{$genome_position} no deletions found AT $temp_pos\n"; | |
| 355 #print OUTINDELS "Chromosome $chromosome POSSIBLE $del{$genome_position} deletion against $depth{$genome_position} no deletions found AT $temp_pos\n"; | |
| 356 print TEMP "Chromosome $chromosome POSSIBLE $del{$genome_position} deletion against $depth{$genome_position} no deletions found AT $temp_pos most frequent is $temp_most_dels deletion with $del{$genome_position}\t1\t$genome_position\t$del{$genome_position}\n"; | |
| 357 } | |
| 358 #if ($del_count > 10) {print OUT "POSSIBLE $del_count deletion AT $genome_position\t $del{$genome_position}\n";} | |
| 359 | |
| 360 if ($del{$genome_position} > (($depth{$genome_position})/10)) { | |
| 361 $indel_proportion = $del{$genome_position}/$depth{$genome_position}; | |
| 362 #print OUTINDELS "Chromosome\tType\tLocation\tIndel_count\tIndel_size\tRead_depth\tIndel size\tIndel proportion\n"; | |
| 363 print OUTINDELS "$chromosome\tDeletion\t$genome_position\t$del{$genome_position}\t$temp_most_dels\t$depth{$genome_position}\t$temp_most_dels\t$indel_proportion\n"; | |
| 364 } | |
| 365 | |
| 366 | |
| 367 } | |
| 368 | |
| 369 | |
| 370 | |
| 371 | |
| 372 | |
| 373 | |
| 374 $g_flag = 0; | |
| 375 $a_flag = 0; | |
| 376 $t_flag = 0; | |
| 377 $c_flag = 0; | |
| 378 $amb = "N"; | |
| 379 $top_nuc = 0; | |
| 380 $top_nucleotide = ""; | |
| 381 $sec_nuc = 0; | |
| 382 $second_nucleotide = ""; | |
| 383 $ref_nuc = 0; | |
| 384 if ($ref_nucleotide eq "G") {$ref_nuc = $g{$genome_position};} | |
| 385 if ($ref_nucleotide eq "A") {$ref_nuc = $a{$genome_position};} | |
| 386 if ($ref_nucleotide eq "T") {$ref_nuc = $t{$genome_position};} | |
| 387 if ($ref_nucleotide eq "C") {$ref_nuc = $c{$genome_position};} | |
| 388 if ($g{$genome_position} >= $top_nuc){ | |
| 389 $sam_con = "G"; | |
| 390 #$g_flag = 1; | |
| 391 $second_nucleotide = $top_nucleotide; | |
| 392 $sec_nuc = $top_nuc; | |
| 393 $top_nuc = $g{$genome_position}; | |
| 394 $top_nucleotide = "G"; | |
| 395 #if ($ref_nucleotide eq "G") {$amb ="G";} | |
| 396 } | |
| 397 if ($a{$genome_position} >= $top_nuc){ | |
| 398 $sam_con = "A"; | |
| 399 #$a_flag = 1; | |
| 400 $second_nucleotide = $top_nucleotide; | |
| 401 $sec_nuc = $top_nuc; | |
| 402 $top_nuc = $a{$genome_position}; | |
| 403 $top_nucleotide = "A"; | |
| 404 #if ($ref_nucleotide eq "A") {$amb ="A";} | |
| 405 } | |
| 406 if ($t{$genome_position} >= $top_nuc){ | |
| 407 $sam_con = "T"; | |
| 408 #$t_flag = 1; | |
| 409 $second_nucleotide = $top_nucleotide; | |
| 410 $sec_nuc = $top_nuc; | |
| 411 $top_nuc = $t{$genome_position}; | |
| 412 $top_nucleotide = "T"; | |
| 413 # if ($ref_nucleotide eq "T") {$amb ="T";} | |
| 414 } | |
| 415 if ($c{$genome_position} >= $top_nuc){ | |
| 416 $sam_con = "C"; | |
| 417 #$c_flag = 1; | |
| 418 $second_nucleotide = $top_nucleotide; | |
| 419 $sec_nuc = $top_nuc; | |
| 420 $top_nuc = $c{$genome_position}; | |
| 421 $top_nucleotide = "C"; | |
| 422 # if ($ref_nucleotide eq "C") {$amb ="C";} | |
| 423 } | |
| 424 | |
| 425 #print "This is G's recoded at this location $g{$genome_position}\n"; | |
| 426 if (($g{$genome_position} >= $a{$genome_position}) and ($g{$genome_position} >= $t{$genome_position}) and ($g{$genome_position} >= $c{$genome_position}) and ($g{$genome_position} >= 1)) { | |
| 427 $sam_con = "G"; $g_flag = 1; | |
| 428 if ($ref_nucleotide eq "G") {$amb ="G";} | |
| 429 } | |
| 430 if (($a{$genome_position} >= $g{$genome_position}) and ($a{$genome_position} >= $t{$genome_position}) and ($a{$genome_position} >= $c{$genome_position}) and ($a{$genome_position} >= 1)) { | |
| 431 $sam_con = "A"; $a_flag = 1; | |
| 432 if ($ref_nucleotide eq "A") {$amb ="A";} | |
| 433 } | |
| 434 if (($t{$genome_position} >= $g{$genome_position}) and ($t{$genome_position} >= $a{$genome_position}) and ($t{$genome_position} >= $c{$genome_position}) and ($t{$genome_position} >= 1)) { | |
| 435 $sam_con = "T"; $t_flag = 1; | |
| 436 if ($ref_nucleotide eq "T") {$amb ="T";} | |
| 437 } | |
| 438 if (($c{$genome_position} >= $g{$genome_position}) and ($c{$genome_position} >= $a{$genome_position}) and ($c{$genome_position} >= $t{$genome_position}) and ($c{$genome_position} >= 1)) { | |
| 439 $sam_con = "C"; $c_flag = 1; | |
| 440 if ($ref_nucleotide eq "C") {$amb ="C";} | |
| 441 } | |
| 442 | |
| 443 if (($g_flag + $a_flag + $t_flag + $c_flag) > 1) { | |
| 444 print OUT "ambiguity chromosome $chromosome at $temp_pos $g_flag G $a_flag A $t_flag T $c_flag C counts are G $g{$genome_position} A $a{$genome_position} T $t{$genome_position} C $c{$genome_position} \n"; | |
| 445 if ($amb ne "N") {$sam_con = $amb;} | |
| 446 } | |
| 447 | |
| 448 if ($sam_con ne $ref_nucleotide) { | |
| 449 #print "change\n"; | |
| 450 print OUT "$chromosome\t$temp_pos\t$ref_nucleotide\t$depth{$genome_position}\t$g{$genome_position}\t$a{$genome_position} \t$t{$genome_position} \t$c{$genome_position}\t$sam_con\t$del{$genome_position}\t$ins_count\t$temp_most_ins\t$temp_most\n"; | |
| 451 } else { | |
| 452 if (($len_genome <100000) or ($temp_most > $depth{$genome_position}) or ($del{$genome_position} > $depth{$genome_position})){ | |
| 453 print OUT "$chromosome\t$temp_pos\t$ref_nucleotide\t$depth{$genome_position}\t$g{$genome_position}\t$a{$genome_position} \t$t{$genome_position} \t$c{$genome_position}\t.\t$del{$genome_position}\t$ins_count\t$temp_most_ins\t$temp_most\n"; | |
| 454 } | |
| 455 } | |
| 456 | |
| 457 | |
| 458 substr ($genome, $genome_position, 1, $sam_con); | |
| 459 | |
| 460 $genome_position ++; | |
| 461 #print "at second $genome_position\n"; | |
| 462 | |
| 463 } | |
| 464 | |
| 465 | |
| 466 close TEMP; | |
| 467 $new_genome_w_indels = $genome; | |
| 468 open (TEMP, "temp.txt"); | |
| 469 @indels = reverse<TEMP>; | |
| 470 foreach $line (@indels) | |
| 471 { | |
| 472 chomp $line; | |
| 473 @indels_cells = split(/\t/, $line); | |
| 474 $change = $indels_cells[1]; | |
| 475 $type = $indels_cells[0]; | |
| 476 $genome_position = $indels_cells[2]; | |
| 477 if (index ($type, " INSERTION ") > 0) { | |
| 478 substr ($new_genome_w_indels, $genome_position, 0, $change); | |
| 479 } | |
| 480 if (index ($type, " deletion ") > 0) { | |
| 481 substr ($new_genome_w_indels, $genome_position, 1, ""); | |
| 482 } | |
| 483 | |
| 484 } | |
| 485 | |
| 486 | |
| 487 | |
| 488 print OUTNEWGINDEL ">New_".$chromosome."_genome_with_indels_is\n$new_genome_w_indels\n"; | |
| 489 print OUTNEWG ">New_".$chromosome."_genome_is\n$genome\n"; | |
| 490 open (CTSO, ">$ARGV[0].CTSO.txt"); | |
| 491 $warning = ""; | |
| 492 #open (OUTINDELKEYSIG, ">$ARGV[0].Significant_key_indels.txt"); | |
| 493 # open (OUTINDELKEY, ">$ARGV[0].key_indels.txt"); | |
| 494 # $tmp_name = "$chromosome\tDeletion\t$genome_position\t$temp_cv\t$insertion"; temp_cv is the size of the indel | |
| 495 print OUTINDELKEY "Chromosome\tInsertion or deletion\tLocation\tIndel size\tInsertion seq\tObservation count\tDepth at this lcation\tProportion of observation vs depth\n"; | |
| 496 print OUTINDELKEYSIG "Chromosome\tInsertion or deletion\tLocation\tIndel size\tInsertion seq\tObservation count\tDepth at this lcation\tProportion of observation vs depth\n"; | |
| 497 foreach $key (keys %indel_tab){ | |
| 498 $value = $indel_tab{$key}; | |
| 499 @array = split(/\t/, $key); | |
| 500 $genome_position = $array[2]; $indel_len = $array[3]; | |
| 501 if ($depth{$genome_position} > 0) {$a = ($value / $depth{$genome_position}); $tmp_proportion = sprintf ("%.2f",$a);} else {$tmp_proportion = "zero depth here";} | |
| 502 | |
| 503 print OUTINDELKEY "$key\t$value\t$depth{$genome_position}\t$tmp_proportion\n"; | |
| 504 if ($tmp_proportion > 0.1) { | |
| 505 | |
| 506 if ($indel_len > 6) {print "\n\nCtSo\nInsertion or deletion\t$array[1]\nLocation\t$genome_position\nIndel size\t$indel_len\nObservation count\t$value\nDepth at this location\t$depth{$genome_position}\nProportion of observation vs depth\t$tmp_proportion\n\n"; | |
| 507 print CTSO "\n\nCtSo\nInsertion or deletion\t$array[1]\nLocation\t$genome_position\nIndel size\t$indel_len\nObservation count\t$value\nDepth at this location\t$depth{$genome_position}\nProportion of observation vs depth\t$tmp_proportion\n\n"; | |
| 508 print OUTINDELKEYSIG "$key\t$value\t$depth{$genome_position}\t$tmp_proportion\n"; | |
| 509 if (($value > 9) or ($depth{$genome_position} > 9)) {$warning = $warning."$key\t$value\t$depth{$genome_position}\t$tmp_proportion\n";} | |
| 510 } | |
| 511 #print OUTINDELKEYSIG "$key\t$value\t$depth{$genome_position}\t$tmp_proportion\n"; | |
| 512 } | |
| 513 | |
| 514 } | |
| 515 if ($warning ne "") {print OUTINDELKEYSIG "\n\nEspecially take a moment to look at these in the above list...CtSo!\n\n$warning\n";} | |
| 516 } | |
| 517 |
