Mercurial > repos > shellac > sam_consensus_v3
diff call_consensus_from_sam_3.pl @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/call_consensus_from_sam_3.pl Mon Mar 22 18:12:50 2021 +0000 @@ -0,0 +1,517 @@ +#!/usr/bin/perl +# This calls a consensus based on the sam file and does not use qual scores etc it simply counts up each event and calls a simple most observations consensus +# the sam file must be sorted in order to cope with indels. +# usgae is ./call_consensus_from_sam_3.pl aligned.sam genome.fasta 10 +# the number 10 is the minimum mapq you want to consider + + + +open (INFILEFASTA, "$ARGV[1]"); # opens files + +open (INFILESAM, "$ARGV[0]"); +$full_len_cut = 1000; +$min_mapq = $ARGV[2]; +$full_len_count = 0; +open (OUT, ">$ARGV[0].Observed_differences_list.txt"); +open (OUTINDELS, ">$ARGV[0].Indels_applied_list.txt"); +open (OUTMINOR, ">$ARGV[0].minor_variants.txt"); +print OUTMINOR "Genome\tPosition\tA\tC\tG\tT\tDepth\n"; +open (OUTMINORB, ">$ARGV[0].dominant_minor_variant.txt"); +print OUTMINORB "Genome\tPosition\tA\tC\tG\tT\tDominat nucleotide\tFrequency\tTotal Depth\n"; +print OUT "Genome\tPosition\tReference nucleotide\tGATC Depth count\tG count\tA count\tT count\tC count\tConsensus (. means no change from reference)\tDeletions\tTotal insertions recorded\tThis insertion most popular\tThis many recorded most popular insertions\n"; +open (OUTNEWG, ">$ARGV[0].corrected_genome_snps_only.txt"); +open (OUTNEWGINDEL, ">$ARGV[0].corrected_genome_snps_and_indels.txt"); +open (OUTINDELKEY, ">$ARGV[0].key_indels.txt"); +open (OUTINDELKEYSIG, ">$ARGV[0].Significant_key_indels.txt"); +open (OUTFULLSAM, ">$ARGV[0].full_len_sam.sam"); +$start_time = time; + +%for_translation = (TTT=>"F", TTC=>"F", TCT=>"S", TCC=>"S", TAT=>"Y", TAC=>"Y", TGT=>"C", TGC=>"C", TTA=>"L", TCA=>"S", TAA=>"*", TGA=>"*", TTG=>"L", TCG=>"S", TAG=>"*", TGG=>"W", CTT=>"L", CTC=>"L", CCT=>"P", CCC=>"P", CAT=>"H", CAC=>"H", CGT=>"R", CGC=>"R", CTA=>"L", CTG=>"L", CCA=>"P", CCG=>"P", CAA=>"Q", CAG=>"Q", CGA=>"R", CGG=>"R", ATT=>"I", ATC=>"I", ACT=>"T", ACC=>"T", AAT=>"N", AAC=>"N", AGT=>"S", AGC=>"S", ATA=>"I", ACA=>"T", AAA=>"K", AGA=>"R", ATG=>"M", ACG=>"T", AAG=>"K", AGG=>"R", GTT=>"V", GTC=>"V", GCT=>"A", GCC=>"A", GAT=>"D", GAC=>"D", GGT=>"G", GGC=>"G", GTA=>"V", GTG=>"V", GCA=>"A", GCG=>"A", GAA=>"E", GAG=>"E", GGA=>"G", GGG=>"G"); +%rev_translation = (GGC=>"A", ACT=>"S", TCA=>"*", ACA=>"C", TCG=>"R", GAT=>"I", GTT=>"N", GCT=>"S", GTA=>"Y", TGT=>"T", CGA=>"S", CGG=>"P", CAG=>"L", TGC=>"A", CAC=>"V", CTT=>"K", AAC=>"V", GTG=>"H", TCT=>"R", GGT=>"T", TGG=>"P", CCA=>"W", GAG=>"L", GCG=>"R", CAA=>"L", TTA=>"*", CTG=>"Q", CGT=>"T", CAT=>"M", TTT=>"K", TAC=>"V", CTA=>"*", AAG=>"L", TCC=>"G", GAC=>"V", GCA=>"C", TGA=>"S", AAT=>"I", ATA=>"Y", ATT=>"N", AGT=>"T", TTG=>"Q", GTC=>"D", ACC=>"G", GGA=>"S", AAA=>"F", CCT=>"R", ACG=>"R", CCG=>"R", ATG=>"H", TAT=>"I", GGG=>"P", CCC=>"G", TAA=>"L", CTC=>"E", TAG=>"L", ATC=>"D", AGA=>"S", GAA=>"F", CGC=>"A", GCC=>"G", AGC=>"A", TTC=>"E", AGG=>"P"); +%base_pair = (G=>"A", A=>"T", T=>"A", C=>"G"); + +print OUTINDELS "Chromosome\tType\tLocation\tIndel_count\tIndel_size\tRead_depth\tIndel size\tIndel proportion\n"; + + +%transcripts = (); +%orf_hash =(); +%peptides_pep_score = (); +%unique_fastas = (); +%peptides_and_utn = (); +%utn_and_peptides = (); +%fasta_head_and_peptides = (); +%indel_tab = (); + +$fasta_out =""; +$fasta_header = "Not_started_yet"; +while ($fasta_line = <INFILEFASTA>) +{ + chomp $fasta_line; + print "\n$fasta_line\n"; + if (substr($fasta_line,0,1) eq ">") { + + if ($fasta_header ne "Not_started_yet") { + #print "\n\nFasta header $fasta_header\n\n"; + @temp = split (/\s+/, $fasta_header); + $fasta_head_no_sp = $temp[0]; + #print "\n\nAfter split $fasta_head_no_sp\n\n"; + substr($fasta_head_no_sp,0,1) = ""; + $transcripts {$fasta_head_no_sp} = $fasta_out; + + #print "\n\n$fasta_head_no_sp\n\n"; + } + + + $fasta_header = $fasta_line; + + + $fasta_out =""; + } + if (substr($fasta_line,0,1) ne ">") {$fasta_out = $fasta_out.$fasta_line;} +} +($fasta_head_no_sp, $restofit) = split (/ /, $fasta_header); +substr($fasta_head_no_sp,0,1,""); +$transcripts {$fasta_head_no_sp} = $fasta_out; + +#print "\n$fasta_header\n\n$fasta_head_no_sp\n"; + +$faster_header = ""; +$chrom_proc = 0; +$chromosome = "notstartedyet"; +$old_chromosome = ""; +#exit; + + +$sam_count = 0; + +$sam_header= ""; +$transcript_count = 0; + +open (INFILESAM, "$ARGV[0]"); + +while ($sam_line = <INFILESAM>) +{ + if (substr($sam_line,0,1) eq "@") {next;} + #if (($sam_count % 10000) == 0) {print "$sam_count entries processed\n";} + #print "$sam_count\n"; + $sam_count ++; + chomp $sam_line; + @sam_cells = split (/\t/, $sam_line); + if ($sam_cells[2] eq "*") {next;} + if (($sam_cells[2] ne $chromosome) and ($chromosome ne "notstartedyet")) { + &process_data; + $chromosome = $sam_cells[2]; + $genome = $transcripts{$chromosome}; + $len_gen = length $genome; + delete $transcripts{$chromosome}; + $chrom_proc ++; + print "hi $chromosome is being processed this is chromosome number $chrom_proc\n"; + %g = (); + %a = (); + %t = (); + %c = (); + %ins = (); + %del = (); + %depth = (); + + } + # Data processed + if ($chromosome eq "notstartedyet") { + + $chromosome = $sam_cells[2]; + $genome = $transcripts{$chromosome}; + $len_gen = length $genome; + delete $transcripts{$chromosome}; + $chrom_proc ++; + print "hi $chromosome is being processed this is chromosome number $chrom_proc\n"; + %g = (); + %a = (); + %t = (); + %c = (); + %ins = (); + %del = (); + %depth = (); + + + } + + + + $utn = $sam_cells[0]; + $mapq = $sam_cells[4]; + $sequence = uc $sam_cells[9]; + $seq_len = length $sequence; + $sequence =~ tr/Uu/TT/; + #print "$sam_cells[5]\n"; + if ($min_mapq > $mapq) {next;} + #print "OK"; + $flag = $sam_cells [1]; + $genome_position = $sam_cells[3] - 1; + $fl_pos = $genome_position; + if ($genome_position < $full_len_cut) {$full_len_flag = "S"; + #if (length $sequence > 28000) {print "possible full len\n";} + + } else {$full_len_flag = "I";} + $sam_position = 0; + @cigar = split(/(M|I|D|N|S|H|P|X|=)/, $sam_cells[5]); + $array_cigar = 1; + $temp_len = 0; + while (length($cigar[$array_cigar]) >=1){ + $cigar_value = $cigar[$array_cigar - 1]; + if (($cigar[$array_cigar] eq "M") or ($cigar[$array_cigar] eq "I") or ($cigar[$array_cigar] eq "S")){$temp_len = $temp_len + $cigar_value;} + $array_cigar = $array_cigar + 2; + } + $tran_len = length $sequence; + if ($tran_len ne $temp_len) {print "cigar fail\n";next;} + $array_cigar = 1; + while (length($cigar[$array_cigar]) >=1){ + #print "cigar entry is $cigar[$array_cigar]\n"; + $cigar_value = $cigar[$array_cigar - 1]; + if ($cigar[$array_cigar] =~ /[H]/){ + #nothing to do + } + if (($cigar[$array_cigar] =~ /[S]/) and (($array_cigar + 1) < scalar (@cigar))) + { + $soft_clip = $cigar_value; + $sam_position = $sam_position + $cigar_value; + $seq_len = $seq_len - $cigar_value; + } + + + if ($cigar[$array_cigar] =~ /[M]/){ + $temp_count = 1; + #print "M"; + while ($temp_count <= $cigar_value) + { + $depth{$genome_position} ++; + $alt = substr($sequence, $sam_position, 1); + #print "alternative $alt\n"; + if ($alt eq "G") {$g{$genome_position} ++;} + if ($alt eq "A") {$a{$genome_position} ++;} + if ($alt eq "T") {$t{$genome_position} ++;} + if ($alt eq "C") {$c{$genome_position} ++;} + $temp_count ++; + $genome_position ++; + $sam_position ++; + $fl_pos ++; + } + + } + + #if (($cigar[$array_cigar] =~ /[D]/) and ($cigar_value >4)) {$cigar[$array_cigar] = "N";} + + if ($cigar[$array_cigar] =~ /[D]/){ + $temp_cv = $cigar_value - 1; + $tmp_name = "$chromosome\tDeletion\t$genome_position\t$cigar_value\t "; + if (exists $indel_tab{$tmp_name}){$indel_tab{$tmp_name} ++;} else {$indel_tab{$tmp_name} = 1;} + while ($temp_cv >= 0){ + if (exists $del{$genome_position + $temp_cv}) {$del{$genome_position + $temp_cv} ++;} else {$del{$genome_position + $temp_cv} = 1;} + $temp_cv --; + } + $genome_position = $genome_position + $cigar_value; + $fl_pos = $fl_pos + $cigar_value; + } + + if ($cigar[$array_cigar] =~ /[I]/){ + + $insertion = substr ($sequence, $sam_position, $cigar_value); + $tmp_name = "$chromosome\tInsertion\t$genome_position\t$cigar_value\t$insertion"; + if (exists $indel_tab{$tmp_name}){$indel_tab{$tmp_name} ++;} else {$indel_tab{$tmp_name} = 1;} + if (exists $ins{$genome_position}) {$ins{$genome_position} = $ins{$genome_position}."\t$insertion";} else {$ins{$genome_position} = $insertion;} + + $sam_position = $sam_position + $cigar_value; + + } + + if ($cigar[$array_cigar] =~ /[N]/){ + + $genome_position = $genome_position + $cigar_value; + + } + $array_cigar = $array_cigar +2; + + } + #if ($fl_pos > ($len_gen - 30)) {print "Stops\n";} + #if (($fl_pos > ($len_gen - $full_len_cut)) and ($full_len_flag eq "S")) {print OUTFULLSAM "$sam_line\n"; print "Full length found\n"; $full_len_count ++;} + if ($seq_len > ($len_gen - $full_len_cut)) {print OUTFULLSAM "$sam_line\n"; print "Full length found $seq_len X $len_gen\n"; $full_len_count ++;} + +} + + + + +#all done just need to process the last chromosome... +&process_data; + +foreach $keys (keys %transcripts){ + $genome = $transcripts{$keys}; + print OUTNEWGINDEL ">No_mapped_reads_to_".$keys."_genome_so_no_corrections\n$genome\n"; + print OUTNEWG ">No_mapped_reads_to_".$keys."_genome_so_no_corrections\n$genome\n"; +} + + + +$time_elapsed = time - $start_time; + +print "Processed $sam_count entries for $chrom_proc chromosomes in $time_elapsed seconds\nFull length entries is $full_len_count\n"; + +exit; + + +sub process_data { #now to process all the data on this chromosome before moving on to the next one + $genome_position = 0; + $len_genome = length($genome); + $old_genome = $genome; + open (TEMP, ">temp.txt"); + + + while ($genome_position <= $len_genome){ + #print "at $genome_position\n"; + %ins_hash = (); + %del_hash = (); + $ref_nucleotide = substr ($genome, $genome_position, 1); + $sam_con = $ref_nucleotide; + $temp_pos = $genome_position + 1; + + if ($depth{$genome_position} > 0) { + $A = $a{$genome_position}/$depth{$genome_position}; + $C = $c{$genome_position}/$depth{$genome_position}; + $G = $g{$genome_position}/$depth{$genome_position}; + $T = $t{$genome_position}/$depth{$genome_position}; + $test = -1; + $consensus = "N"; + if ($A > $test){$test = $A; $consensus = "A";} + if ($C > $test){$test = $C; $consensus = "C";} + if ($G > $test){$test = $G; $consensus = "G";} + if ($T > $test){$test = $T; $consensus = "T";} + if ($consensus eq "A") {$A = $A * -1; $test = $A; print OUTMINOR "$chromosome\t$temp_pos\t$A\t$C\t$G\t$T\t$depth{$genome_position}\n";} + if ($consensus eq "C") {$C = $C * -1; $test = $C; print OUTMINOR "$chromosome\t$temp_pos\t$A\t$C\t$G\t$T\t$depth{$genome_position}\n";} + if ($consensus eq "G") {$G = $G * -1; $test = $G; print OUTMINOR "$chromosome\t$temp_pos\t$A\t$C\t$G\t$T\t$depth{$genome_position}\n";} + if ($consensus eq "T") {$T = $T * -1; $test = $T; print OUTMINOR "$chromosome\t$temp_pos\t$A\t$C\t$G\t$T\t$depth{$genome_position}\n";} + if ($consensus eq "N") {print OUTMINOR "$chromosome\t$temp_pos\t0\t0\t0\t0\n";} + $major = $test * -1; + if ($A > $test){$test = $A; $secconsensus = "A";} + if ($C > $test){$test = $C; $secconsensus = "C";} + if ($G > $test){$test = $G; $secconsensus = "G";} + if ($T > $test){$test = $T; $secconsensus = "T";} + + if ($secconsensus eq "A") {print OUTMINORB "$chromosome\t$temp_pos\t$A\t0\t0\t0\t$consensus\t$major\t$depth{$genome_position}\n";} + if ($secconsensus eq "C") {print OUTMINORB "$chromosome\t$temp_pos\t0\t$C\t0\t0\t$consensus\t$major\t$depth{$genome_position}\n";} + if ($secconsensus eq "G") {print OUTMINORB "$chromosome\t$temp_pos\t0\t0\t$G\t0\t$consensus\t$major\t$depth{$genome_position}\n";} + if ($secconsensus eq "T") {print OUTMINORB "$chromosome\t$temp_pos\t0\t0\t0\t$T\t$consensus\t$major\t$depth{$genome_position}\n";} + if ($secconsensus eq "N") {print OUTMINORB "$chromosome\t$temp_pos\t0\t0\t0\t0\t$consensus\t$major\t$depth{$genome_position}\n";} + + } else {print OUTMINOR "$chromosome\t$temp_pos\t0\t0\t0\t0\n"; print OUTMINORB "$chromosome\t$temp_pos\t0\t0\t0\t0\n";} + + if ($depth{$genome_position} < 3) { + if ($len_genome <100000){ + $sam_con = "."; + print OUT "$chromosome\t$temp_pos\t$ref_nucleotide\t$depth{$genome_position}\t$g{$genome_position}\t$a{$genome_position} \t$t{$genome_position} \t$c{$genome_position}\t$sam_con\t$del{$genome_position}\t$ins_count\t$temp_most_ins\t$temp_most\n"; + } + $genome_position ++; + next; + } + + $temp_most = 0; + $temp_most_ins = ""; + $ins_count = 0; + if (exists $ins{$genome_position}){ + $temp_ins = $ins{$genome_position}; + @insertions = split (/\t/, $temp_ins); + $ins_count = scalar @insertions; + foreach $key (@insertions) + { + $ins_hash{$key} ++; + # print "found $key\n"; + } + $temp_most = 0; + $temp_most_ins = ""; + foreach $key (keys %ins_hash){ + # print "$key $ins_hash{$key}\n"; + if ($ins_hash{$key} > $temp_most) {$temp_most = $ins_hash{$key}; $temp_most_ins = $key;} + } + + + + #if ($ins_count > $depth{$genome_position}) {print OUT "POSSIBLE $ins_count INSERTION AT $genome_position\t $ins{$genome_position}\n";} + if ($temp_most > ($depth{$genome_position}) ) { + print OUT "Chromosome $chromosome TOTAL $ins_count INSERTION with $depth{$genome_position} no insertions AT $temp_pos most abundant is $temp_most_ins with $temp_most instertions\t $ins{$genome_position}\n"; + + print TEMP "Chromosome $chromosome TOTAL $ins_count INSERTION with $depth{$genome_position} no insertions AT $temp_pos most abundant is $temp_most_ins with $temp_most instertions\t$temp_most_ins\t$genome_position\t$ins{$genome_position}\n"; + } + if ($temp_most > (($depth{$genome_position})/10)) { + $indel_proportion = $temp_most/$depth{$genome_position}; + #print OUTINDELS "Chromosome\tType\tLocation\tIndel_count\tIndel_size\tRead_depth\tIndel size\tIndel proportion\n"; + print OUTINDELS "$chromosome\tInsertion\t$genome_position\t$ins_count\t$temp_most_ins\t$depth{$genome_position}\t$temp_most_ins\t$indel_proportion\n"; + } + + + } + + if (exists $del{$genome_position}){ + + if ($del{$genome_position} > $depth{$genome_position}) { + print OUT "Chromosome $chromosome POSSIBLE $del{$genome_position} deletion against $depth{$genome_position} no deletions found AT $temp_pos\n"; + #print OUTINDELS "Chromosome $chromosome POSSIBLE $del{$genome_position} deletion against $depth{$genome_position} no deletions found AT $temp_pos\n"; + print TEMP "Chromosome $chromosome POSSIBLE $del{$genome_position} deletion against $depth{$genome_position} no deletions found AT $temp_pos most frequent is $temp_most_dels deletion with $del{$genome_position}\t1\t$genome_position\t$del{$genome_position}\n"; + } + #if ($del_count > 10) {print OUT "POSSIBLE $del_count deletion AT $genome_position\t $del{$genome_position}\n";} + + if ($del{$genome_position} > (($depth{$genome_position})/10)) { + $indel_proportion = $del{$genome_position}/$depth{$genome_position}; + #print OUTINDELS "Chromosome\tType\tLocation\tIndel_count\tIndel_size\tRead_depth\tIndel size\tIndel proportion\n"; + print OUTINDELS "$chromosome\tDeletion\t$genome_position\t$del{$genome_position}\t$temp_most_dels\t$depth{$genome_position}\t$temp_most_dels\t$indel_proportion\n"; + } + + + } + + + + + + + $g_flag = 0; + $a_flag = 0; + $t_flag = 0; + $c_flag = 0; + $amb = "N"; + $top_nuc = 0; + $top_nucleotide = ""; + $sec_nuc = 0; + $second_nucleotide = ""; + $ref_nuc = 0; + if ($ref_nucleotide eq "G") {$ref_nuc = $g{$genome_position};} + if ($ref_nucleotide eq "A") {$ref_nuc = $a{$genome_position};} + if ($ref_nucleotide eq "T") {$ref_nuc = $t{$genome_position};} + if ($ref_nucleotide eq "C") {$ref_nuc = $c{$genome_position};} + if ($g{$genome_position} >= $top_nuc){ + $sam_con = "G"; + #$g_flag = 1; + $second_nucleotide = $top_nucleotide; + $sec_nuc = $top_nuc; + $top_nuc = $g{$genome_position}; + $top_nucleotide = "G"; + #if ($ref_nucleotide eq "G") {$amb ="G";} + } + if ($a{$genome_position} >= $top_nuc){ + $sam_con = "A"; + #$a_flag = 1; + $second_nucleotide = $top_nucleotide; + $sec_nuc = $top_nuc; + $top_nuc = $a{$genome_position}; + $top_nucleotide = "A"; + #if ($ref_nucleotide eq "A") {$amb ="A";} + } + if ($t{$genome_position} >= $top_nuc){ + $sam_con = "T"; + #$t_flag = 1; + $second_nucleotide = $top_nucleotide; + $sec_nuc = $top_nuc; + $top_nuc = $t{$genome_position}; + $top_nucleotide = "T"; + # if ($ref_nucleotide eq "T") {$amb ="T";} + } + if ($c{$genome_position} >= $top_nuc){ + $sam_con = "C"; + #$c_flag = 1; + $second_nucleotide = $top_nucleotide; + $sec_nuc = $top_nuc; + $top_nuc = $c{$genome_position}; + $top_nucleotide = "C"; + # if ($ref_nucleotide eq "C") {$amb ="C";} + } + + #print "This is G's recoded at this location $g{$genome_position}\n"; + if (($g{$genome_position} >= $a{$genome_position}) and ($g{$genome_position} >= $t{$genome_position}) and ($g{$genome_position} >= $c{$genome_position}) and ($g{$genome_position} >= 1)) { + $sam_con = "G"; $g_flag = 1; + if ($ref_nucleotide eq "G") {$amb ="G";} + } + if (($a{$genome_position} >= $g{$genome_position}) and ($a{$genome_position} >= $t{$genome_position}) and ($a{$genome_position} >= $c{$genome_position}) and ($a{$genome_position} >= 1)) { + $sam_con = "A"; $a_flag = 1; + if ($ref_nucleotide eq "A") {$amb ="A";} + } + if (($t{$genome_position} >= $g{$genome_position}) and ($t{$genome_position} >= $a{$genome_position}) and ($t{$genome_position} >= $c{$genome_position}) and ($t{$genome_position} >= 1)) { + $sam_con = "T"; $t_flag = 1; + if ($ref_nucleotide eq "T") {$amb ="T";} + } + if (($c{$genome_position} >= $g{$genome_position}) and ($c{$genome_position} >= $a{$genome_position}) and ($c{$genome_position} >= $t{$genome_position}) and ($c{$genome_position} >= 1)) { + $sam_con = "C"; $c_flag = 1; + if ($ref_nucleotide eq "C") {$amb ="C";} + } + + if (($g_flag + $a_flag + $t_flag + $c_flag) > 1) { + print OUT "ambiguity chromosome $chromosome at $temp_pos $g_flag G $a_flag A $t_flag T $c_flag C counts are G $g{$genome_position} A $a{$genome_position} T $t{$genome_position} C $c{$genome_position} \n"; + if ($amb ne "N") {$sam_con = $amb;} + } + + if ($sam_con ne $ref_nucleotide) { + #print "change\n"; + print OUT "$chromosome\t$temp_pos\t$ref_nucleotide\t$depth{$genome_position}\t$g{$genome_position}\t$a{$genome_position} \t$t{$genome_position} \t$c{$genome_position}\t$sam_con\t$del{$genome_position}\t$ins_count\t$temp_most_ins\t$temp_most\n"; + } else { + if (($len_genome <100000) or ($temp_most > $depth{$genome_position}) or ($del{$genome_position} > $depth{$genome_position})){ + print OUT "$chromosome\t$temp_pos\t$ref_nucleotide\t$depth{$genome_position}\t$g{$genome_position}\t$a{$genome_position} \t$t{$genome_position} \t$c{$genome_position}\t.\t$del{$genome_position}\t$ins_count\t$temp_most_ins\t$temp_most\n"; + } + } + + + substr ($genome, $genome_position, 1, $sam_con); + + $genome_position ++; + #print "at second $genome_position\n"; + + } + + + close TEMP; + $new_genome_w_indels = $genome; + open (TEMP, "temp.txt"); + @indels = reverse<TEMP>; + foreach $line (@indels) + { + chomp $line; + @indels_cells = split(/\t/, $line); + $change = $indels_cells[1]; + $type = $indels_cells[0]; + $genome_position = $indels_cells[2]; + if (index ($type, " INSERTION ") > 0) { + substr ($new_genome_w_indels, $genome_position, 0, $change); + } + if (index ($type, " deletion ") > 0) { + substr ($new_genome_w_indels, $genome_position, 1, ""); + } + + } + + + + print OUTNEWGINDEL ">New_".$chromosome."_genome_with_indels_is\n$new_genome_w_indels\n"; + print OUTNEWG ">New_".$chromosome."_genome_is\n$genome\n"; + open (CTSO, ">$ARGV[0].CTSO.txt"); + $warning = ""; + #open (OUTINDELKEYSIG, ">$ARGV[0].Significant_key_indels.txt"); + # open (OUTINDELKEY, ">$ARGV[0].key_indels.txt"); + # $tmp_name = "$chromosome\tDeletion\t$genome_position\t$temp_cv\t$insertion"; temp_cv is the size of the indel + print OUTINDELKEY "Chromosome\tInsertion or deletion\tLocation\tIndel size\tInsertion seq\tObservation count\tDepth at this lcation\tProportion of observation vs depth\n"; + print OUTINDELKEYSIG "Chromosome\tInsertion or deletion\tLocation\tIndel size\tInsertion seq\tObservation count\tDepth at this lcation\tProportion of observation vs depth\n"; + foreach $key (keys %indel_tab){ + $value = $indel_tab{$key}; + @array = split(/\t/, $key); + $genome_position = $array[2]; $indel_len = $array[3]; + if ($depth{$genome_position} > 0) {$a = ($value / $depth{$genome_position}); $tmp_proportion = sprintf ("%.2f",$a);} else {$tmp_proportion = "zero depth here";} + + print OUTINDELKEY "$key\t$value\t$depth{$genome_position}\t$tmp_proportion\n"; + if ($tmp_proportion > 0.1) { + + if ($indel_len > 6) {print "\n\nCtSo\nInsertion or deletion\t$array[1]\nLocation\t$genome_position\nIndel size\t$indel_len\nObservation count\t$value\nDepth at this location\t$depth{$genome_position}\nProportion of observation vs depth\t$tmp_proportion\n\n"; + print CTSO "\n\nCtSo\nInsertion or deletion\t$array[1]\nLocation\t$genome_position\nIndel size\t$indel_len\nObservation count\t$value\nDepth at this location\t$depth{$genome_position}\nProportion of observation vs depth\t$tmp_proportion\n\n"; + print OUTINDELKEYSIG "$key\t$value\t$depth{$genome_position}\t$tmp_proportion\n"; + if (($value > 9) or ($depth{$genome_position} > 9)) {$warning = $warning."$key\t$value\t$depth{$genome_position}\t$tmp_proportion\n";} + } + #print OUTINDELKEYSIG "$key\t$value\t$depth{$genome_position}\t$tmp_proportion\n"; + } + +} +if ($warning ne "") {print OUTINDELKEYSIG "\n\nEspecially take a moment to look at these in the above list...CtSo!\n\n$warning\n";} +} +