Mercurial > repos > xuebing > sharplabtool
view tools/regVariation/microsatellite_birthdeath.pl @ 2:c2a356708570
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:45:42 -0500 |
parents | 9071e359b9a3 |
children |
line wrap: on
line source
#!/usr/bin/perl -w use strict; use warnings; use Term::ANSIColor; use Pod::Checker; use File::Basename; use IO::Handle; use Cwd; use File::Path qw(make_path remove_tree); use File::Temp qw/ tempfile tempdir /; my $tdir = tempdir( CLEANUP => 0 ); chdir $tdir; my $dir = getcwd; #print "current dit=$dir\n"; use vars qw (%treesToReject %template $printer $interr_poscord $interrcord $no_of_interruptionscord $stringfile @tags $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species $gapcord %thresholdhash $tree_decipherer @sp_ident %revHash %sameHash %treesToIgnore %alternate @exactspecies @exacttags); use FileHandle; use IO::Handle; # 5.004 or higher #my @ar = ("/Users/ydk/work/rhesus_microsat/results/galay/chr22_5sp.maf.txt", "/Users/ydk/work/rhesus_microsat/results/galay/dataset_11.dat", #"/Users/ydk/work/rhesus_microsat/results/galay/chr22_5spec.maf.summ","hg18,panTro2,ponAbe2,rheMac2,calJac1","((((hg18, panTro2), ponAbe2), rheMac2), calJac1)","9,10,12,12", #"10","0.8"); my @ar = @ARGV; my ($maf, $orth, $summout, $species_set, $tree_definition, $thresholds, $FLANK_SUPPORT, $SIMILARITY_THRESH) = @ar; $SIMILARITY_THRESH=$SIMILARITY_THRESH/100; ######################### $SIMILARITY_THRESH = $SIMILARITY_THRESH/100; my $EDGE_DISTANCE = 10; my $COMPLEXITY_SUPPORT = 20; load_thresholds("9_10_12_12"); ######################### my $complexity=int($COMPLEXITY_SUPPORT * (1/40)); #print "complexity=$complexity\n"; #<STDIN>; #$printer = 1; my $rando = int(rand(1000)); my $localdate = `date`; $localdate =~ /([0-9]+):([0-9]+):([0-9]+)/; my $info = $rando.$1.$2.$3; #--------------------------------------------------------------------------- # GETTING INPUT INFORMATION AND OPENING INPUT AND OUTPUT FILES my @thresharr = (0, split(/,/,$thresholds)); my $randno=int(rand(100000)); my $megamatch = $randno.".megamatch.net.axt"; #"/gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr1.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt"; my $megamatchlck = $megamatch.".lck"; unlink $megamatchlck; #my $selected= $orth; #my $eventfile = $orth; #$selected = $selected."_SELECTED"; #$selected = $selected."_".$SIMILARITY_THRESH; #my $runtime = $selected.".runtime"; my $inputtags = "H:C:O:R:M"; $inputtags = $ARGV[3] if exists $ARGV[3] && $ARGV[3] =~ /[A-Z]:[A-Z]/; my @all_tags = split(/:/, $inputtags); my $inputsp = "hg18:panTro2:ponAbe2:rheMac2:calJac1"; $inputsp = $ARGV[4] if exists $ARGV[4] && $ARGV[3] =~ /[0-9]+:/; @sp_ident = split(/:/,$inputsp); my $junkfile = $orth."_junk"; my $sh = load_sameHash(1); my $rh = load_revHash(1); #print "inputs are : \n"; foreach(@ARGV){print $_,"\n";} #open (SELECT, ">$selected") or die "Cannot open selected file: $selected: $!"; open (SUMMARY, ">$summout") or die "Cannot open summout file: $summout: $!"; #open (RUN, ">$runtime") or die "Cannot open orth file: $runtime: $!"; #my $ctlfile = "baseml\.ctl"; #$ARGV[4]; #my $treefile = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/"; #1 THIS IS THE THE TREE UNDER CONSIDERATION, IN NEWICK my %registeredTrees = (); my @removalReasons = ("microsatellite is compound", "complex structure", "if no. if micros is more than no. of species", "if more than one micro per species ", "if microsat contains N", "different motif than required ", "more than zero interruptions", "microsat could not form key ", "orthologous microsats of different motif size ", "orthologous microsats of different motifs ", "microsats belong to different alignment blocks altogether", "microsat near edge", "microsat in low complexity region", "microsat flanks dont align well", "phylogeny not informative"); my %allowedhash=(); #--------------------------------------------------------------------------- # WORKING ON MAKING THE MEGAMATCH FILE my $chromt=int(rand(10000)); my $p_chr=$chromt; $tree_definition=~s/,/, /g; $tree_definition =~ s/, +/, /g; my @exactspeciesset_unarranged = split(/,/,$species_set); my $largesttree = "$tree_definition;"; $tree_definition=~s/[\)\(, ]/\t/g; my @treespecies=split(/\t+/,$tree_definition); foreach my $spec (@treespecies){ foreach my $espec (@exactspeciesset_unarranged){ push @exactspecies, $spec if $spec eq $espec; } } #print "exactspecies=@exactspecies\n"; my $focalspec = $exactspecies[0]; my $arranged_species_set=join(".",@exactspecies); @exacttags=@exactspecies; foreach my $extag (@exacttags){ $extag =~ s/hg18/H/g; $extag =~ s/panTro2/C/g; $extag =~ s/ponAbe2/O/g; $extag =~ s/rheMac2/R/g; $extag =~ s/calJac1/M/g; } my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt"); #print "sending to maftoAxt_multispecies: $maf, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n"; maftoAxt_multispecies($maf, $tree_definition, $chr_name, $species_set); my @filterseqfiles= ($chr_name); $largesttree =~ s/hg18/H/g; $largesttree =~ s/panTro2/C/g; $largesttree =~ s/ponAbe2/O/g; $largesttree =~ s/rheMac2/R/g; $largesttree =~ s/calJac1/M/g; #--------------------------------------------------------------------------- my ($lagestnodes, $largestbranches) = get_nodes($largesttree); shift (@$lagestnodes); my @extendedtitle=(); my $title = (); my $parttitle = (); my @titlearr = (); my @firsttitle=($focalspec."chrom", $focalspec."start", $focalspec."end", $focalspec."motif", $focalspec."motifsize", $focalspec."threshold"); my @finames= qw(chr start end motif motifsize microsat mutation mutation.position mutation.from mutation.to insertion.details deletion.details); my @fititle=(); foreach my $spec (split(",",$species_set)){ push @fititle, $spec; foreach my $name (@finames){ push @fititle, $spec.".".$name; } } my @othertitle=qw(somechr somestart somened event source); my @fnames = (); push @fnames, qw(insertions_num deletions_num motinsertions_num motinsertionsf_num motdeletions_num motdeletionsf_num noninsertions_num nondeletions_num) ; push @fnames, qw(binsertions_num bdeletions_num bmotinsertions_num bmotinsertionsf_num bmotdeletions_num bmotdeletionsf_num bnoninsertions_num bnondeletions_num) ; push @fnames, qw(dinsertions_num ddeletions_num dmotinsertions_num dmotinsertionsf_num dmotdeletions_num dmotdeletionsf_num dnoninsertions_num dnondeletions_num) ; push @fnames, qw(ninsertions_num ndeletions_num nmotinsertions_num nmotinsertionsf_num nmotdeletions_num nmotdeletionsf_num nnoninsertions_num nnondeletions_num) ; push @fnames, qw(substitutions_num bsubstitutions_num dsubstitutions_num nsubstitutions_num indels_num subs_num); my @fullnames = (); foreach my $lnode (@$lagestnodes){ my @pair = @$lnode; my @nodemutarr = (); for my $p (@pair){ # print "p = $p\n"; $p =~ s/[\(\), ]+//g; $p =~ s/H/hg18/g; $p =~ s/C/panTro2/g; $p =~ s/O/ponAbe2/g; $p =~ s/R/rheMac2/g; $p =~ s/M/calJac1/g; foreach my $n (@fnames) { push @fullnames, $p.".".$n;} } } print SUMMARY "#",join("\t", @firsttitle, @fititle, @othertitle); print SUMMARY "\t",join("\t", @fullnames); #$title = $title."\t".join("\t", @fullnames); print SUMMARY "\t",join("\t", @fnames); #$title= $title."\t".join("\t", @fnames); print SUMMARY "\t","tree","\t", "cleancase", "\n"; #$title= $title."\t"."tree"."\t"."cleancase". "\n"; #print $title; #<STDIN>; #print "all_tags = @all_tags\n"; for my $no (3 ... $#all_tags+1){ # print "no=$no\n"; #<STDIN>; @tags = @all_tags[0 ... $no-1]; #print "tags = = @tags\n" if $printer == 1; %template=(); my @nextcounter = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); #next if scalar(@tags) < 4; #print "now doing tags = @tags, no = $no\n"; open (ORTH, "<$orth") or die "Cannot open orth file: $orth: $!"; # print SUMMARY join "\t", qw (species chr start end branch motif microsat mutation position from to insertion deletion); ##################### T E M P O R A R Y ##################### my @finaltitle=(); my @singletitle = qw (species chr start end motif motifsize microsat strand microsatsize col10 col11 col12 col13); my $endtitle = (); foreach my $tag (@tags){ my @tempsingle = (); foreach my $single (@singletitle){ push @tempsingle, $tag.$single; } @finaltitle = (@finaltitle, @tempsingle); } # print SUMMARY join("\t",@finaltitle),"\n"; ############################################################# #--------------------------------------------------------------------------- # GET THE TREE FROM TREE FILE my $tree = (); $tree = "((H, C), O)" if $no == 3; $tree = "(((H, C), O), R)" if $no == 4; $tree = "((((H, C), O), R), M)" if $no == 5; # $tree=~s/;$//g; # print "our tree = $tree\n"; #--------------------------------------------------------------------------- # LOADING HASH CONTAINING ALL POSSIBLE TREES: $tree_decipherer = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/tree_analysis_".join("",@tags).".txt"; load_allPossibleTrees($tree_decipherer, \%template, \%alternate); #--------------------------------------------------------------------------- # LOADING THE TREES TO REJECT FOR BIRTH ANALYSIS %treesToReject=(); %treesToIgnore=(); load_treesToReject(@tags); load_treesToIgnore(@tags); #--------------------------------------------------------------------------- # LOADING INPUT DATA INTO HASHES AND ARRAYS #1 THIS IS THE POINT WHERE WE CAN FILTER OUT LARGE MICROSAT CLUSTERS #2 AS WELL AS MULTIPLE-ALIGNMENT-BLOCKS-SPANNING MICROSATS (KIND OF #3 IMPLICIT IN THE FIRST PART OF THE SENTENCE ITSELF IN MOST CASES). my %orths=(); my $counterm = 0; my $loaded = 0; my %seen = (); my @allowedchrs = (); # print "no = $no\n"; #<STDIN>; while (my $line = <ORTH>){ #print "line=$line\n"; $line =~ s/>hg18/>H/g; $line =~ s/>panTro2/>C/g; $line =~ s/>ponAbe2/>O/g; $line =~ s/>rheMac2/>R/g; $line =~ s/>calJac1/>M/g; my @micros = split(/>/,$line); # LOADING ALL THE MICROSAT ENTRIES FROM THE CLUSTER INTO @micros #print "micros=",printarr(@micros),"\n"; #<STDIN>; shift @micros; # EMPTYING THE FIRST, EMTPY ELEMENT OF THE ARRAY $no_of_species = adjustCoordinates($micros[0]); next if $no_of_species != $no; $counterm++; #------------------------------------------------ $nextcounter[0]++ if $line =~ /compound/; next if $line =~ /compound/; # GETTING RID OF COMPOUND MICROSATS #------------------------------------------------ #next if $line =~ /[A-Za-z]>[a-zA-Z]/; #------------------------------------------------ chomp $line; my $match_count = ($line =~ s/>/>/g); # COUNTING THE NUMBER OF MICROSAT ENTRIES IN THE CLUSTER #print "number of species = $match_count\n"; my $stopper = 0; foreach my $mic (@micros){ my @local = split(/\t/,$mic); if ($local[$typecord] =~ /\./ || exists($local[$no_of_interruptionscord+2])) {$stopper = 1; $nextcounter[1]++; last; } # REMOVING CLUSTERS WITH THE CYRPTIC, (UNRESOLVABLY COMPLEX) MICROSAT ENTRIES IN THEM } next if $stopper ==1; #------------------------------------------------ $nextcounter[2]++ if (scalar(@micros) >$no_of_species); next if (scalar(@micros) >$no_of_species); #1 REMOVING MICROSAT CLUSTERS WITH MORE NUMBER OF MICROSAT ENTRIES THAN THE NUMBER OF SPECIES IN THE DATASET. #2 THIS IS SO BECAUSE SUCH CLUSTERS IMPLY THAT IN AT LEAST ONE SPECIES, THERE IS MORE THAN ONE MICROSAT ENTRY #3 IN THE CLUSTER. THUS, HERE WE ARE GETTING RID OF MICROSATS CLUSTERS THAT INCLUDE MULTUPLE, NEIGHBORING #4 MICROSATS, AND STICK TO CLEAN MICROSATS THAT DO NOT HAVE ANY MICROSATS IN NEIGHBORHOOD. #5 THIS 'NEIGHBORHOOD-RANGE' HAD BEEN DECIDED PREVIOUSLY IN OUR CODE multiSpecies_orthFinder4.pl my $nexter = 0; foreach my $tag (@tags){ my $tagcount = ($line =~ s/>$tag\t/>$tag\t/g); if ($tagcount > 1) { $nexter =1; #print colored ['red'],"multiple entires per species : $tagcount of $tag\n" if $printer == 1; next; } } if ($nexter == 1){ $nextcounter[3]++; next; } #------------------------------------------------ foreach my $mic (@micros){ #1 REMOVING MICROSATELLITES WITH ANY 'N's IN THEM my @local = split(/\t/,$mic); if ($local[$microsatcord] =~ /N/) {$stopper =1; $nextcounter[4]++; last;} } next if $stopper ==1; #print "till here 1\n"; #<STDIN>; #------------------------------------------------ my @micros_copy = @micros; my $tempmicro = shift(@micros_copy); #1 CURRENTLY OBTAINING INFORMATION FOR THE FIRST #2 MICROSAT IN THE CLUSTER. my @tempfields = split(/\t/,$tempmicro); my $prevtype = $tempfields[$typecord]; my $tempmotif = $tempfields[$motifcord]; my $tempfirstmotif = (); if (scalar(@tempfields) > $microsatcord + 2){ if ($tempfields[$no_of_interruptionscord] >= 1) { #1 DISCARDING MICROSATS WITH MORE THAN ZERO INTERRUPTIONS #2 IN THE FIRST MICROSAT OF THE CLUSTER $nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1; } } if ($nexter == 1){ $nextcounter[6]++; next; } #1 DONE OBTAINING INFORMATION REGARDING #2 THE FIRST MICROSAT FROM THE CLUSTER if ($tempmotif =~ /^\[/){ $tempmotif =~ s/^\[//g; $tempmotif =~ /([a-zA-Z]+)\].*/; $tempfirstmotif = $1; #1 OBTAINING THE FIRTS MOTIF OF MICROSAT } else {$tempfirstmotif = $tempmotif;} my $prevmotif = $tempfirstmotif; my $key = (); if ($tempmicro =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { $key = join("\t",$1, $2, $4, $5); } else{ # print "counld not form a key \n" if $printer == 1; $nextcounter[7]++; next; } #----------------- #1 NOW, AFTER OBTAINING INFORMATION ABOUT #2 THE FIRST MICROSAT IN THE CLUSTER, THE #3 FOLLOWING LOOP GOES THROUGH THE OTHER MICROSATS #4 TO SEE IF THEY SHARE THE REQUIRED FEATURES (BELOW) foreach my $micro (@micros_copy){ my @fields = split(/\t/,$micro); #----------------- if (scalar(@fields) > $microsatcord + 2){ #1 DISCARDING MICROSATS WITH MORE THAN ONE INTERRUPTIONS if ($fields[$no_of_interruptionscord] >= 1) {$nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1; $nextcounter[6]++; last; } } #----------------- if (($prevtype ne "0") && ($prevtype ne $fields[$typecord])) { $nexter =1; #print colored ['yellow'],"microsat of different type \n" if $printer == 1; $nextcounter[8]++; last; } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG #----------------- #2 TO DIFFERENT TYPES (MONOS, DIS, TRIS ETC.) $prevtype = $fields[$typecord]; my $motif = $fields[$motifcord]; my $firstmotif = (); if ($motif =~ /^\[/){ $motif =~ s/^\[//g; $motif =~ /([a-zA-Z]+)\].*/; $firstmotif = $1; } else {$firstmotif = $motif;} my $motifpattern = $firstmotif.$firstmotif; my $prevmotifpattern = $prevmotif.$prevmotif; if (($prevmotif ne "0")&&(($motifpattern !~ /$prevmotif/i)||($prevmotifpattern !~ /$firstmotif/i)) ) { $nexter =1; #print colored ['green'],"different motifs used \n$line\n" if $printer == 1; $nextcounter[9]++; last; } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG #2 TO DIFFERENT MOTIFS my $prevmotif = $firstmotif; #----------------- for my $t (0 ... $#tags){ #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSAT ENTRIES BELONG #2 DIFFERENT ALIGNMENT BLOCKS if ($micro =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { my $key2 = join("\t",$1, $2, $4, $5); if ($key2 ne $key){ # print "microsats belong to diffferent alignment blocks altogether\n" if $printer == 1; $nextcounter[10]++; $nexter = 1; last; } } else{ # print "counld not form a key \n" if $printer == 1; $nexter = 1; last; } } } ##################### if ($nexter == 1){ # print "nexting\n" if $printer == 1; next; } else{ # print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n$key:\n$line\nvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n" if $printer == 1; push (@{$orths{$key}},$line); $loaded++; if ($line =~ /($focalspec)\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) { # print "$line\n" if $printer == 1; #if $line =~ /Contig/; # print "################ ################\n" if $printer == 1; push @allowedchrs, $2 if !exists $allowedhash{$2}; $allowedhash{$2} = 1; my $key = join("\t",$1, $2, $3, $4); #print "print the shit: $key\n" if $printer == 1; $seen{$key} = 1; } else { #print "Key could not be formed in SPUT for ($org) ($title) ([0-9]+) ([0-9]+)\n"; } } } close ORTH; # print "now studying where we lost microsatellites: @nextcounter\n"; for my $reason (0 ... $#nextcounter){ # print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n"; } # print "\ntotal number of keys formed = ", scalar(keys %orths), " = \n"; # print "done filtering .. counterm = $counterm and loaded = $loaded\n"; #---------------------------------------------------------------------------------------------------------------- # NOW GENERATING THE ALIGNMENT FILE WITH RELELEVENT ALIGNMENTS STORED ONLY. while (1){ if (-e $megamatchlck){ # print "waiting to write into $megamatchlck\n"; sleep 10; } else{ open (MEGAMLCK, ">$megamatchlck") or die "Cannot open megamatchlck file $megamatchlck: $!"; open (MEGAM, ">$megamatch") or die "Cannot open megamatch file $megamatch: $!"; last; } } foreach my $seqfile (@filterseqfiles){ my $fullpath = $seqfile; # print "opening file: $fullpath\n"; open (MATCH, "<$fullpath") or die "Cannot open MATCH file $fullpath: $!"; my $matchlines = 0; while (my $line = <MATCH>) { if ($line =~ /($focalspec)\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) { my $key = join("\t",$1, $2, $3, $4); if (exists $seen{$key}){ while (1){ $matchlines++; print MEGAM $line; $line = <MATCH>; print MEGAM "\n" if $line !~/[0-9a-zA-Z]/; last if $line !~/[0-9a-zA-Z]/; } } } } # print "matchlines = $matchlines\n"; close MATCH; } close MEGAMLCK; unlink $megamatchlck; close MEGAM; undef %seen; #---------------------------------------------------------------------------------------------------------------- #--------------------------------------------------------------------------- # NOW, AFTER FILTERING MANY MICROSATS, AND LOADING THE FILTERED ONES INTO # THE HASH %orths , WE GO THROUGH THE ALIGNMENT FILE, AND STUDY THE # FLANKING SEQUENCES OF ALL THESE MICROSATS, TO FILTER THEM FURTHER #$printer = 1; my $microreadcounter=0; my $contigsentered=0; my $contignotrightcounter=0; my $keynotformedcounter=0; my $keynotfoundcounter= 0; my $dotcounter = 0; open (BO, "<$megamatch") or die "Cannot open alignment file: $megamatch: $!"; while (my $line = <BO>){ # print "." if $dotcounter % 100 ==0; # print "\n" if $dotcounter % 5000 ==0; # print "dotcounter = $dotcounter\n " if $printer == 1; next if $line !~ /^[0-9]+/; $dotcounter++; # print colored ['green'], "~" x 60, "\n" if $printer == 1; # print colored ['green'], $line;# if $printer == 1; chomp $line; my @fields2 = split(/\t/,$line); my $key2 = (); my $alignment_no = (); #1 TEMPORARY if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { $key2 = join("\t",$1, $2, $4, $5); $alignment_no=$1; } else {print "seq line $line incompatible\n"; $keynotformedcounter++; next;} $no_of_species = adjustCoordinates($line); $contignotrightcounter++ if $no_of_species != $no; # print "contignotrightcounter=$contignotrightcounter\n"; # print "no_of_species=$no_of_species\n"; # print "no=$no\n"; next if $no_of_species != $no; # print "key = $key2\n" if $printer == 1; my @clusters = (); #1 EXTRACTING MICROSATS CORRESPONDING TO THIS #2 ALIGNMENT BLOCK if (exists($orths{$key2})){ @clusters = @{$orths{$key2}}; $contigsentered++; delete $orths{$key2}; } else{ #print "orth does not exist\n"; $keynotfoundcounter++; next; } my %sequences=(); #1 WILL STORE SEQUENCES IN THE CURRENT ALIGNMENT BLOCK my $humseq = (); foreach my $tag (@tags){ #1 READING THE ALIGNMENT FILE AND CAPTURING SEQUENCES my $seq = <BO>; #2 OF ALL SPECIES. chomp $seq; $sequences{$tag} = " ".$seq; #print "sequences = $sequences{$tag}\n" if $printer == 1; $humseq = $seq if $tag =~ /H/; } foreach my $cluster (@clusters){ #1 NOW, GOING THROUGH THE CLUSTER OF MICROSATS #print "x" x 60, "\n" if $printer == 1; #print colored ['red'],"cluster = $cluster\n"; $largesttree =~ s/hg18/H/g; $largesttree =~ s/panTro2/C/g; $largesttree =~ s/ponAbe2/O/g; $largesttree =~ s/rheMac2/R/g; $largesttree =~ s/calJac1/M/g; $microreadcounter++; my @micros = split(/>/,$cluster); shift @micros; my $edge_microsat=0; #1 THIS WILL HAVE VALUE "1" IF MICROSAT IS FOUND #2 TO BE TOO CLOSE TO THE EDGES OF ALIGNMENT BLOCK my @starts= (); my %start_hash=(); #1 STORES THE START AND END COORDINATES OF MICROSATELLITES my @ends = (); my %end_hash=(); #2 SO THAT LATER, WE WILL BE ABLE TO FIND THE EXTREME #3 COORDINATE VALUES OF THE ORTHOLOGOUS MIROSATELLITES. my %microhash=(); my %microsathash=(); my %nonmicrosathash=(); my $motif=(); #1 BASIC MOTIF OF THE MICROSATELLITE.. THERE'S ONLY 1 #print "tags=@tags\n"; for my $i (0 ... $#tags){ #1 FINDING THE MICROSAT, AND THE ALIGNMENT SEQUENCE #2 CORRESPONDING TO THE PARTICULAR SPECIES (AS PER #3 THE VARIABLE $TAG; my $tag = $tags[$i]; # print $seq; my $locus="NULL"; #1 THIS WILL STORE THE MICROSAT OF THIS SPECIES. #2 IF THERE IS NO MICROSAT, IT WILL REMAIN "NULL" foreach my $micro (@micros){ # print "micro=$micro, tag=$tag\n"; if ($micro =~ /^$tag/){ #1 MICROSAT OF THIS SPECIES FOUND.. $locus = $micro; my @fields = split(/\t/,$micro); $motif = $fields[$motifcord]; $microsathash{$tag}=$fields[$microsatcord]; # print "fields=@fields, and startcord=$startcord = $fields[$startcord]\n"; push(@starts, $fields[$startcord]); push(@ends, $fields[$endcord]); $start_hash{$tag}=$fields[$startcord]; $end_hash{$tag}=$fields[$endcord]; last; } else{$microsathash{$tag}="NULL"} } $microhash{$tag}=$locus; } my $extreme_start = smallest_number(@starts); #1 THESE TWO ARE THE EXTREME COORDINATES OF THE my $extreme_end = largest_number(@ends); #2 MICROSAT CLUSTER ACCROSS ALL THE SPECIES IN #3 WHOM IT IS FOUND TO BE ORTHOLOGOUS. #print "starts=@starts... ends=@ends\n"; my %up_flanks = (); #1 CONTAINS UPSTEAM FLANKING REGIONS FOR EACH SPECIES my %down_flanks = (); #1 CONTAINS DOWNDTREAM FLANKING REGIONS FOR EACH SPECIES my %up_largeflanks = (); my %down_largeflanks = (); my %locusandflanks = (); my %locusandlargeflanks = (); my %up_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_start and the #2 ACTUAL START OF MICROSATELLITE IN THE SPECIES my %down_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_end and the #2 ACTUAL end OF MICROSATELLITE IN THE SPECIES my %alignment=(); #1 CONTAINS ACTUAL ALIGNMENT SEQUENCE BETWEEN THE TWO #2 EXTEME VALUES. my %microsatstarts=(); #1 WITHIN EACH ALIGNMENT, IF THERE EXISTS A MICROSATELLITE #2 THIS HASH CONTAINS THE START SITE OF THE MICROSATELLITE #3 WIHIN THE ALIGNMENT next if !defined $extreme_start; next if !defined $extreme_end; next if $extreme_start > length($sequences{$tags[0]}); next if $extreme_start < 0; next if $extreme_end > length($sequences{$tags[0]}); for my $i (0 ... $#tags){ #1 NOW THAT WE HAVE GATHERED INFORMATION REGARDING #2 SEQUENCE ALIGNMENT AND MICROSATELLITE COORDINATES #3 AS WELL AS THE EXTREME COORDINATES OF THE #4 MICROSAT CLUSTER, WE WILL PROCEED TO EXTRACT THE #5 FLANKING SEQUENCE OF ALL ORGS, AND STUDY IT IN #6 MORE DETAIL. my $tag = $tags[$i]; # print "tag=$tag.. seqlength = ",length($sequences{$tag})," extreme_start=$extreme_start and extreme_end=$extreme_end\n"; my $upstream_gaps = (substr($sequences{$tag}, 0, $extreme_start) =~ s/\-/-/g); #1 NOW MEASURING THE NUMBER OF GAPS IN THE UPSTEAM #2 AND DOWNSTREAM SEQUENCES OF THE MICROSATs IN THIS #3 CLUSTER. my $downstream_gaps = (substr($sequences{$tag}, $extreme_end) =~ s/\-/-/g); if (($extreme_start - $upstream_gaps )< $EDGE_DISTANCE || (length($sequences{$tag}) - $extreme_end - $downstream_gaps) < $EDGE_DISTANCE){ $edge_microsat=1; last; } else{ $up_flanks{$tag} = substr($sequences{$tag}, $extreme_start - $FLANK_SUPPORT, $FLANK_SUPPORT); $down_flanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $FLANK_SUPPORT); $up_largeflanks{$tag} = substr($sequences{$tag}, $extreme_start - $COMPLEXITY_SUPPORT, $COMPLEXITY_SUPPORT); $down_largeflanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $COMPLEXITY_SUPPORT); $alignment{$tag} = substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1); $locusandflanks{$tag} = $up_flanks{$tag}."[".$alignment{$tag}."]".$down_flanks{$tag}; $locusandlargeflanks{$tag} = $up_largeflanks{$tag}."[".$alignment{$tag}."]".$down_largeflanks{$tag}; if ($microhash{$tag} ne "NULL"){ $up_internal_flanks{$tag} = substr($sequences{$tag}, $extreme_start , $start_hash{$tag}-$extreme_start); $down_internal_flanks{$tag} = substr($sequences{$tag}, $end_hash{$tag} , $extreme_end-$end_hash{$tag}); $microsatstarts{$tag}=$start_hash{$tag}-$extreme_start; # print "tag = $tag, internal flanks = $up_internal_flanks{$tag} and $down_internal_flanks{$tag} and start = $microsatstarts{$tag}\n" if $printer == 1; } else{ $nonmicrosathash{$tag}=substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1); } # print "up flank for species $tag = $up_flanks{$tag} \ndown flank for species $tag = $down_flanks{$tag} \n" if $printer == 1; } } $nextcounter[11]++ if $edge_microsat==1; next if $edge_microsat==1; my $low_complexity = 0; #1 VALUE WILL BE 1 IF ANY OF THE FLANKING REGIONS #2 IS FOUND TO BE OF LOW COMPLEXITY, BY USING THE #3 FUNCTION sub test_complexity for my $i (0 ... $#tags){ # print "i = $tags[$i]\n" if $printer == 1; if (test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW" || test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW"){ # print "i = $i, low complexity regions: $up_largeflanks{$tags[$i]}: ",test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT), " and $down_largeflanks{$tags[$i]} = ",test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT),"\n" if $printer == 1; $low_complexity =1; last; } } $nextcounter[12]++ if $low_complexity==1; next if $low_complexity == 1; my $sequence_dissimilarity = 0; #1 THIS VALYE WILL BE 1 IF THE SEQUENCE SIMILARITY #2 BETWEEN ANY OF THE SPECIES AGAINST THE HUMAN #3 FLANKING SEQUENCES IS BELOW A CERTAIN THRESHOLD #4 AS DESCRIBED IN FUNCTION sub sequence_similarity my %donepair = (); for my $i (0 ... $#tags){ # print "i = $tags[$i]\n" if $printer == 1; # next if $i == 0; # print colored ['magenta'],"THIS IS UP\n" if $printer == 1; for my $b (0 ... $#tags){ next if $b == $i; my $pair = (); $pair = $i."_".$b if $i < $b; $pair = $b."_".$i if $b < $i; next if exists $donepair{$pair}; my ($up_similarity,$upnucdiffs, $upindeldiffs) = sequence_similarity($up_flanks{$tags[$i]}, $up_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info); my ($down_similarity,$downnucdiffs, $downindeldiffs) = sequence_similarity($down_flanks{$tags[$i]}, $down_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info); $donepair{$pair} = $up_similarity."_".$down_similarity; # print RUN "$up_similarity $upnucdiffs $upindeldiffs $down_similarity $downnucdiffs $downindeldiffs\n"; if ( $up_similarity < $SIMILARITY_THRESH || $down_similarity < $SIMILARITY_THRESH){ $sequence_dissimilarity =1; last; } } } $nextcounter[13]++ if $sequence_dissimilarity==1; next if $sequence_dissimilarity == 1; my ($simplified_microsat, $Hchrom, $Hstart, $Hend, $locusmotif, $locusmotifsize) = summarize_microsat($cluster, $humseq); # print "simplified_microsat=$simplified_microsat\n"; <STDIN>; my ($tree_analysis, $alternative_trees, $conformation) = treeStudy($simplified_microsat); if (exists $treesToReject{$tree_analysis}){ $nextcounter[14]++; next; } # my $adjuster=(); # if ($no_of_species == 4){ # my @sields = split(/\t/,$simplified_microsat); # my $somend = pop(@sields); # my $somestart = pop(@sields); # my $somechr = pop(@sields); # $adjuster = "NA\t" x 13 ; # $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend; # } # if ($no_of_species == 3){ # my @sields = split(/\t/,$simplified_microsat); # my $somend = pop(@sields); # my $somestart = pop(@sields); # my $somechr = pop(@sields); # $adjuster = "NA\t" x 26 ; # $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend; # } # $registeredTrees{$tree_analysis} = 1 if !exists $registeredTrees{$tree_analysis}; $registeredTrees{$tree_analysis}++ if exists $registeredTrees{$tree_analysis}; if (exists $treesToIgnore{$tree_analysis}){ my @appendarr = (); print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; #print "SUMMARY ",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; # print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","NOEVENT", "\t\t", $cluster,"\n"; foreach my $lnode (@$lagestnodes){ my @pair = @$lnode; my @nodemutarr = (); for my $p (@pair){ my @mutinfoarray1 = (); for (1 ... 38){ push (@mutinfoarray1, "NA") } print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; } } for (1 ... 38){ push (@appendarr, "NA") } print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n"; # print "SUMMARY ",join ("\t", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>; next; } my ($mutations_array, $nodes, $branches_hash, $alivehash, $primaryalignment) = peel_onion($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts); if ($mutations_array eq "NULL"){ my @appendarr = (); print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; # print "SUMMARY ", $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; # print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","EVENT", "\t\t", $cluster,"\n"; foreach my $lnode (@$lagestnodes){ my @pair = @$lnode; my @nodemutarr = (); for my $p (@pair){ my @mutinfoarray1 = (); for (1 ... 38){ push (@mutinfoarray1, "NA") } print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; # print join ("\t", "SUMMARY", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; } } for (1 ... 38){ push (@appendarr, "NA") } print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n"; # print join ("\t","SUMMARY", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>; next; } # print "sent: \n" if $printer == 1; # print "nodes = @$nodes, branches array:\n" if $mutations_array ne "NULL" && $printer == 1; my ($newstatus, $newmutations_array, $newnodes, $newbranches_hash, $newalivehash, $finalalignment) = fillAlignmentGaps($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts); # print "newmutations_array returned = \n",join("\n",@$newmutations_array),"\n" if $newmutations_array ne "NULL" && $printer == 1; my @finalmutations_array= (); @finalmutations_array = selectMutationArray($mutations_array, $newmutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array ne "NULL"; @finalmutations_array = selectMutationArray($mutations_array, $mutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array eq "NULL"; my ($besttree, $treescore) = selectBetterTree($tree_analysis, $alternate{$tree_analysis}, \@finalmutations_array); my $cleancase = "UNCLEAN"; $cleancase = checkCleanCase($besttree, $finalalignment) if $treescore > 0 && $finalalignment ne "NULL" && $finalalignment =~ /\!/; $cleancase = checkCleanCase($besttree, $primaryalignment) if $treescore > 0 && $finalalignment eq "NULL" && $primaryalignment =~ /\!/ && $primaryalignment ne "NULL"; $cleancase = "CLEAN" if $finalalignment eq "NULL" && $primaryalignment !~ /\!/ && $primaryalignment ne "NULL"; $cleancase = "CLEAN" if $finalalignment ne "NULL" && $finalalignment !~ /\!/ ; $besttree = "NULL" if $treescore <= 0; print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; # print "SUMMARY ", $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; # print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","EVENT", "\t\t", $cluster,"\n"; my @mutinfoarray =(); foreach my $lnode (@$lagestnodes){ my @pair = @$lnode; my $joint = "(".join(", ",@pair).")"; my @nodemutarr = (); for my $p (@pair){ foreach my $mut (@finalmutations_array){ $mut =~ /node=([A-Z, \(\)]+)/; push @nodemutarr, $mut if $p eq $1; } # print "from pair @pair, p=$p\n"; @mutinfoarray = summarizeMutations(\@nodemutarr, $besttree); print SUMMARY join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t"; # print "SUMMARY ",join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t"; } } @mutinfoarray = summarizeMutations(\@finalmutations_array, $besttree); print SUMMARY join ("\t", @mutinfoarray ),"\t"; print SUMMARY $cleancase, "\n"; # print "SUMMARY ",join ("\t", @mutinfoarray,$cleancase ),"\n"; #<STDIN>; # print "summarized\n"; <STDIN>; my %indelcatch = (); my %substcatch = (); my %typecatch = (); my %nodescatch = (); my $mutconcat = join("\t", @finalmutations_array)."\n"; my %indelposcatch = (); my %subsposcatch = (); foreach my $fmut ( @finalmutations_array){ # next if $fmut !~ /indeltype=[a-zA-Z]+/; #print RUN $fmut, "\n"; $fmut =~ /node=([a-zA-Z, \(\)]+)/; my $lnode = $1; $nodescatch{$1}=1; if ($fmut =~ /type=substitution/){ # print "fmut=$fmut\n"; $fmut =~ /from=([a-zA-Z\-]+)\tto=([a-zA-Z\-]+)/; my $from=$1; # print "from=$from\n"; my $to=$2; # print "to=$to\n"; push @{$substcatch{$lnode}} , ("from:".$from." to:".$to); $fmut =~ /position=([0-9]+)/; push @{$subsposcatch{$lnode}}, $1; } if ($fmut =~ /insertion=[a-zA-Z\-]+/){ $fmut =~ /insertion=([a-zA-Z\-]+)/; push @{$indelcatch{$lnode}} , $1; $fmut =~ /indeltype=([a-zA-Z]+)/; push @{$typecatch{$lnode}}, $1; $fmut =~ /position=([0-9]+)/; push @{$indelposcatch{$lnode}}, $1; } if ($fmut =~ /deletion=[a-zA-Z\-]+/){ $fmut =~ /deletion=([a-zA-Z\-]+)/; push @{$indelcatch{$lnode}} , $1; $fmut =~ /indeltype=([a-zA-Z]+)/; push @{$typecatch{$lnode}}, $1; $fmut =~ /position=([0-9]+)/; push @{$indelposcatch{$lnode}}, $1; } } # print $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t" if $printer == 1; # print join ("<\t>", @mutinfoarray),"\n" if $printer == 1; # print "where mutinfoarray = @mutinfoarray\n" if $printer == 1; # #print RUN "."; # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; # print colored ['red'],"finalmutations_array=\n" if $printer == 1; foreach (@finalmutations_array) { # print colored ['red'], "$_\n" if $_ =~ /type=substitution/ && $printer == 1 ; # print colored ['yellow'], "$_\n" if $_ !~ /type=substitution/ && $printer == 1 ; }# if $line =~ /cal/;# && $line =~ /chr4/; # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; # print "tree analysis = $tree_analysis\n" if $printer == 1; # my $mutations = "@$mutations_array"; next; for my $keys (@$nodes) {foreach my $key (@$keys){ #print "key = $key, => $branches_hash->{$key}\n"; } # print "x" x 50, "\n"; } my ($birth_steps, $death_steps) = decipher_history($mutations_array,join("",@tags),$nodes,$branches_hash,$tree_analysis,$conformation, $alivehash, $simplified_microsat); } } close BO; # print "now studying where we lost microsatellites:"; # print "x" x 60,"\n"; for my $reason (0 ... $#nextcounter){ # print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n"; } # print "x" x 60,"\n"; # print "In total we read $microreadcounter microsatellites after reading through $contigsentered contigs\n"; # print " we lost $keynotformedcounter contigs as they did not form the key, \n"; # print "$contignotrightcounter contigs as they were not of the right species configuration\n"; # print "$keynotfoundcounter contigs as they did not contain the microsats\n"; # print "... In total we went through a file that had $dotcounter contigs...\n"; # print join ("\n","remaining orth keys = ", (keys %orths),""); # print "now printing counted trees: \n"; if (scalar(keys %registeredTrees) > 0){ foreach my $keyb ( sort (keys %registeredTrees) ) { # print "$keyb : $registeredTrees{$keyb}\n"; } } } my @summarizarr = ("+C=+C +R.+C -HCOR,+C", "+H=+H +R.+H -HCOR,+H", "-C=-C -R.-C +HCOR,-C", "-H=-H -R.-H +HCOR,-H", "+HC=+HC", "-HC=-HC", "+O=+O -HCOR,+O", "-O=-O +HCOR,-O", "+HCO=+HCO", "-HCO=-HCO", "+R=+R +R.+C +R.+H", "-R=-R -R.-C -R.-H"); foreach my $line (@summarizarr){ next if $line !~ /[A-Za-z0-9]/; # print $line; chomp $line; my @fields = split(/=/,$line); # print "title = $fields[0]\n"; my @parts=split(/ +/, $fields[1]); my %partshash = (); foreach my $part (@parts){$partshash{$part}=1;} my $count=0; foreach my $key ( sort keys %registeredTrees ){ next if !exists $partshash{$key}; # print "now adding $registeredTrees{$key} from $key\n"; $count+=$registeredTrees{$key}; } # print "$fields[0] : $count\n"; } #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- sub largest_number{ my $counter = 0; my($max) = shift(@_); foreach my $temp (@_) { #print "finding largest array: $maxcounter \n"; if($temp > $max){ $max = $temp; } } return($max); } sub smallest_number{ my $counter = 0; my($min) = shift(@_); foreach my $temp (@_) { #print "finding largest array: $maxcounter \n"; if($temp < $min){ $min = $temp; } } return($min); } #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- sub baseml_parser{ my $outputfile = $_[0]; open(BOUT,"<$outputfile") or die "Cannot open output of upstream baseml $outputfile: $!"; my @info = (); my @branchields = (); my @distanceields = (); my @bout = <BOUT>; #print colored ['red'], @bout ,"\n"; for my $b (0 ... $#bout){ my $bine=$bout[$b]; #print colored ['yellow'], "sentence = ",$bine; if ($bine =~ /TREE/){ $bine=$bout[$b++]; $bine=$bout[$b++]; $bine=$bout[$b++]; #print "FOUND",$bine; chomp $bine; $bine =~ s/^\s+//g; @branchields = split(/\s+/,$bine); $bine=$bout[$b++]; chomp $bine; $bine =~ s/^\s+//g; @distanceields = split(/\s+/,$bine); #print "LASTING..............\n"; last; } else{ } } close BOUT; # print "branchfields = @branchields and distanceields = @distanceields\n" if $printer == 1; my %distance_hash=(); for my $d (0 ... $#branchields){ $distance_hash{$branchields[$d]} = $distanceields[$d]; } $info[0] = $distance_hash{"9..1"} + $distance_hash{"9..2"}; $info[1] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+ $distance_hash{"8..3"}; $info[2] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"7..4"}; $info[3] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"6..7"}+$distance_hash{"6..5"}; # print "\nsending back: @info\n" if $printer == 1; return join("\t",@info); } #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- sub test_complexity{ my $printer = 0; my $sequence = $_[0]; my $COMPLEXITY_SUPPORT = $_[1]; my $complexity=int($COMPLEXITY_SUPPORT * (1/40)); #1 THIS IS AN ARBITRARY THRESHOLD SET FOR LOW COMPLEXITY. #2 THE INSPIRATION WAS WEB MILLER'S MAIL SENT ON #3 19 Apr 2008 WHERE HE CLASSED AS HIGH COMPLEXITY #4 REGION, IF 40 BP OF SEQUENCE HAS AT LEAST 3 OF #5 EACH NUCLEOTIDE. HENCE, I NORMALIZE THIS PARAMETER #6 FOR THE ACTUAL LENGTH OF $FLANK_SUPPORT SET BY #7 THE USER. #8 WEB MILLER SENT THE MAIL TO YDK104@PSU.EDU my $As = ($sequence=~ s/A/A/gi); my $Ts = ($sequence=~ s/T/T/gi); my $Gs = ($sequence=~ s/G/G/gi); my $Cs = ($sequence=~ s/C/C/gi); #print "seq = $sequence, As=$As, Ts=$Ts, Gs=$Gs, Cs=$Cs\n" if $printer == 1; my $ans = (); return "HIGH" if $As >= $complexity && $Ts >= $complexity && $Cs >= $complexity && $Gs >= $complexity; my @nts = ("A","T","G","C","-"); my $lowcomplex = 0; foreach my $nt (@nts){ $lowcomplex =1 if $sequence =~ /(($nt\-*){10,})/i; # print "caught with a mono of $nt : $1 in $sequence\n" if $sequence =~ /(($nt\-*){10,})/i; $lowcomplex =1 if $sequence =~ /(($nt[A-Za-z]){10,})/i; $lowcomplex =1 if $sequence =~ /(([A-Za-z]$nt){10,})/i; # print "caught with a di with $nt : $2 in $sequence\n" if $sequence =~ /(($nt[A-Za-z]){10,})/i || $sequence =~ /(([A-Za-z]$nt){10,})/i; my $nont = ($sequence=~ s/$nt/$nt/gi); } # print "leaving for now.. $sequence\n" if $printer == 1 && $lowcomplex == 0; #<STDIN>; return "HIGH" if $lowcomplex == 0; return "LOW" ; } #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- sub sequence_similarity{ my $printer = 0; my @seq1 = split(/\s*/, $_[0]); my @seq2 = split(/\s*/, $_[1]); my $similarity_thresh = $_[2]; my $info = $_[3]; # print "input = @_\n" if $printer == 1; my $seq1str = $_[0]; my $seq2str = $_[1]; $seq1str=~s/\-//g; $seq2str=~s/\-//g; my $similarity=0; my $nucdiffs=0; my $nucsims=0; my $indeldiffs=0; for my $i (0...$#seq1){ $similarity++ if $seq1[$i] =~ /$seq2[$i]/i ; #|| $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i ; $nucsims++ if $seq1[$i] =~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i); $nucdiffs++ if $seq1[$i] !~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i); $indeldiffs++ if $seq1[$i] !~ /$seq2[$i]/i && $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i; } my $sim = $similarity/length($_[0]); return ( $sim, $nucdiffs, $indeldiffs ); #<= $similarity_thresh; } #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- sub load_treesToReject{ my @rejectlist = (); my $alltags = join("",@_); @rejectlist = qw (-HCOR +HCOR) if $alltags eq "HCORM"; @rejectlist = qw ( -HCO|+R +HCO|-R) if $alltags eq "HCOR"; @rejectlist = qw ( -HC|+O +HC|-O) if $alltags eq "HCO"; %treesToReject=(); $treesToReject{$_} = $_ foreach (@rejectlist); #print "loaded to reject for $alltags; ", $treesToReject{$_},"\n" foreach (@rejectlist); #<STDIN>; } #-------------------------------------------------------------------------------------------------------- sub load_treesToIgnore{ my @rejectlist = (); my $alltags = join("",@_); @rejectlist = qw (-HCOR +HCOR +HCORM -HCORM) if $alltags eq "HCORM"; @rejectlist = qw ( -HCO|+R +HCO|-R +HCOR -HCOR) if $alltags eq "HCOR"; @rejectlist = qw ( -HC|+O +HC|-O +HCO -HCO) if $alltags eq "HCO"; %treesToIgnore=(); $treesToIgnore{$_} = $_ foreach (@rejectlist); #print "loaded ", $treesToIgnore{$_},"\n" foreach (@rejectlist); } #-------------------------------------------------------------------------------------------------------- sub load_thresholds{ my @threshold_array=split(/[,_]/,$_[0]); unshift @threshold_array, "0"; for my $size (1 ... 4){ $thresholdhash{$size}=$threshold_array[$size]; } } #-------------------------------------------------------------------------------------------------------- sub load_allPossibleTrees{ #1 THIS FILE STORES ALL POSSIBLE SCENARIOS OF MICROSATELLITE #2 BIRTH AND DEATH EVENTS ON A 5-PRIMATE TREE OF H,C,O,R,M #3 IN FORM OF A TEXT FILE. THIS WILL BE USED AS A TEMPLET #4 TO COMPARE EACH MICROSATELLITE CLUSTER TO UNDERSTAND THE #5 EVOLUTION OF EACH LOCUS. WE WILL THEN DISCARD SOME #6 MICROSATS ACCRODING TO THEIR EVOLUTIONARY BEHAVIOUR ON #7 THE TREE. MOST PROBABLY WE WILL REMOVE THOSE MICROSATS #8 THAT ARE NOT SUFFICIENTLY INFORMATIVE, LIKE IN CASE OF #9 AN OUTGROUP MICROSATELLITE BEING DIFFERENT FRON ALL OTHER #10 SPECIES IN THE TREE. my $tree_list = $_[0]; # print "file to be loaded: $tree_list\n"; my @trarr = (); @trarr = ("#H C O CONCLUSION ALTERNATE", "+ + + +HCO NA", "+ _ _ +H NA", "_ + _ +C NA", "_ _ + -HC|+O NA", "+ _ + -C +H", "_ + + -H +C", "+ + _ +HC|-O NA", "_ _ _ -HCO NA") if $tree_list =~ /_HCO\.txt/; @trarr = ("#H C O R CONCLUSION ALTERNATE", "_ _ _ _ -HCOR NA", "+ + + + +HCOR NA", "+ + + _ +HCO|-R +H.+C.+O", "+ + _ _ +HC +H.+C;-O", "+ _ _ _ +H +HC,-C", "_ + _ _ +C +HC,-H", "_ _ + _ +O -HC|-H.-C", "_ _ + + -HC -H.-C", "+ _ _ + +H|-C.-O +HC,-C", "_ + _ + +C -H.-O", "_ + + _ -H +C.+O", "_ _ _ + -HCO|+R NA", "+ _ + _ +H.+O|-C NA", "_ + + + -H -HC,+C", "+ _ + + -C -HC,+H", "+ + _ + -O +HC") if $tree_list =~ /_HCOR\.txt/; @trarr = ("#H C O R M CONCLUSION ALTERNATE", "+ + + + _ +HCOR NA", "+ + + _ + -R +HCO;+HC.+O;+H.+C.+O", "+ + _ + + -O -HCO,+HC|-HCO,+HC;-HCO,(+H.+C)", "+ _ + + + -C -HC,+H;+HCO,(+H.+O)", "_ + + + + -H -HC,+C;-HCO,(+C.+O)", "_ _ _ _ + -HCOR NA", "_ _ _ + _ +R -HC.-O;-H.-C.-O", "_ _ + _ _ +O +HCO,-HC;+HCO,(-H.-C)", "_ + _ _ _ +C +HC,-H;+HCO,(-H.-O)", "+ _ _ _ _ +H +HC,-C;+HCO,(-C.-O)", "+ + + _ _ +HCO +H.+C.+O", "+ + _ + _ -O +R.+HC|-HCO,+HC;+H.+C.+R|-HCO,(+H.+C)", "+ _ + + _ -C -HC,+H;+H.+O.+R|-HCO,(+H.+O)", "_ + + + _ -H -HC,+C;+C.+O.+R|-HCO,(+C.+O)", "_ _ _ + + -HCO -HC.-O;-H.-C.-O", "_ _ + _ + +O +HCO,-HC;+HCO,(-H.-C)", "_ + _ _ + +C +HC,-H;+HCO,(-H.-O)", "+ _ _ _ + +H -HC,+H;+HCO,(-C.-O)", "+ + _ _ + +HC -R.-O|+HCO,-O|+H.+C;-HCO,+HC;-HCO,(+H.+C)", "+ _ + _ + -R.-C|+HCO,-C|+H.+O NA", "_ + + _ + -R.-H|+HCO,-H|+C.+O NA", "_ _ + + _ -HC +R.+O|-HCO,+O|+HCO,-HC", "_ + _ + _ +R.+C|-HCO,+C|-HC,+C +HCO,(-H.-O)", "+ _ _ + _ +R.+H|-C.-O +HCO,(-C.-O)", "+ _ _ + + -O.-C|-HCO,+H +R.+H;-HCO,(+R.+H)", "_ + _ + + -O.-H|-HCO,+C +R.+C;-HCO,(+R.+C)", "_ + + _ _ +HCO,-H|+O.+C NA", "+ _ + _ _ +HCO,-C|+O.+H NA", "_ _ + + + -HC -H.-C|-HCO,+O", "+ + _ _ _ +HC +H.+C|+HCO,-O|-HCO,+HC;-HCO,(+H.+C)", "+ + + + + +HCORM NA") if $tree_list =~ /_HCORM\.txt/; my $template_p = $_[1]; my $alternate_p = $_[2]; #1 THIS IS THE HASH IN WHICH INFORMATION FROM THE ABOVE FILE #2 GETS STORED, USING THE WHILE LOOP BELOW. HERE, THE KEY #3 OF EACH ROW IS THE EVOLUTIONARY CONFIGURATION OF A LOCUS #4 ON THE PRIMATE TREE, BASED ON PRESENCE/ABSENCE OF A MICROSAT #5 AT THAT LOCUS, LIKE SAY "+ + + _ _" .. EACH COLUMN BELONGS #6 TO ONE SPECIES; HERE THE COLUMN NAMES ARE "H C O R M". #7 THE VALUE FOR EACH ENTRY IS THE MEANING OF THE ABOVE #8 CONFIGURATION (I.E., CONFIGURAION OF THE KEY. HERE, THE #9 VALUE WILL BE +HCO, SIGNIFYING A BIRTH IN HUMAN-CHIMP-ORANG #10 COMMON ANCESTOR. THIS HASH HAS BEEN LOADED HERE TO BE USED #11 LATER BY THE SUBROUTINE sub treeStudy{} THAT STUDIES #12 EVOLUTIONARY CONFIGURAION OF EACH MICROSAT LOCUS, AS #13 MENTIONED ABOVE. my @keys_array=(); foreach my $line (@trarr){ next if $line =~ /^#/; chomp $line; my @fields = split("\t", $line); push @keys_array, $fields[0]; # print "loading: $fields[0]\n"; $template_p->{$fields[0]}[0] = $fields[1]; $template_p->{$fields[0]}[1] = 0; $alternate_p->{$fields[1]} = $fields[2]; } # print "loaded the trees with keys: @keys_array\n"; return $template_p, \@keys_array, $alternate_p; } #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- sub checkCleanCase{ my $printer = 0; my $tree = $_[0]; my $finalalignment = $_[1]; #print "IN checkCleanCase: @_\n"; #<STDIN>; my @indivspecies = $tree =~ /[A-Z]/g; $finalalignment =~ s/\./_/g; my @captured = $finalalignment =~ /[A-Za-z, \(\):]+\![:A-Za-z, \(\)]/g; my $unclean = 0; foreach my $sp (@indivspecies){ foreach my $cap (@captured){ $cap =~ s/:[A-Za-z\-]+//g; my @sps = $cap =~ /[A-Z]+/g; my $spsc = join("", @sps); # print "checking whether imp species $sp is present in $cap i.e, in $spsc\n " if $printer == 1; if ($spsc =~ /$sp/){ # print "foind : $sp\n"; $unclean = 1; last; } } last if $unclean == 1; } #<STDIN>; return "CLEAN" if $unclean == 0; return "UNCLEAN"; } #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- #-------------------------------------------------------------------------------------------------------- sub adjustCoordinates{ my $line = $_[0]; my $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)/x/g; $infocord = 2 + (4*$no_of_species) - 1; $typecord = 2 + (4*$no_of_species) + 1 - 1; $motifcord = 2 + (4*$no_of_species) + 2 - 1; $gapcord = $motifcord+1; $startcord = $gapcord+1; $strandcord = $startcord+1; $endcord = $strandcord + 1; $microsatcord = $endcord + 1; $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; $interr_poscord = $microsatcord + 3; $no_of_interruptionscord = $microsatcord + 4; $interrcord = $microsatcord + 2; #print "$line\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\n" if $printer == 1; return $no_of_species; } sub printhash{ my $alivehash = $_[0]; my @tags = @$_[1]; # print "print hash\n"; foreach my $tag (@tags){ # print "$tag=",$alivehash->{$tag},"\n" if exists $alivehash->{$tag}; } return "\n" } sub peel_onion{ my $printer = 0; # print "received: @_\n" ; #<STDIN>; $printer = 0; my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_; # print "in peel onion.. tree = $tree \n" if $printer == 1; my %sequence_hash=(); # for my $i (0 ... $#sequences){ $sequence_hash{$species[$i]}=$sequences->[$i]; } my %node_sequences=(); my %node_alignments = (); #NEW, Nov 28 2008 my @tags=(); my @locus_sequences=(); my %alivehash=(); foreach my $tag (@$tagarray) { #print "adding: $tag\n"; push(@tags, $tag); $node_sequences{$tag}=join ".",split(/\s*/,$microsathash->{$tag}) if $microsathash->{$tag} ne "NULL"; $alivehash{$tag}= $tag if $microsathash->{$tag} ne "NULL"; $node_sequences{$tag}=join ".",split(/\s*/,$nonmicrosathash->{$tag}) if $microsathash->{$tag} eq "NULL"; $node_alignments{$tag}=join ".",split(/\s*/,$alignment->{$tag}) ; push @locus_sequences, $node_sequences{$tag}; #print "adding to node_seq: $tag = ",$node_alignments{$tag},"\n"; } my ($nodes_arr, $branches_hash) = get_nodes($tree); my @nodes=@$nodes_arr; # print "recieved nodes = " if $printer == 1; # foreach my $key (@nodes) {print "@$key " if $printer == 1;} # print "\n" if $printer == 1; #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS foreach my $keys (@nodes){ my @pair = @$keys; my $joint = "(".join(", ",@pair).")"; my $copykey = join "", @pair; $copykey =~ s/[\W ]+//g; # print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1; my $livestatus = 1; foreach my $copy (split(/\s*/,$copykey)){ $livestatus = 0 if !exists $alivehash{$copy}; } $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1; # print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1; } @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT. my @mutations_array=(); my $joint = (); foreach my $node (@nodes){ my @pair = @$node; # print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1; $joint = "(".join(", ",@pair).")"; my @pair_sequences=(); foreach my $tag (@pair){ # print "$tag: $node_alignments{$tag}\n" if $printer == 1; print $node_alignments{$tag},"\n" if $printer == 1; push @pair_sequences, $node_alignments{$tag}; } # print "ppeel onion joint = $joint , pair_sequences=>@pair_sequences< , pair=>@pair<\n" if $printer == 1; my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint); $node_alignments{$joint}=$compared; push( @mutations_array,split(/:/,$substitutions_list)); # print "newly added to node_sequences: $node_alignments{$joint} and list of mutations =\n", join("\n",@mutations_array),"\n" if $printer == 1; } # print "now sending for analyze_mutations: mutation_array=@mutations_array, nodes=@nodes, branches_hash=$branches_hash, alignment=$alignment, tags=@tags, alivehash=\%alivehash, node_sequences=\%node_sequences, microsatstarts=$microsatstarts, motif=$motif\n" if $printer == 1; ## <STDIN> if $printer == 1; my $analayzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif); # print "returning: ", $analayzed_mutations, \@nodes, $branches_hash,"\n" if scalar @mutations_array > 0 && $printer == 1; # print "returning: NULL, NULL, NULL " if scalar @mutations_array == 0 && $printer == 1; # print "final node alignment = $node_alignments{$joint}\n" if $printer == 1; # <STDIN> if $printer == 1; return ($analayzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint}) if scalar @mutations_array > 0; return ("NULL",\@nodes,$branches_hash, \%alivehash, "NULL") if scalar @mutations_array == 0; } #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# sub get_nodes{ my $printer = 0; my $tree=$_[0]; #$tree =~ s/ +//g; $tree =~ s/\t+//g; $tree=~s/;//g; print "tree=$tree\n" if $printer == 1; my @nodes = (); my @onions=($tree); my %branches=(); foreach my $bite (@onions){ $bite=~ s/^\(|\)$//g; chomp $bite; # print "tree = $bite \n"; # <STDIN>; $bite=~ /([ ,\(\)A-Z]+)\,\s*([ ,\(\)A-Z]+)/; #$tree =~ /(\(\(\(H, C\), O\), R\))\, (M)/; my @raw_nodes = ($1, $2); print "raw nodes = $1 and $2\n" if $printer == 1; push(@nodes, [@raw_nodes]); foreach my $node (@raw_nodes) {push (@onions, $node) if $node =~ /,/;} foreach my $node (@raw_nodes) {$branches{$node}="(".$bite.")"; print "adding to branches: $node = ($bite)\n" if $printer == 1;} print "onions = @onions\n" if $printer == 1;<STDIN> if $printer == 1; } $printer = 0; return \@nodes, \%branches; } #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# sub analyze_mutations{ my ($mutations_array, $nodes, $branches_hash, $alignment, $tags, $alivehash, $node_sequences, $microsatstarts, $motif) = @_; my $locuslength = length($alignment->{$tags->[0]}); my $printer = 0; # print " IN analyzed_mutations....\n" if $printer == 1; # \n mutations array = @$mutations_array, \nAND locuslength = $locuslength\n" if $printer == 1; my %mutation_hash=(); my %froms_megahash=(); my %tos_megahash=(); my %position_hash=(); my @solutions_array=(); foreach my $mutation (@$mutations_array){ # print "loadin mutation: $mutation\n" if $printer == 1; my %localhash= $mutation =~ /([\S ]+)=([\S ]+)/g; $mutation_hash{$localhash{"position"}} = {%localhash}; push @{$position_hash{$localhash{"position"}}},$localhash{"node"}; # print "feeding position hash with $localhash{position}: $position_hash{$localhash{position}}[0]\n" if $printer == 1; $froms_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"from"}; $tos_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"to"}; # print "just a trial: $mutation_hash{$localhash{position}}{position}\n" if $printer == 1; # print "loadin in tos_megahash: $localhash{position} {$localhash{node} = $localhash{to}\n" if $printer == 1; # print "loadin in from: $localhash{position} {$localhash{node} = $localhash{from}\n" if $printer == 1; } # print "now going through each position in loculength:\n" if $printer == 1; ## <STDIN> if $printer == 1; for my $pos (0 ... $locuslength-1){ # print "at position: $pos\n" if $printer == 1; if (exists($mutation_hash{$pos})){ my @local_nodes=@{$position_hash{$pos}}; # print "found mutation: @{$position_hash{$pos}} : @local_nodes\n" if $printer == 1; foreach my $local_node (@local_nodes){ # print "at local node: $local_node ... from state = $froms_megahash{$pos}{$local_node}\n" if $printer == 1; my $open_insertion=(); my $open_deletion=(); my $open_to_substitution=(); my $open_from_substitution=(); if ($froms_megahash{$pos}{$local_node} eq "-"){ # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};; # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}} && $printer == 1; #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; $open_insertion=$tos_megahash{$pos}{$local_node}; for my $posnext ($pos+1 ... $locuslength-1){ # print "in first if .... studying posnext: $posnext\n" if $printer == 1; last if !exists ($froms_megahash{$posnext}{$local_node}); # print "for posnext: $posnext, there exists $froms_megahash{$posnext}{$local_node}.. already, open_insertion = $open_insertion.. checking is $froms_megahash{$posnext}{$local_node} matters\n" if $printer == 1; $open_insertion = $open_insertion.$tos_megahash{$posnext}{$local_node} if $froms_megahash{$posnext}{$local_node} eq "-"; # print "now open_insertion=$open_insertion\n" if $printer == 1; delete $mutation_hash{$posnext} if $froms_megahash{$posnext}{$local_node} eq "-"; } print "1 Feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion="),"\n" if $printer == 1; push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion=")); } elsif ($tos_megahash{$pos}{$local_node} eq "-"){ # print "here exists a microsatellite to $local_node from $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};; # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; $open_deletion=$froms_megahash{$pos}{$local_node}; for my $posnext ($pos+1 ... $locuslength-1){ print "in 1st elsif studying posnext: $posnext\n" if $printer == 1; print "nexting as nextpos does not exist\n" if !exists ($tos_megahash{$posnext}{$local_node}) && $printer == 1; last if !exists ($tos_megahash{$posnext}{$local_node}); print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1; $open_deletion = $open_deletion.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} eq "-"; delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} eq "-"; } print "2 Feeding in:", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion"), "\n" if $printer == 1; push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion")); } elsif ($tos_megahash{$pos}{$local_node} ne "-"){ # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};; # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; # print "microsatstart = $microsatstarts->{$local_node} \n" if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node} && $printer == 1; next if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node}; $open_to_substitution=$tos_megahash{$pos}{$local_node}; $open_from_substitution=$froms_megahash{$pos}{$local_node}; print "open from substitution: $open_from_substitution \n" if $printer == 1; for my $posnext ($pos+1 ... $locuslength-1){ #print "in last elsif studying posnext: $posnext\n"; last if !exists ($tos_megahash{$posnext}{$local_node}); print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1; $open_to_substitution = $open_to_substitution.$tos_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-"; $open_from_substitution = $open_from_substitution.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-"; delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} ne "-" && $froms_megahash{$posnext}{$local_node} ; } print "open from substitution: $open_from_substitution \n" if $printer == 1; #IS THE STRETCH OF SUBSTITUTION MICROSATELLITE-LIKE? my @motif_parts=split(/\s*/,$motif); #GENERATING THE FLEXIBLE LEFT END my $left_query=(); for my $k (1 ... $#motif_parts) { $left_query= $motif_parts[$k]."|)"; $left_query="(".$left_query; } $left_query=$left_query."?"; print "left_quewry = $left_query\n" if $printer == 1; #GENERATING THE FLEXIBLE RIGHT END my $right_query=(); for my $k (0 ... ($#motif_parts-1)) { $right_query= "(|".$motif_parts[$k]; $right_query=$right_query.")"; } $right_query=$right_query."?"; print "right_query = $right_query\n" if $printer == 1; print "Hence, searching for: ^$left_query($motif)+$right_query\$\n" if $printer == 1; my $motifcomb=$motif x 50; print "motifcomb = $motifcomb\n" if $printer == 1; if ( ($motifcomb =~/$open_to_substitution/i) && (length ($open_to_substitution) >= length($motif)) ){ print "sequence microsat-like\n" if $printer == 1; my $all_microsat_like = 0; print "3 feeding in: ", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution"), "\n" if $printer == 1; push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution")); print "4 feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion="), "\n" if $printer == 1; push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion=")); } else{ print "5 feeding in: ", join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion="), "\n" if $printer == 1; push (@solutions_array, join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion=")); } #IS THE FROM-SEQUENCE MICROSATELLITE-LIKE? } #<STDIN> if $printer ==1; } #<STDIN> if $printer ==1; } } print "\n", "#" x 50, "\n" if $printer == 1; foreach my $tag (@$tags){ print "$tag: $alignment->{$tag}\n" if $printer == 1; } print "\n", "#" x 50, "\n" if $printer == 1; print "returning SOLUTIONS ARRAY : \n",join("\n", @solutions_array),"\n" if $printer == 1; #print "end\n"; #<STDIN> if return \@solutions_array; } #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# #+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# sub base_by_base_simple{ my $printer = 0; my ($motif, $locus, $no, $pair0, $pair1, $joint) = @_; my @seq_array=(); print "IN SUBROUTUNE base_by_base_simple.. information received = @_\n" if $printer == 1; print "pair0 = $pair0 and pair1 = $pair1\n" if $printer == 1; my @example=split(/\./,$locus->[0]); print "example, for length = @example\n" if $printer == 1; for my $i (0...$no-1){push(@seq_array, [split(/\./,$locus->[$i])]); print "for $i, from $locus->[$i], seq_array = >@{$seq_array[$i]}<\n" if $printer == 1;} my @compared_sequence=(); my @substitutions_list; for my $i (0...scalar(@example)-1){ #print "i = $i\n" if $printer == 1; #print "comparing $seq_array[0][$i] and $seq_array[1][$i] \n" ;#if $printer == 1; if ($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] !~ /!/){ my $resolution= resolve_base($seq_array[0][$i],$seq_array[1][$i], $pair1 ,"keep" ); # print "ancestral = $resolution\n" if $printer == 1; if ($resolution =~ /$seq_array[1][$i]/i && $resolution !~ /!/){ push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution ); } elsif ( $resolution !~ /!/){ push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution); } push @compared_sequence,$resolution; } elsif ($seq_array[0][$i] !~ /!/ && $seq_array[1][$i] =~ /!/){ my $resolution= resolve_base($seq_array[1][$i],$seq_array[0][$i], $pair0, "invert" ); # print "ancestral = $resolution\n" if $printer == 1; if ($resolution =~ /$seq_array[0][$i]/i && $resolution !~ /!/){ push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution); } elsif ( $resolution !~ /!/){ push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution); } push @compared_sequence,$resolution; } elsif($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] =~ /!/){ push @compared_sequence, add_bases($seq_array[0][$i],$seq_array[1][$i], $pair0, $pair1, $joint ); } else{ if($seq_array[0][$i] !~ /^$seq_array[1][$i]$/i){ push @compared_sequence, $pair0.":".$seq_array[0][$i]."!".$pair1.":".$seq_array[1][$i]; } else{ # print "perfect match\n" if $printer == 1; push @compared_sequence, $seq_array[0][$i]; } } } print "returning: comared = @compared_sequence \nand substitutions list =\n", join("\n",@substitutions_list),"\n" if $printer == 1; return join(".",@compared_sequence), join(":", @substitutions_list) if scalar (@substitutions_list) > 0; return join(".",@compared_sequence), "" if scalar (@substitutions_list) == 0; } sub resolve_base{ my $printer = 0; print "IN SUBROUTUNE resolve_base.. information received = @_\n" if $printer == 1; my ($optional, $single, $singlesp, $arg) = @_; my @options=split(/!/,$optional); foreach my $option(@options) { $option=~s/[A-Z\(\) ,]+://g; if ($option =~ /$single/i){ print "option = $option , returning single: $single\n" if $printer == 1; return $single; } } print "returning ",$optional."!".$singlesp.":".$single. "\n" if $arg eq "keep" && $printer == 1; print "returning ",$singlesp.":".$single."!".$optional. "\n" if $arg eq "invert" && $printer == 1; return $optional."!".$singlesp.":".$single if $arg eq "keep"; return $singlesp.":".$single."!".$optional if $arg eq "invert"; } sub same_length{ my $printer = 0; my @locus = @_; my $temp = shift @locus; $temp=~s/-|,//g; foreach my $l (@locus){ $l=~s/-|,//g; return 0 if length($l) != length($temp); $temp = $l; } return 1; } sub treeStudy{ my $printer = 0; # print "template DEFINED.. received: @_\n" if defined %template; # print "only received = @_" if !defined %template; my $stopper = 0; if (!defined %template){ $stopper = 1; %template=(); print "tree decipherer = $tree_decipherer\n" if $printer == 1; my ( $template_ref, $keys_array)=load_allPossibleTrees($tree_decipherer, \%template); print "return = $template_ref and @{$keys_array}\n" if $printer == 1; foreach my $key (@$keys_array){ print "addding : $template_ref->{$key} for $key\n" if $printer == 1; $template{$key} = $template_ref->{$key}; } } for my $templet ( keys %template ) { # print "$templet => @{$template{$templet}}\n"; } <STDIN> if !defined %template; my $strict = 0; my $H = 0; my $Hchr = 1; my $Hstart = 2; my $Hend = 3; my $Hmotif = 4; my $Hmotiflen = 5; my $Hmicro = 6; my $Hstrand = 7; my $Hmicrolen = 8; my $Hinterpos = 9; my $Hrelativepos = 10; my $Hinter = 11; my $Hinterlen = 12; my $C = 13; my $Cchr = 14; my $Cstart = 15; my $Cend = 16; my $Cmotif = 17; my $Cmotiflen = 18; my $Cmicro = 19; my $Cstrand = 20; my $Cmicrolen = 21; my $Cinterpos = 22; my $Crelativepos = 23; my $Cinter = 24; my $Cinterlen = 25; my $O = 26; my $Ochr = 27; my $Ostart = 28; my $Oend = 29; my $Omotif = 30; my $Omotiflen = 31; my $Omicro = 32; my $Ostrand = 33; my $Omicrolen = 34; my $Ointerpos = 35; my $Orelativepos = 36; my $Ointer = 37; my $Ointerlen = 38; my $R = 39; my $Rchr = 40; my $Rstart = 41; my $Rend = 42; my $Rmotif = 43; my $Rmotiflen = 44; my $Rmicro = 45; my $Rstrand = 46; my $Rmicrolen = 47; my $Rinterpos = 48; my $Rrelativepos = 49; my $Rinter = 50; my $Rinterlen = 51; my $Mchr = 52; my $Mstart = 53; my $Mend = 54; my $M = 55; my $Mmotif = 56; my $Mmotiflen = 57; my $Mmicro = 58; my $Mstrand = 59; my $Mmicrolen = 60; my $Minterpos = 61; my $Mrelativepos = 62; my $Minter = 63; my $Minterlen = 64; #-------------------------------------------------------------------------------# my @analysis=(); my %speciesOrder = (); $speciesOrder{"H"} = 0; $speciesOrder{"C"} = 1; $speciesOrder{"O"} = 2; $speciesOrder{"R"} = 3; $speciesOrder{"M"} = 4; #-------------------------------------------------------------------------------# my $line = $_[0]; chomp $line; my @f = split(/\t/,$line); print "received array : @f.. recieved tags = @tags\n" if $printer == 1; # collect all motifs my @motifs=(); @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif], $f[$Mmotif]) if $tags[$#tags] =~ /M/; @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]) if $tags[$#tags] =~ /R/; @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif]) if $tags[$#tags] =~ /O/; # print "motifs in the array = $f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]\n" if $tags[$#tags] =~ /R/;; print "motifs = @motifs\n" if $printer == 1; my @translation = (); foreach my $motif (@motifs){ push(@translation, "_") if $motif eq "NA"; push(@translation, "+") if $motif ne "NA"; } my $translate = join(" ", @translation); # print "translate = >$translate< and analysis = $template{$translate}[0].. on the other hand, ",$template{"- - +"}[0],"\n"; my @analyses = split(/\|/,$template{$translate}[0]); print "motifs = @motifs, analyses = @analyses\n" if $printer == 1; if (scalar(@analyses) == 1) { #print "analysis = $analyses[0]\n"; if ($analyses[0] !~ /,|\./ ){ if ($analyses[0] =~ /\+/){ my $analysis = $analyses[0]; $analysis =~ s/\+|\-//g; my @species = split(/\s*/,$analysis); my @currentMotifs = (); foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1;} print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; $template{$translate}[1]++ if $strict == 1 && consistency(@currentMotifs) ne "NULL"; $template{$translate}[1]++ if $strict == 0; print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; } else{ my $analysis = $analyses[0]; $analysis =~ s/\+|\-//g; my @species = split(/\s*/,$analysis); my @currentMotifs = (); my @complementarySpecies = (); my $allSpecies = join("",@tags); foreach my $specie (@species){ $allSpecies =~ s/$specie//g; } foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1;;} print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; } } elsif ($analyses[0] =~ /,/) { my @events = split(/,/,$analyses[0]); print "events = @events \n " if $printer == 1; if ($events[0] =~ /\+/){ my $analysis1 = $events[0]; $analysis1 =~ s/\+|\-//g; my $analysis2 = $events[1]; $analysis2 =~ s/\+|\-//g; my @nSpecies = split(/\s*/,$analysis2); print "original anslysis = $analysis1 " if $printer == 1; foreach my $specie (@nSpecies){ $analysis1=~ s/$specie//g;} print "processed anslysis = $analysis1 \n" if $printer == 1; my @currentMotifs = (); foreach my $specie (split(/\s*/,$analysis1)){push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; } else{ my $analysis1 = $events[0]; $analysis1 =~ s/\+|\-//g; my $analysis2 = $events[1]; $analysis2 =~ s/\+|\-//g; my @pSpecies = split(/\s*/,$analysis2); my @currentMotifs = (); foreach my $specie (@pSpecies){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; } } elsif ($analyses[0] =~ /\./) { my @events = split(/\./,$analyses[0]); foreach my $event (@events){ print "event = $event \n" if $printer == 1; if ($event =~ /\+/){ my $analysis = $event; $analysis =~ s/\+|\-//g; my @species = split(/\s*/,$analysis); my @currentMotifs = (); foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } #print consistency(@currentMotifs),"<- \n"; print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; } else{ my $analysis = $event; $analysis =~ s/\+|\-//g; my @species = split(/\s*/,$analysis); my @currentMotifs = (); my @complementarySpecies = (); my $allSpecies = join("",@tags); foreach my $specie (@species){ $allSpecies =~ s/$specie//g; } foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } #print consistency(@currentMotifs),"<- \n"; print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; } } } } else{ my $finalanalysis = (); $template{$translate}[1]++; foreach my $analysis (@analyses){ ;} } # test if motifs where microsats are present, as indeed of same the motif composition for my $templet ( keys %template ) { # print "now returning: $templet\n"; if (@{ $template{$templet} }[1] > 0){ print "returning in the end: $templet and $translate\n" if $printer == 1; $template{$templet}[1] = 0; return (@{$template{$templet}}[0], $translate); } } undef %template; print "sending NULL\n" if $printer == 1; return ("NULL", $translate); } sub consistency{ my @motifs = @_; print "in consistency \n" if $printer == 1; print "motifs sent = >",join("|",@motifs),"< \n" if $printer == 1; return $motifs[0] if scalar(@motifs) == 1; my $prevmotif = shift(@motifs); my $stopper = 0; for my $i (0 ... $#motifs){ next if $motifs[$i] eq "NA"; my $templet = $motifs[$i].$motifs[$i]; if ($templet !~ /$prevmotif/i){ $stopper = 1; last; } } return $prevmotif if $stopper == 0; return "NULL" if $stopper == 1; } sub summarize_microsat{ my $printer = 0; my $line = $_[0]; my $humseq = $_[1]; my @gaps = $line =~ /[0-9]+\t[0-9]+\t[\+\-]/g; my @starts = $line =~ /[0-9]+\t[\+\-]/g; my @ends = $line =~ /[\+\-]\t[0-9]+/g; print "starts = @starts\tends = @ends\n" if $printer == 1; for my $i (0 ... $#gaps) {$gaps[$i] =~ s/\t[0-9]+\t[\+\-]//g;} for my $i (0 ... $#starts) {$starts[$i] =~ s/\t[\+\-]//g;} for my $i (0 ... $#ends) {$ends[$i] =~ s/[\+\-]\t//g;} my $minstart = array_smallest_number(@starts); my $maxend = array_largest_number(@ends); my $humupstream_st = substr($humseq, 0, $minstart); my $humupstream_en = substr($humseq, 0, $maxend); my $no_of_gaps_to_start = 0; my $no_of_gaps_to_end = 0; $no_of_gaps_to_start = ($humupstream_st =~ s/\-/x/g) if $humupstream_st=~/\-/; $no_of_gaps_to_end = ($humupstream_en =~ s/\-/x/g) if $humupstream_en=~/\-/; my $locusmotif = (); print "IN SUB SUMMARIZE_MICROSAT $line\n" if $printer == 1; #return "NULL" if $line =~ /compound/; my $Hstart = "NA"; my $Hend = "NA"; chomp $line; my $match_count = ($line =~ s/>/>/g); #print "number of species = $match_count\n"; my @micros = split(/>/,$line); shift @micros; my $stopper = 0; foreach my $mic (@micros){ my @local = split(/\t/,$mic); if ($local[$microsatcord] =~ /N/) {$stopper =1; last;} } return "NULL" if $stopper ==1; #------------------------------------------------------ my @arranged = (); for my $arr (0 ... $#exacttags) {$arranged[$arr] = '0';} foreach my $micro (@micros){ for my $i (0 ... $#exacttags){ if ($micro =~ /^$exacttags[$i]/){ $arranged[$i] = $micro; last; } } } # print "arranged = @arranged \n" ; <STDIN>;; my @endstatement = (); my $turn = 0; my $species_counter = 0; # print scalar(@arranged),"\n"; my $species_no=0; my $orthHchr = 0; foreach my $micro (@arranged) { $micro =~ s/\t\t/\t \t/g; $micro =~ s/\t,/\t ,/g; $micro =~ s/,\t/, \t/g; print "------------------------------------------------------------------------------------------\n" if $printer == 1; chomp $micro; if ($micro eq '0'){ push(@endstatement, join("\t",$exacttags[$species_counter],"NA","NA","NA","NA",0 ,"NA", "NA", 0,"NA","NA","NA", "NA" )); $species_counter++; print join("|","ENDSTATEMENT:",@endstatement),"\n" if $printer == 1; next; } # print $micro,"\n"; print "micro = $micro \n" if $printer == 1; my @fields = split(/\t/,$micro); my $microcopy = $fields[$microsatcord]; $microcopy =~ s/\[|\]|-//g; my $microsatlength = length($microcopy); print "microsat = $fields[$microsatcord] and microsatlength = $microsatlength\n" if $printer == 1; # print "sp_ident = @sp_ident.. species_no=$species_no\n"; $micro =~ /$sp_ident[$species_no]\s(\S+)\s([0-9]+)\s([0-9]+)/; my $sp_chr=$1; my $sp_start=$2 + $fields[$startcord] - $fields[$gapcord]; my $sp_end= $sp_start + $microsatlength - 1; $species_no++; $micro =~ /$focalspec\s(\S+)\s([0-9]+)\s([0-9]+)/; $orthHchr=$1; $Hstart=$2+$minstart-$no_of_gaps_to_start; $Hend=$2+$maxend-$no_of_gaps_to_end; print "Hstart = $Hstart = $fields[4] + $fields[$startcord] - $fields[$gapcord]\n" if $printer == 1; my $motif = $fields[$motifcord]; my $firstmotif = (); my $strand = $fields[$strandcord]; # print "strand = $strand\n"; if ($motif =~ /^\[/){ $motif =~ s/^\[//g; $motif =~ /([a-zA-Z]+)\].*/; $firstmotif = $1; } else {$firstmotif = $motif;} print "firstmotif =$firstmotif : \n" if $printer == 1; $firstmotif = allCaps($firstmotif); if (exists $revHash{$firstmotif} && $turn == 0) { $turn=1 if $species_counter==0; $firstmotif = $revHash{$firstmotif}; } elsif (exists $revHash{$firstmotif} && $turn == 1) {$firstmotif = $revHash{$firstmotif}; $turn = 1;} print "changed firstmotif =$firstmotif\n" if $printer == 1; # <STDIN>; $locusmotif = $firstmotif; if (scalar(@fields) > $microsatcord + 2){ print "fields = @fields ... interr_poscord=$interr_poscord=$fields[$interr_poscord] .. interrcord=$interrcord=$fields[$interrcord]\n" if $printer == 1; my @interposes = (); @interposes = split(",",$fields[$interr_poscord]) if $fields[$interr_poscord] =~ /,/; $interposes[0] = $fields[$interr_poscord] if $fields[$interr_poscord] !~ /,/ ; print "interposes=@interposes\n" if $printer == 1; my @relativeposes = (); my @interruptions = (); @interruptions = split(",",$fields[$interrcord]) if $fields[$interrcord] =~ /,/; $interruptions[0] = $fields[$interrcord] if $fields[$interrcord] !~ /,/; my @interlens = (); for my $i (0 ... $#interposes){ my $interpos = $interposes[$i]; my $nexter = 0; my $interruption = $interruptions[$i]; my $interlen = length($interruption); push (@interlens, $interlen); my $relativepos = (100 * $interpos) / $microsatlength; print "relativepos = $relativepos ,interpos=$interpos, interruption=$interruption, interlen=$interlen \n" if $printer == 1; $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50; print "--> = $relativepos\n" if $printer == 1; $interruption = "IND" if length($interruption) < 1; if ($turn == 1){ $fields[$microsatcord] = switch_micro($fields[$microsatcord]); $interruption = switch_nucl($interruption) unless $interruption eq "IND"; $interpos = ($microsatlength - $interpos) - $interlen + 2; print "turn interpos = $interpos for $fields[$microsatcord]\n" if $printer == 1; $relativepos = (100 * $interpos) / $microsatlength; $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50; $strand = '+' if $strand eq '-'; $strand = '-' if $strand eq '+'; } print "final relativepos = $relativepos\n" if $printer == 1; push(@relativeposes, $relativepos); } push(@endstatement,join("\t",($exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,join(",",@interposes),join(",",@relativeposes),join(",",@interruptions), join(",",@interlens)))); } else{ push(@endstatement, join("\t",$exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,"NA","NA","NA", "NA")); } $species_counter++; } $locusmotif = $sameHash{$locusmotif} if exists $sameHash{$locusmotif}; $locusmotif = $revHash{$locusmotif} if exists $revHash{$locusmotif}; my $endst = join("\t", @endstatement, $orthHchr, $Hstart, $Hend); print join("\t", @endstatement, $orthHchr, $Hstart, $Hend), "\n" if $printer == 1; return (join("\t", @endstatement, $orthHchr, $Hstart, $Hend), $orthHchr, $Hstart, $Hend, $locusmotif, length($locusmotif)); } sub switch_nucl{ my @strand = split(/\s*/,$_[0]); for my $i (0 ... $#strand){ if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;} if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;} if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;} if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;} } return join("",@strand); } sub switch_micro{ my $micro = reverse($_[0]); my @strand = split(/\s*/,$micro); for my $i (0 ... $#strand){ if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;} if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;} if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;} if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;} if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;} if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;} } return join("",@strand); } sub decipher_history{ my $printer = 0; my ($mutations_array, $tags_string, $nodes, $branches_hash, $tree_analysis, $confirmation_string, $alivehash) = @_; my %mutations_hash=(); foreach my $mutation (@$mutations_array){ print "mutation = $mutation\n" if $printer == 1; my %local = $mutation =~ /([\S ]+)=([\S ]+)/g; push @{$mutations_hash{$local{"node"}}},$mutation; print "just for confirmation: $local{node} pushed as: $mutation\n" if $printer == 1; } my @nodes; my @birth_steps=(); my @death_steps=(); my @tags=split(/\s*/,$tags_string); my @confirmation=split(/\s+/,$confirmation_string); my %info=(); for my $i (0 ... $#tags){ $info{$tags[$i]}=$confirmation[$i]; print "feeding info: $tags[$i] = $info{$tags[$i]}\n" if $printer == 1; } for my $keys (@$nodes) { foreach my $key (@$keys){ # print "current key = $key\n"; my $copykey = $key; $copykey =~ s/[\W ]+//g; my @copykeys=split(/\s*/,$copykey); my $states=(); foreach my $copy (@copykeys){ $states=$states.$info{$copy}; } print "reduced key = $copykey and state = $states\n" if $printer == 1; if (exists $mutations_hash{$key}) { if ($states=~/\+/){ push @birth_steps, @{$mutations_hash{$key}}; $birth_steps[$#birth_steps] =~ s/\S+=//g; delete $mutations_hash{$key}; } else{ push @death_steps, @{$mutations_hash{$key}}; $death_steps[$#death_steps] =~ s/\S+=//g; delete $mutations_hash{$key}; } } } } print "conformation = $confirmation_string\n" if $printer == 1; push (@birth_steps, "NULL") if scalar(@birth_steps) == 0; push (@death_steps, "NULL") if scalar(@death_steps) == 0; print "birth steps = ",join("\n",@birth_steps)," and death steps = ",join("\n",@death_steps),"\n" if $printer == 1; return \@birth_steps, \@death_steps; } sub fillAlignmentGaps{ my $printer = 0; print "received: @_\n" if $printer == 1; my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_; print "in fillAlignmentGaps.. tree = $tree \n" if $printer == 1; my %sequence_hash=(); my @phases = (); my $concat = $motif.$motif; my $motifsize = length($motif); for my $i (1 ... $motifsize){ push @phases, substr($concat, $i, $motifsize); } my $concatalignment = (); foreach my $tag (@tags){ $concatalignment = $concatalignment.$alignment->{$tag}; } # print "returningg NULL","NULL","NULL", "NULL\n" if $concatalignment !~ /-/; return 0, "NULL","NULL","NULL", "NULL","NULL" if $concatalignment !~ /-/; my %node_sequences_temp=(); my %node_alignments_temp =(); #NEW, Nov 28 2008 my @tags=(); my @locus_sequences=(); my %alivehash=(); # print "IN fillAlignmentGaps\n";# <STDIN>; my %fillrecord = (); my $change = 0; foreach my $tag (@$tagarray) { #print "adding: $tag\n"; push(@tags, $tag); if (exists $microsathash->{$tag}){ my $micro = $microsathash->{$tag}; my $orig_micro = $micro; ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases); $change = 1 if uc($micro) ne uc($orig_micro); $node_sequences_temp{$tag}=$micro if $microsathash->{$tag} ne "NULL"; } if (exists $nonmicrosathash->{$tag}){ my $micro = $nonmicrosathash->{$tag}; my $orig_micro = $micro; ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases); $change = 1 if uc($micro) ne uc($orig_micro); $node_sequences_temp{$tag}=$micro if $nonmicrosathash->{$tag} ne "NULL"; } if (exists $alignment->{$tag}){ my $micro = $alignment->{$tag}; my $orig_micro = $micro; ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases); $change = 1 if uc($micro) ne uc($orig_micro); $node_alignments_temp{$tag}=$micro if $alignment->{$tag} ne "NULL"; } #print "adding to node_sequences: $tag = ",$node_sequences_temp{$tag},"\n" if $printer == 1; #print "adding to node_alignments: $tag = ",$node_alignments_temp{$tag},"\n" if $printer == 1; } my %node_sequences=(); my %node_alignments =(); #NEW, Nov 28 2008 foreach my $tag (@$tagarray) { $node_sequences{$tag} = join ".",split(/\s*/,$node_sequences_temp{$tag}); $node_alignments{$tag} = join ".",split(/\s*/,$node_alignments_temp{$tag}); } print "\n", "#" x 50, "\n" if $printer == 1; foreach my $tag (@tags){ print "$tag: $alignment->{$tag} = $node_alignments{$tag}\n" if $printer == 1; } print "\n", "#" x 50, "\n" if $printer == 1; # print "change = $change\n"; #<STDIN> if $concatalignment=~/\-/; # <STDIN> if $printer == 1 && $concatalignment =~ /\-/; return 0, "NULL","NULL","NULL", "NULL", "NULL" if $change == 0; my ($nodes_arr, $branches_hash) = get_nodes($tree); my @nodes=@$nodes_arr; print "recieved nodes = @nodes\n" if $printer == 1; #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS foreach my $keys (@nodes){ my @pair = @$keys; my $joint = "(".join(", ",@pair).")"; my $copykey = join "", @pair; $copykey =~ s/[\W ]+//g; print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1; my $livestatus = 1; foreach my $copy (split(/\s*/,$copykey)){ $livestatus = 0 if !exists $alivehash{$copy}; } $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1; print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1; } @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT. my @mutations_array=(); my $joint = (); foreach my $node (@nodes){ my @pair = @$node; print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1; $joint = "(".join(", ",@pair).")"; print "joint = $joint \n" if $printer == 1; my @pair_sequences=(); foreach my $tag (@pair){ print "tag = $tag: " if $printer == 1; print $node_alignments{$tag},"\n" if $printer == 1; push @pair_sequences, $node_alignments{$tag}; } # print "fillgap\n"; my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint); $node_alignments{$joint}=$compared; push( @mutations_array,split(/:/,$substitutions_list)); print "newly added to node_sequences: $node_alignments{$joint} and list of mutations = @mutations_array\n" if $printer == 1; } print "now sending for analyze_mutations: mutation_array=@mutations_array, nodes=@nodes, branches_hash=$branches_hash, alignment=$alignment, tags=@tags, alivehash=%alivehash, node_sequences=\%node_sequences, microsatstarts=$microsatstarts, motif=$motif\n" if $printer == 1; # <STDIN> if $printer == 1; my $analayzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif); # print "returningt: ", $analayzed_mutations, \@nodes,"\n" if scalar @mutations_array > 0;; # print "returningy: NULL, NULL, NULL " if scalar @mutations_array == 0 && $printer == 1; print "final node alignment after filling for $joint= " if $printer == 1; print "$node_alignments{$joint}\n" if $printer == 1; return 1, $analayzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint} if scalar @mutations_array > 0 ; return 1, "NULL","NULL","NULL", "NULL", "NULL" if scalar @mutations_array == 0; } sub add_mutation{ my $printer = 0; print "IN SUBROUTUNE add_mutation.. information received = @_\n" if $printer == 1; my ($i , $bite, $to, $from) = @_; print "bite = $bite.. all received info = ",join("^", @_),"\n" if $printer == 1; print "to=$to\n" if $printer == 1; print "tis split = ",join(" and ",split(/!/,$to)),"\n" if $printer == 1; my @toields = split "!",$to; print "toilds = @toields\n" if $printer == 1; my @mutations=(); foreach my $toield (@toields){ my @toinfo=split(":",$toield); print " at toinfo=@toinfo \n" if $printer == 1; next if $toinfo[1] =~ /$from/i; my @mutation = @toinfo if $toinfo[1] !~ /$from/i; print "adding to mutaton list: ", join(",", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion="),"\n" if $printer == 1; push (@mutations, join("\t", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion=")); } return @mutations; } sub add_bases{ my $printer = 0; print "IN SUBROUTUNE add_bases.. information received = @_\n" if $printer == 1; my ($optional0, $optional1, $pair0, $pair1,$joint) = @_; my $total_list=(); my @total_list0=split(/!/,$optional0); my @total_list1=split(/!/,$optional1); my @all_list=(); my %total_hash0=(); foreach my $entry (@total_list0) { $entry = uc $entry; $entry =~ /(\S+):(\S+)/; $total_hash0{$2}=$1; push @all_list, $2; } my %total_hash1=(); foreach my $entry (@total_list1) { $entry = uc $entry; $entry =~ /(\S+):(\S+)/; $total_hash1{$2}=$1; push @all_list, $2; } my %alphabetical_hash=(); my @return_options=(); for my $i (0 ... $#all_list){ my $alph = $all_list[$i]; if (exists $total_hash0{$alph} && exists $total_hash1{$alph}){ push(@return_options, $joint.":".$alph); delete $total_hash0{$alph}; delete $total_hash1{$alph}; } if (exists $total_hash0{$alph} && !exists $total_hash1{$alph}){ push(@return_options, $pair0.":".$alph); delete $total_hash0{$alph}; } if (!exists $total_hash0{$alph} && exists $total_hash1{$alph}){ push(@return_options, $pair1.":".$alph); delete $total_hash1{$alph}; } } print "returning ",join "!",@return_options,"\n" if $printer == 1; return join "!",@return_options; } sub fillgaps{ # print "IN fillgaps: @_\n"; my ($micro, $phasesinput) = @_; #print "in microsathash ,,.. micro = $micro\n"; return $micro if $micro !~ /\-/; my $orig_micro = $micro; my @phases = @$phasesinput; my %tested_patterns = (); foreach my $phase (@phases){ # print "considering phase: $phase\n"; my @phase_prefixes = (); my @prephase_left_contexts = (); my @prephase_right_contexts = (); my @pregapsize = (); my @prepostfilins = (); my @phase_suffixes; my @suffphase_left_contexts; my @suffphase_right_contexts; my @suffgapsize; my @suffpostfilins; my @postfilins = (); my $motifsize = length($phases[0]); my $change = 0; for my $u (0 ... $motifsize-1){ my $concat = $phase.$phase.$phase.$phase; my @concatarr = split(/\s*/, $concat); my $l = 0; while ($l < $u){ shift @concatarr; $l++; } $concat = join ("", @concatarr); for my $t (0 ... $motifsize-1){ for my $k (1 ... $motifsize-1){ push @phase_prefixes, substr($concat, $motifsize+$t, $k); push @prephase_left_contexts, substr ($concat, $t, $motifsize); push @prephase_right_contexts, substr ($concat, $motifsize+$t+$k+($motifsize-$k), 1); push @pregapsize, $k; push @prepostfilins, substr($concat, $motifsize+$t+$k, ($motifsize-$k)); # print "reading: $concat, t=$t, k=$k prefix: $prephase_left_contexts[$#prephase_left_contexts] $phase_prefixes[$#phase_prefixes] -x$pregapsize[$#pregapsize] $prephase_right_contexts[$#prephase_right_contexts]\n"; # print "phase_prefixes = $phase_prefixes[$#phase_prefixes]\n"; # print "prephase_left_contexts = $prephase_left_contexts[$#prephase_left_contexts]\n"; # print "prephase_right_contexts = $prephase_right_contexts[$#prephase_right_contexts]\n"; # print "pregapsize = $pregapsize[$#pregapsize]\n"; # print "prepostfilins = $prepostfilins[$#prepostfilins]\n"; } } } # print "looking if $micro =~ /($phase\-{$motifsize})/i || $micro =~ /^(\-{$motifsize,}$phase)/i\n"; if ($micro =~ /($phase\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,}$phase)/i){ # print "micro: $micro needs further gap removal: $1\n"; while ($micro =~ /$phase(\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,})$phase/i){ # print "micro: $micro needs further gap removal: $1\n"; # print "phase being considered = $phase\n"; my $num = (); $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi if $micro =~ /$phase\-{$motifsize,}/i; $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi if $micro =~ /\-{$motifsize,}$phase/i; # print "num = $num\n"; $change = 1 if $num == 1; } } elsif ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){ while ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){ # print "checking lengths of $1 and $3 for $micro... \n"; my $num = (); if (length($1) >= length($3)){ # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, >= , $3 \n"; $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ; } if (length($1) < length($3)){ # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, < , $3 \n"; $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ; } # print "micro changed to $micro\n"; } } elsif ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){ while ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){ # print "$micro matches ([A-Z]+)\-{$motifsize}(($phase)+) = 1=$1, - , 3=$3 \n"; my $num = 0; $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ; } } elsif ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){ while ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){ # print "$micro matches (($phase)+)\-{$motifsize,}([A-Z]+) = 1=$1, - , 3=$3 \n"; my $num = 0; $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ; } } # print "$orig_micro to $micro\n"; #s <STDIN>; for my $h (0 ... $#phase_prefixes){ # print "searching using prefix : $prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]\n"; my $pattern = $prephase_left_contexts[$h].$phase_prefixes[$h].$pregapsize[$h].$prephase_right_contexts[$h]; # print "returning orig_micro = $orig_micro, micro = $micro \n" if exists $tested_patterns{$pattern}; if ($micro =~ /$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/i){ return $orig_micro if exists $tested_patterns{$pattern}; while ($micro =~ /($prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h])/i){ $tested_patterns{$pattern} = $pattern; # print "micro: $micro needs further gap removal: $1\n"; # print "prefix being considered = $phase_prefixes[$h]\n"; my $num = (); $num = ($micro =~ s/$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/$prephase_left_contexts[$h]$phase_prefixes[$h]$prepostfilins[$h]$prephase_right_contexts[$h]/gi) ; # print "num = $num, micro = $micro\n"; $change = 1 if $num == 1; return $orig_micro if $num > 1; } } } } return $orig_micro if length($micro) != length($orig_micro); return $micro; } sub selectMutationArray{ my $printer =0; my $oldmutspt = $_[0]; my $newmutspt = $_[1]; my $tagstringpt = $_[2]; my $alivehashpt = $_[3]; my $alignmentpt = $_[4]; my $motif = $_[5]; my @alivehasharr=(); my @tags = @$tagstringpt; my $alignmentln = length($alignmentpt->{$tags[0]}); foreach my $key (keys %$alivehashpt) { push @alivehasharr, $key; print "we have alive: $key\n" if $printer == 1;} my %newside = (); my %oldside = (); my %newmuts = (); my %commons = (); my %olds = (); foreach my $old (@$oldmutspt){ $olds{$old} = 1; } foreach my $new (@$newmutspt){ $commons{$new} = 1 if exists $olds{$new};; } foreach my $pos ( 0 ... $alignmentln){ #print "pos = $pos\n" if $printer == 1; my $newyes = 0; foreach my $mut (@$newmutspt){ $newmuts{$mut} = 1; chomp $mut; $newyes++; $mut =~ s/=\t/= \t/g; $mut =~ s/=$/= /g; $mut =~ /node=([A-Z\(\), ]+)\stype=([a-zA-Z ]+)\sposition=([0-9 ]+)\sfrom=([a-zA-Z\- ]+)\sto=([a-zA-Z\- ]+)\sinsertion=([a-zA-Z\- ]+)\sdeletion=([a-zA-Z\- ]+)/; my $node = $1; next if $3 != $pos; print "new mut = $mut\n" if $printer == 1; print "node = $node, pos = $3 ... and alivehasharr = >@alivehasharr<\n" if $printer == 1; my $alivenode = 0; foreach my $key (@alivehasharr){ $alivenode = 1 if $key =~ /$node/; } # next if $alivenode == 0; my $indel_type = " "; if ($2 eq "insertion" || $2 eq "deletion"){ my $thisindel = (); $thisindel = $6 if $2 eq "insertion"; $thisindel = $7 if $2 eq "deletion"; $indel_type = "i".checkIndelType($node, $thisindel, $motif,$alignmentpt,$3, $2) if $2 eq "insertion"; $indel_type = "d".checkIndelType($node, $thisindel, $motif,$alignmentpt, $3, $2) if $2 eq "deletion"; $indel_type = $indel_type."f" if $indel_type =~ /mot/ && length($thisindel) >= length($motif); } print "indeltype = $indel_type\n" if $printer == 1; my $added = 0; if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){ print "we have a preexisting one for $pos\n" if $printer == 1; my @preexisting = @{$newside{$pos}}; foreach my $pre (@preexisting){ print "looking at $pre\n" if $printer == 1; next if $pre !~ /node=$node/; next if $pre !~ /indeltype=([a-z]+)/; my $currtype = $1; if ($currtype =~ /inon/ && $indel_type =~ /dmot/){ delete $newside{$pos}; push @{$newside{$pos}}, $pre; $added = 1; } if ($currtype =~ /dnon/ && $indel_type =~ /imot/){ delete $newside{$pos}; push @{$newside{$pos}}, $pre; $added = 1; } if ($currtype =~ /dmot/ && $indel_type =~ /inon/){ delete $newside{$pos}; push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; $added = 1; } if ($currtype =~ /imot/ && $indel_type =~ /dnon/){ delete $newside{$pos}; push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; $added = 1; } } } print "added = $added\n" if $printer == 1; push @{$newside{$pos}}, $mut."\tindeltype=$indel_type" if $added == 0; print "for new pos,: $pos we have: @{$newside{$pos}}\n " if $printer == 1; } } foreach my $pos ( 0 ... $alignmentln){ my $oldyes = 0; foreach my $mut (@$oldmutspt){ chomp $mut; $oldyes++; $mut =~ s/=\t/= \t/g; $mut =~ s/=$/= /g; $mut =~ /node=([A-Z\(\), ]+)\ttype=([a-zA-Z ]+)\tposition=([0-9 ]+)\tfrom=([a-zA-Z\- ]+)\tto=([a-zA-Z\- ]+)\tinsertion=([a-zA-Z\- ]+)\tdeletion=([a-zA-Z\- ]+)/; my $node = $1; next if $3 != $pos; print "old mut = $mut\n" if $printer == 1; my $alivenode = 0; foreach my $key (@alivehasharr){ $alivenode = 1 if $key =~ /$node/; } #next if $alivenode == 0; my $indel_type = " "; if ($2 eq "insertion" || $2 eq "deletion"){ $indel_type = "i".checkIndelType($node, $6, $motif,$alignmentpt, $3, $2) if $2 eq "insertion"; $indel_type = "d".checkIndelType($node, $7, $motif,$alignmentpt, $3, $2) if $2 eq "deletion"; next if $indel_type =~/non/; } else{ next;} my $imp=0; $imp = 1 if $indel_type =~ /dmot/ && $alivenode == 0; $imp = 1 if $indel_type =~ /imot/ && $alivenode == 1; if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){ my @preexisting = @{$newside{$pos}}; print "we have a preexisting one for $pos: @preexisting\n" if $printer == 1; next if $imp == 0; if (scalar(@preexisting) == 1){ my $foundmut = $preexisting[0]; $foundmut=~ /node=([A-Z, \(\)]+)/; next if $1 eq $node; if (exists $oldside{$pos} || exists $commons{$foundmut}){ print "not replacing, but just adding\n" if $printer == 1; push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type"; next; } delete $newside{$pos}; push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type"; push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; print "now new one is : @{$newside{$pos}}\n" if $printer == 1; } print "for pos: $pos: @{$newside{$pos}}\n" if $printer == 1; next; } my @news = @{$newside{$pos}} if exists $newside{$pos}; print "mut = $mut and news = @news\n" if $printer == 1; push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type"; push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; } } print "in the end, our collected mutations = \n" if $printer == 1; my @returnarr = (); foreach my $key (keys %newside) {push @returnarr,@{$newside{$key}};} print join("\n", @returnarr),"\n" if $printer == 1; #<STDIN>; return @returnarr; } sub checkIndelType{ my $printer = 0; my $node = $_[0]; my $indel = $_[1]; my $motif = $_[2]; my $alignmentpt = $_[3]; my $posit = $_[4]; my $type = $_[5]; my @phases =(); my %prephases = (); my %postphases = (); #print "motif = $motif\n"; print "IN checkIndelType ... received: @_\n" if $printer == 1; my $concat = $motif.$motif.$motif.$motif; my $motiflength = length($motif); if ($motiflength > length ($indel)){ return "non" if $motif !~ /$indel/i; return checkIndelType_ComplexAnalysis($node, $indel, $motif, $alignmentpt, $posit, $type); } my $firstpass = 0; for my $y (0 ... $motiflength-1){ my $phase = substr($concat, $motiflength+$y, $motiflength); push @phases, $phase; $firstpass = 1 if $indel =~ /$phase/i; for my $k (0 ... length($motif)-1){ print "at: motiflength=$motiflength , y=$y , k=$k.. for pre: $motiflength+$y-$k and post: $motiflength+$y-$k+$motiflength in $concat\n" if $printer == 1; my $pre = substr($concat, $motiflength+$y-$k, $k ); my $post = substr($concat, $motiflength+$y+$motiflength, $k); print "adding to phases : $phase - $pre and $post\n" if $printer == 1; push @{$prephases{$phase}} , $pre; push @{$postphases{$phase}} , $post; } } print "firstpass 1= $firstpass\n" if $printer == 1; return "non" if $firstpass ==0; $firstpass =0; foreach my $phase (@phases){ my @pres = @{$prephases{$phase}}; my @posts = @{$postphases{$phase}}; foreach my $pre (@pres){ foreach my $post (@posts){ $firstpass = 1 if $indel =~ /($pre)?($phase)+($post)?/i && length($indel) > (3 * length($motif)); $firstpass = 1 if $indel =~ /^($pre)?($phase)+($post)?$/i && length($indel) < (3 * length($motif)); print "matched here : ($pre)?($phase)+($post)?\n" if $printer == 1; last if $firstpass == 1; } last if $firstpass == 1; } last if $firstpass == 1; } print "firstpass 2= $firstpass\n" if $printer == 1; return "non" if $firstpass ==0; return "mot" if $firstpass ==1; } sub checkIndelType_ComplexAnalysis{ my $printer = 0; my $node = $_[0]; my $indel = $_[1]; my $motif = $_[2]; my $alignmentpt = $_[3]; my $pos = $_[4]; my $type = $_[5]; my @speciesinvolved = $node =~ /[A-Z]+/g; my @seqs = (); my $residualseq = length($motif) - length($indel); print "IN COMPLEX ANALYSIS ... received: @_ .... speciesinvolved = @speciesinvolved\n" if $printer == 1; print "we have position = $pos, sseq = $alignmentpt->{$speciesinvolved[0]}\n" if $printer == 1; print "residualseq = $residualseq\n" if $printer == 1; print "pos=$pos... got: @_\n" if $printer == 1; foreach my $sp (@speciesinvolved){ my $spseq = $alignmentpt->{$sp}; #print "orig spseq = $spseq\n"; my $subseq = (); if ($type eq "deletion"){ my @indelparts = split(/\s*/,$indel); my @seqparts = split(/\s*/,$spseq); for my $p ($pos ... $pos+length($indel)-1){ $seqparts[$p] = shift @indelparts; } $spseq = join("",@seqparts); } #print "mod spseq = $spseq\n"; # $spseq=~ s/\-//g if $type !~ /deletion/; print "substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq)\n" if $pos > 0 && $pos < (length($spseq) - length($motif)) && $printer == 1; print "substr($spseq, 0, length($indel)+$residualseq)\n" if $pos == 0 && $printer == 1; print "substr($spseq, $pos - $residualseq, length($indel)+$residualseq)\n" if $pos >= (length($spseq) - length($motif)) && $printer == 1; $subseq = substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq) if $pos > 0 && $pos < (length($spseq) - length($motif)) ; $subseq = substr($spseq, 0, length($indel)+$residualseq) if $pos == 0; $subseq = substr($spseq, $pos - $residualseq, length($indel)+$residualseq) if $pos >= (length($spseq) - length($motif)) ; print "spseq = $spseq . subseq=$subseq . type = $type\n" if $printer == 1; #<STDIN> if $subseq !~ /[a-z\-]/i; $subseq =~ s/\-/$indel/g if $type =~ /insertion/; push @seqs, $subseq; print "seqs = @seqs\n" if $printer == 1; } return "non" if checkIfSeqsIdentical(@seqs) eq "NO"; print "checking for $seqs[0] \n" if $printer == 1; my @phases =(); my %prephases = (); my %postphases = (); my $concat = $motif.$motif.$motif.$motif; my $motiflength = length($motif); my $firstpass = 0; for my $y (0 ... $motiflength-1){ my $phase = substr($concat, $motiflength+$y, $motiflength); push @phases, $phase; $firstpass = 1 if $seqs[0] =~ /$phase/i; for my $k (0 ... length($motif)-1){ my $pre = substr($concat, $motiflength+$y-$k, $k ); my $post = substr($concat, $motiflength+$y+$motiflength, $k); print "adding to phases : $phase - $pre and $post\n" if $printer == 1; push @{$prephases{$phase}} , $pre; push @{$postphases{$phase}} , $post; } } print "firstpass 1= $firstpass.. also, res-d = ",(length($seqs[0]))%(length($motif)),"\n" if $printer == 1; return "non" if $firstpass ==0; $firstpass =0; foreach my $phase (@phases){ $firstpass = 1 if $seqs[0] =~ /^($phase)+$/i && ((length($seqs[0]))%(length($motif))) == 0; if (((length($seqs[0]))%(length($motif))) != 0){ my @pres = @{$prephases{$phase}}; my @posts = @{$postphases{$phase}}; foreach my $pre (@pres){ foreach my $post (@posts){ next if $pre !~ /\S/ && $post !~ /\S/; $firstpass = 1 if ($seqs[0] =~ /^($pre)($phase)+($post)$/i || $seqs[0] =~ /^($pre)($phase)+$/i || $seqs[0] =~ /^($phase)+($post)$/i); print "caught with $pre $phase $post\n" if $printer == 1; last if $firstpass == 1; } last if $firstpass == 1; } } last if $firstpass == 1; } #print "indel = $indel.. motif = $motif.. firstpass 2= mot\n" if $firstpass ==1; #print "indel = $indel.. motif = $motif.. firstpass 2= non\n" if $firstpass ==0; #<STDIN>;# if $firstpass ==1; return "non" if $firstpass ==0; return "mot" if $firstpass ==1; } sub checkIfSeqsIdentical{ my @seqs = @_; my $identical = 1; for my $j (1 ... $#seqs){ $identical = 0 if uc($seqs[0]) ne uc($seqs[$j]); } return "NO" if $identical == 0; return "YES" if $identical == 1; } sub summarizeMutations{ my $mutspt = $_[0]; my @muts = @$mutspt; my $tree = $_[1]; my @returnarr = (); for (1 ... 38){ push @returnarr, "NA"; } push @returnarr, "NULL"; return @returnarr if $tree eq "NULL" || scalar(@muts) < 1; my @bspecies = (); my @dspecies = (); my $treecopy = $tree; $treecopy =~ s/[\(\)]//g; my @treeparts = split(/[\.,]+/, $treecopy); for my $part (@treeparts){ if ($part =~ /\+/){ $part =~ s/\+//g; #my @sp = split(/\s*/, $part); #foreach my $p (@sp) {push @bspecies, $p;} push @bspecies, $part; } if ($part =~ /\-/){ $part =~ s/\-//g; #my @sp = split(/\s*/, $part); #foreach my $p (@sp) {push @dspecies, $p;} push @dspecies, $part; } } #print "-------------------------------------------------------\n"; my ($insertions, $deletions, $motinsertions, $motinsertionsf, $motdeletions, $motdeletionsf, $noninsertions, $nondeletions) = (0,0,0,0,0,0,0,0); my ($binsertions, $bdeletions, $bmotinsertions,$bmotinsertionsf, $bmotdeletions, $bmotdeletionsf, $bnoninsertions, $bnondeletions) = (0,0,0,0,0,0,0,0); my ($dinsertions, $ddeletions, $dmotinsertions,$dmotinsertionsf, $dmotdeletions, $dmotdeletionsf, $dnoninsertions, $dnondeletions) = (0,0,0,0,0,0,0,0); my ($ninsertions, $ndeletions, $nmotinsertions,$nmotinsertionsf, $nmotdeletions, $nmotdeletionsf, $nnoninsertions, $nnondeletions) = (0,0,0,0,0,0,0,0); my ($substitutions, $bsubstitutions, $dsubstitutions, $nsubstitutions, $indels, $subs) = (0,0,0,0,"NA","NA"); my @insertionsarr = (" "); my @deletionsarr = (" "); my @substitutionsarr = (" "); foreach my $mut (@muts){ # print "mut = $mut\n"; chomp $mut; $mut =~ s/=\t/= /g; $mut =~ s/=$/= /g; my %mhash = (); my @mields = split(/\t/,$mut); foreach my $m (@mields){ my @fields = split(/=/,$m); next if $fields[1] eq " "; $mhash{$fields[0]} = $fields[1]; } my $myutype = (); my $decided = 0; my $localnode = $mhash{"node"}; $localnode =~ s/[\(\)\. ,]//g; foreach my $s (@bspecies){ if ($localnode eq $s) { $decided = 1; $myutype = "b"; } } foreach my $s (@dspecies){ if ($localnode eq $s) { $decided = 1; $myutype = "d"; } } $myutype = "n" if $decided != 1; # print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. \n"; # <STDIN> if $mhash{"type"} eq "insertion" && $myutype eq "b"; if ($mhash{"type"} eq "substitution"){ $substitutions++; $bsubstitutions++ if $myutype eq "b"; $dsubstitutions++ if $myutype eq "d"; $nsubstitutions++ if $myutype eq "n"; # print "substitution: from= $mhash{from}, to = $mhash{to}, and type = myutype\n"; push @substitutionsarr, "b:$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "b"; push @substitutionsarr, "d:$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "d"; push @substitutionsarr, "n:$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "n"; # print "substitutionsarr = @substitutionsarr\n"; # <STDIN>; } else{ #print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. indeltype=$mhash{indeltype}\n"; if ($mhash{"type"} eq "deletion"){ $deletions++; $motdeletions++ if $mhash{"indeltype"} =~ /dmot/; $motdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/; $nondeletions++ if $mhash{"indeltype"} =~ /dnon/; $bdeletions++ if $myutype eq "b"; $ddeletions++ if $myutype eq "d"; $ndeletions++ if $myutype eq "n"; $bmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "b"; $bmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "b"; $bnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "b"; $dmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "d"; $dmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "d"; $dnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "d"; $nmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "n"; $nmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "n"; $nnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "n"; push @deletionsarr, "b:$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "b"; push @deletionsarr, "d:$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "d"; push @deletionsarr, "n:$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "n"; } if ($mhash{"type"} eq "insertion"){ $insertions++; $motinsertions++ if $mhash{"indeltype"} =~ /imot/; $motinsertionsf++ if $mhash{"indeltype"} =~ /imotf/; $noninsertions++ if $mhash{"indeltype"} =~ /inon/; $binsertions++ if $myutype eq "b"; $dinsertions++ if $myutype eq "d"; $ninsertions++ if $myutype eq "n"; $bmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "b"; $bmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "b"; $bnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "b"; $dmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "d"; $dmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "d"; $dnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "d"; $nmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "n"; $nmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "n"; $nnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "n"; push @insertionsarr, "b:$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "b"; push @insertionsarr, "d:$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "d"; push @insertionsarr, "n:$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "n"; } } } $indels = "ins=".join(",",@insertionsarr).";dels=".join(",",@deletionsarr) if scalar(@insertionsarr) > 1 || scalar(@deletionsarr) > 1 ; $subs = join(",",@substitutionsarr) if scalar(@substitutionsarr) > 1; $indels =~ s/ //g; $subs =~ s/ //g ; #print "indels = $indels, subs=$subs\n"; ##<STDIN> if $indels =~ /[a-zA-Z0-9]/ || $subs =~ /[a-zA-Z0-9]/ ; #print "tree = $tree, indels = $indels, subs = $subs, bspecies = @bspecies, dspecies = @dspecies \n"; my @returnarray = (); push (@returnarray, $insertions, $deletions, $motinsertions, $motinsertionsf, $motdeletions, $motdeletionsf, $noninsertions, $nondeletions) ; push (@returnarray, $binsertions, $bdeletions, $bmotinsertions,$bmotinsertionsf, $bmotdeletions, $bmotdeletionsf, $bnoninsertions, $bnondeletions) ; push (@returnarray, $dinsertions, $ddeletions, $dmotinsertions,$dmotinsertionsf, $dmotdeletions, $dmotdeletionsf, $dnoninsertions, $dnondeletions) ; push (@returnarray, $ninsertions, $ndeletions, $nmotinsertions,$nmotinsertionsf, $nmotdeletions, $nmotdeletionsf, $nnoninsertions, $nnondeletions) ; push (@returnarray, $substitutions, $bsubstitutions, $dsubstitutions, $nsubstitutions, $indels, $subs) ; push @returnarray, $tree; my @copy = @returnarray; return (@returnarray); } sub selectBetterTree{ my $printer = 0; my $treestudy = $_[0]; my $alt = $_[1]; my $mutspt = $_[2]; my @muts = @$mutspt; my @trees = (); my @alternatetrees=(); @trees = split(/\|/,$treestudy) if $treestudy =~ /\|/; @alternatetrees = split(/[\|;]/,$alt) if $alt =~ /[\|;\(\)]/; $trees[0] = $treestudy if $treestudy !~ /\|/; $alternatetrees[0] = $alt if $alt !~ /[\|;\(\)]/; my @alltrees = (@trees, @alternatetrees); # push(@alltrees,@alternatetrees); my %mutspecies = (); print "IN selectBetterTree..treestudy=$treestudy. alt=$alt. for: @_. trees=@trees<. alternatetrees=@alternatetrees\n" if $printer == 1; #<STDIN>; foreach my $mut (@muts){ print colored ['green'],"mut = $mut\n" if $printer == 1; $mut =~ /node=([A-Z,\(\) ]+)/; my $node = $1; $node =~s/[,\(\) ]+//g; my @indivspecies = $node =~ /[A-Z]+/g; #print "adding node: $node\n" if $printer == 1; $mutspecies{$node} = $node; #foreach (@indivspecies) { #$mutspecies{$mut} = $_; #print "for $_ adding $mutspecies{$_}\n"; #} } my @treerecords = (); my $treecount = -1; foreach my $tree (@alltrees){ print "checking with tree $tree\n" if $printer == 1; $treecount++; $treerecords[$treecount] = 0; my @indivspecies = ($tree =~ /[A-Z]+/g); print "indivspecies=@indivspecies\n" if $printer == 1; foreach my $species (@indivspecies){ print "checkin if exists species: $species\n" if $printer == 1; $treerecords[$treecount]+=2 if exists $mutspecies{$species} && $mutspecies{$species} !~ /indeltype=[a-z]mot/; $treerecords[$treecount]+=1.5 if exists $mutspecies{$species} && $mutspecies{$species} =~ /indeltype=[a-z]mot/; $treerecords[$treecount]-- if !exists $mutspecies{$species}; } print "for tree $tree, our treecount = $treerecords[$treecount]\n" if $printer == 1; } my @best_tree = array_largest_number_arrayPosition(@treerecords); print "treerecords = @treerecords. hence, best tree = @best_tree\n" if $printer == 1; return ($alltrees[$best_tree[0]], $treerecords[$best_tree[0]]) if scalar(@best_tree) == 1; print "best_tree[0] = $best_tree[0], and treerecords = $treerecords[$best_tree[0]]\n" if $printer == 1; return ("NULL", -1) if $treerecords[$best_tree[0]] < 1; my $rando = int(rand($#trees)); return ($alltrees[$rando], $treerecords[$rando]) if scalar(@best_tree) > 1; } sub load_sameHash{ #my $g = %$_[0]; $sameHash{"CAGT"}="AGTC"; $sameHash{"ATGA"}="AATG"; $sameHash{"CAAC"}="AACC"; $sameHash{"GGAA"}="AAGG"; $sameHash{"TAAG"}="AAGT"; $sameHash{"CGAG"}="AGCG"; $sameHash{"TAGG"}="AGGT"; $sameHash{"GCAG"}="AGGC"; $sameHash{"TAGA"}="ATAG"; $sameHash{"TGA"}="ATG"; $sameHash{"CAAG"}="AAGC"; $sameHash{"CTAA"}="AACT"; $sameHash{"CAAT"}="AATC"; $sameHash{"GTAG"}="AGGT"; $sameHash{"GAAG"}="AAGG"; $sameHash{"CGA"}="ACG"; $sameHash{"GTAA"}="AAGT"; $sameHash{"ACAA"}="AAAC"; $sameHash{"GCGG"}="GGGC"; $sameHash{"ATCA"}="AATC"; $sameHash{"TAAC"}="AACT"; $sameHash{"GGCA"}="AGGC"; $sameHash{"TGAG"}="AGTG"; $sameHash{"AACA"}="AAAC"; $sameHash{"GAGC"}="AGCG"; $sameHash{"ACCA"}="AACC"; $sameHash{"TGAA"}="AATG"; $sameHash{"ACA"}="AAC"; $sameHash{"GAAC"}="AACG"; $sameHash{"GCA"}="AGC"; $sameHash{"CCAC"}="ACCC"; $sameHash{"CATA"}="ATAC"; $sameHash{"CAC"}="ACC"; $sameHash{"TACA"}="ATAC"; $sameHash{"GGAC"}="ACGG"; $sameHash{"AGA"}="AAG"; $sameHash{"ATAA"}="AAAT"; $sameHash{"CA"}="AC"; $sameHash{"CCCA"}="ACCC"; $sameHash{"TCAA"}="AATC"; $sameHash{"CAGA"}="AGAC"; $sameHash{"AATA"}="AAAT"; $sameHash{"CCA"}="ACC"; $sameHash{"AGAA"}="AAAG"; $sameHash{"AGTA"}="AAGT"; $sameHash{"GACG"}="ACGG"; $sameHash{"TCAG"}="AGTC"; $sameHash{"ACGA"}="AACG"; $sameHash{"CGCA"}="ACGC"; $sameHash{"GAGT"}="AGTG"; $sameHash{"GA"}="AG"; $sameHash{"TA"}="AT"; $sameHash{"TAA"}="AAT"; $sameHash{"CAG"}="AGC"; $sameHash{"GATA"}="ATAG"; $sameHash{"GTA"}="AGT"; $sameHash{"CCAA"}="AACC"; $sameHash{"TAG"}="AGT"; $sameHash{"CAAA"}="AAAC"; $sameHash{"AAGA"}="AAAG"; $sameHash{"CACG"}="ACGC"; $sameHash{"GTCA"}="AGTC"; $sameHash{"GGA"}="AGG"; $sameHash{"GGAT"}="ATGG"; $sameHash{"CGGG"}="GGGC"; $sameHash{"CGGA"}="ACGG"; $sameHash{"AGGA"}="AAGG"; $sameHash{"TAAA"}="AAAT"; $sameHash{"GAGA"}="AGAG"; $sameHash{"ACTA"}="AACT"; $sameHash{"GCGA"}="AGCG"; $sameHash{"CACA"}="ACAC"; $sameHash{"AGAT"}="ATAG"; $sameHash{"GAGG"}="AGGG"; $sameHash{"CGAC"}="ACCG"; $sameHash{"GGAG"}="AGGG"; $sameHash{"GCCA"}="AGCC"; $sameHash{"CCAG"}="AGCC"; $sameHash{"GAAA"}="AAAG"; $sameHash{"CAGG"}="AGGC"; $sameHash{"GAC"}="ACG"; $sameHash{"CAA"}="AAC"; $sameHash{"GACC"}="ACCG"; $sameHash{"GGCG"}="GGGC"; $sameHash{"GGTA"}="AGGT"; $sameHash{"AGCA"}="AAGC"; $sameHash{"GATG"}="ATGG"; $sameHash{"GTGA"}="AGTG"; $sameHash{"ACAG"}="AGAC"; $sameHash{"CGG"}="GGC"; $sameHash{"ATA"}="AAT"; $sameHash{"GACA"}="AGAC"; $sameHash{"GCAA"}="AAGC"; $sameHash{"CAGC"}="AGCC"; $sameHash{"GGGA"}="AGGG"; $sameHash{"GAG"}="AGG"; $sameHash{"ACAT"}="ATAC"; $sameHash{"GAAT"}="AATG"; $sameHash{"CACC"}="ACCC"; $sameHash{"GAT"}="ATG"; $sameHash{"GCG"}="GGC"; $sameHash{"GCAC"}="ACGC"; $sameHash{"GAA"}="AAG"; $sameHash{"TGGA"}="ATGG"; $sameHash{"CCGA"}="ACCG"; $sameHash{"CGAA"}="AACG"; } sub load_revHash{ $revHash{"CTGA"}="AGTC"; $revHash{"TCTT"}="AAAG"; $revHash{"CTAG"}="AGCT"; $revHash{"GGTG"}="ACCC"; $revHash{"GCC"}="GGC"; $revHash{"GCTT"}="AAGC"; $revHash{"GCGT"}="ACGC"; $revHash{"GTTG"}="AACC"; $revHash{"CTCC"}="AGGG"; $revHash{"ATC"}="ATG"; $revHash{"CGAT"}="ATCG"; $revHash{"TTAA"}="AATT"; $revHash{"GTTC"}="AACG"; $revHash{"CTGC"}="AGGC"; $revHash{"TCGA"}="ATCG"; $revHash{"ATCT"}="ATAG"; $revHash{"GGTT"}="AACC"; $revHash{"CTTA"}="AAGT"; $revHash{"TGGC"}="AGCC"; $revHash{"CCG"}="GGC"; $revHash{"CGGC"}="GGCC"; $revHash{"TTAG"}="AACT"; $revHash{"GTG"}="ACC"; $revHash{"CTTT"}="AAAG"; $revHash{"TGCA"}="ATGC"; $revHash{"CGCT"}="AGCG"; $revHash{"TTCC"}="AAGG"; $revHash{"CT"}="AG"; $revHash{"C"}="G"; $revHash{"CTCT"}="AGAG"; $revHash{"ACTT"}="AAGT"; $revHash{"GGTC"}="ACCG"; $revHash{"ATTC"}="AATG"; $revHash{"GGGT"}="ACCC"; $revHash{"CCTA"}="AGGT"; $revHash{"CGCG"}="GCGC"; $revHash{"GTGT"}="ACAC"; $revHash{"GCCC"}="GGGC"; $revHash{"GTCG"}="ACCG"; $revHash{"TCCC"}="AGGG"; $revHash{"TTCA"}="AATG"; $revHash{"AGTT"}="AACT"; $revHash{"CCCT"}="AGGG"; $revHash{"CCGC"}="GGGC"; $revHash{"CTT"}="AAG"; $revHash{"TTGG"}="AACC"; $revHash{"ATT"}="AAT"; $revHash{"TAGC"}="AGCT"; $revHash{"ACTG"}="AGTC"; $revHash{"TCAC"}="AGTG"; $revHash{"CTGT"}="AGAC"; $revHash{"TGTG"}="ACAC"; $revHash{"ATCC"}="ATGG"; $revHash{"GTGG"}="ACCC"; $revHash{"TGGG"}="ACCC"; $revHash{"TCGG"}="ACCG"; $revHash{"CGGT"}="ACCG"; $revHash{"GCTC"}="AGCG"; $revHash{"TACG"}="ACGT"; $revHash{"GTTT"}="AAAC"; $revHash{"CAT"}="ATG"; $revHash{"CATG"}="ATGC"; $revHash{"GTTA"}="AACT"; $revHash{"CACT"}="AGTG"; $revHash{"TCAT"}="AATG"; $revHash{"TTA"}="AAT"; $revHash{"TGTA"}="ATAC"; $revHash{"TTTC"}="AAAG"; $revHash{"TACT"}="AAGT"; $revHash{"TGTT"}="AAAC"; $revHash{"CTA"}="AGT"; $revHash{"GACT"}="AGTC"; $revHash{"TTGC"}="AAGC"; $revHash{"TTC"}="AAG"; $revHash{"GCT"}="AGC"; $revHash{"GCAT"}="ATGC"; $revHash{"TGGT"}="AACC"; $revHash{"CCT"}="AGG"; $revHash{"CATC"}="ATGG"; $revHash{"CCAT"}="ATGG"; $revHash{"CCCG"}="GGGC"; $revHash{"TGCC"}="AGGC"; $revHash{"TG"}="AC"; $revHash{"TGCT"}="AAGC"; $revHash{"GCCG"}="GGCC"; $revHash{"TCTG"}="AGAC"; $revHash{"TGT"}="AAC"; $revHash{"TTAT"}="AAAT"; $revHash{"TAGT"}="AACT"; $revHash{"TATG"}="ATAC"; $revHash{"TTTA"}="AAAT"; $revHash{"CGTA"}="ACGT"; $revHash{"TA"}="AT"; $revHash{"TGTC"}="AGAC"; $revHash{"CTAT"}="ATAG"; $revHash{"TATA"}="ATAT"; $revHash{"TAC"}="AGT"; $revHash{"TC"}="AG"; $revHash{"CATT"}="AATG"; $revHash{"TCG"}="ACG"; $revHash{"ATTT"}="AAAT"; $revHash{"CGTG"}="ACGC"; $revHash{"CTG"}="AGC"; $revHash{"TCGT"}="AACG"; $revHash{"TCCG"}="ACGG"; $revHash{"GTT"}="AAC"; $revHash{"ATGT"}="ATAC"; $revHash{"CTTG"}="AAGC"; $revHash{"CCTT"}="AAGG"; $revHash{"GATC"}="ATCG"; $revHash{"CTGG"}="AGCC"; $revHash{"TTCT"}="AAAG"; $revHash{"CGTC"}="ACGG"; $revHash{"CG"}="GC"; $revHash{"TATT"}="AAAT"; $revHash{"CTCG"}="AGCG"; $revHash{"TCTC"}="AGAG"; $revHash{"TCCT"}="AAGG"; $revHash{"TGG"}="ACC"; $revHash{"ACTC"}="AGTG"; $revHash{"CTC"}="AGG"; $revHash{"CGC"}="GGC"; $revHash{"TTG"}="AAC"; $revHash{"ACCT"}="AGGT"; $revHash{"TCTA"}="ATAG"; $revHash{"GTAC"}="ACGT"; $revHash{"TTGA"}="AATC"; $revHash{"GTCC"}="ACGG"; $revHash{"GATT"}="AATC"; $revHash{"T"}="A"; $revHash{"CGTT"}="AACG"; $revHash{"GTC"}="ACG"; $revHash{"GCCT"}="AGGC"; $revHash{"TGC"}="AGC"; $revHash{"TTTG"}="AAAC"; $revHash{"GGCT"}="AGCC"; $revHash{"TCA"}="ATG"; $revHash{"GTGC"}="ACGC"; $revHash{"TGAT"}="AATC"; $revHash{"TAT"}="AAT"; $revHash{"CTAC"}="AGGT"; $revHash{"TGCG"}="ACGC"; $revHash{"CTCA"}="AGTG"; $revHash{"CTTC"}="AAGG"; $revHash{"GCTG"}="AGCC"; $revHash{"TATC"}="ATAG"; $revHash{"TAAT"}="AATT"; $revHash{"ACT"}="AGT"; $revHash{"TCGC"}="AGCG"; $revHash{"GGT"}="ACC"; $revHash{"TCC"}="AGG"; $revHash{"TTGT"}="AAAC"; $revHash{"TGAC"}="AGTC"; $revHash{"TTAC"}="AAGT"; $revHash{"CGT"}="ACG"; $revHash{"ATTA"}="AATT"; $revHash{"ATTG"}="AATC"; $revHash{"CCTC"}="AGGG"; $revHash{"CCGG"}="GGCC"; $revHash{"CCGT"}="ACGG"; $revHash{"TCCA"}="ATGG"; $revHash{"CGCC"}="GGGC"; $revHash{"GT"}="AC"; $revHash{"TTCG"}="AACG"; $revHash{"CCTG"}="AGGC"; $revHash{"TCT"}="AAG"; $revHash{"GTAT"}="ATAC"; $revHash{"GTCT"}="AGAC"; $revHash{"GCTA"}="AGCT"; $revHash{"TACC"}="AGGT"; } sub allCaps{ my $motif = $_[0]; $motif =~ s/a/A/g; $motif =~ s/c/C/g; $motif =~ s/t/T/g; $motif =~ s/g/G/g; return $motif; } sub all_caps{ my @strand = split(/\s*/,$_[0]); for my $i (0 ... $#strand){ if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;} if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;} if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;} if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;} } return join("",@strand); } sub array_mean{ return "NA" if scalar(@_) == 0; my $sum = 0; foreach my $val (@_){ $sum = $sum + $val; } return ($sum/scalar(@_)); } sub array_sum{ return "NA" if scalar(@_) == 0; my $sum = 0; foreach my $val (@_){ $sum = $sum + $val; } return ($sum); } sub variance{ return "NA" if scalar(@_) == 0; return 0 if scalar(@_) == 1; my $mean = array_mean(@_); my $num = 0; return 0 if scalar(@_) == 1; # print "mean = $mean .. array = >@_<\n"; foreach my $ele (@_){ # print "$num = $num + ($ele-$mean)*($ele-$mean)\n"; $num = $num + ($ele-$mean)*($ele-$mean); } my $var = $num / scalar(@_); return $var; } sub array_95confIntervals{ return "NA" if scalar(@_) <= 0; my @sorted = sort { $a <=> $b } @_; # print "@sorted=",scalar(@sorted), "\n"; my $aDeechNo = int((scalar(@sorted) * 2.5) / 100); my $saaDeNo = int((scalar(@sorted) * 97.5) / 100); return ($sorted[$aDeechNo], $sorted[$saaDeNo]); } sub array_median{ return "NA" if scalar(@_) == 0; return $_[0] if scalar(@_) == 1; my @sorted = sort { $a <=> $b } @_; my $totalno = scalar(@sorted); #print "sorted = @sorted\n"; my $pick = (); if ($totalno % 2 == 1){ #print "odd set .. totalno = $totalno\n"; my $mid = $totalno / 2; my $onehalfno = $mid - $mid % 1; my $secondhalfno = $onehalfno + 1; my $onehalf = $sorted[$onehalfno-1]; my $secondhalf = $sorted[$secondhalfno-1]; #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n"; $pick = $secondhalf; } else{ #print "even set .. totalno = $totalno\n"; my $mid = $totalno / 2; my $onehalfno = $mid; my $secondhalfno = $onehalfno + 1; my $onehalf = $sorted[$onehalfno-1]; my $secondhalf = $sorted[$secondhalfno-1]; #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n"; $pick = ($onehalf + $secondhalf )/2; } #print "pick = $pick..\n"; return $pick; } sub array_numerical_sort{ return "NA" if scalar(@_) == 0; my @sorted = sort { $a <=> $b } @_; return (@sorted); } sub array_smallest_number{ return "NA" if scalar(@_) == 0; return $_[0] if scalar(@_) == 1; my @sorted = sort { $a <=> $b } @_; return $sorted[0]; } sub array_largest_number{ return "NA" if scalar(@_) == 0; return $_[0] if scalar(@_) == 1; my @sorted = sort { $a <=> $b } @_; return $sorted[$#sorted]; } sub array_largest_number_arrayPosition{ return "NA" if scalar(@_) == 0; return 0 if scalar(@_) == 1; my $maxpos = 0; my @maxposes = (); my @maxvals = (); my $maxval = array_smallest_number(@_); for my $i (0 ... $#_){ if ($_[$i] > $maxval){ $maxval = $_[$i]; $maxpos = $i; } if ($_[$i] == $maxval){ $maxval = $_[$i]; if (scalar(@maxposes) == 0){ push @maxposes, $i; push @maxvals, $_[$i]; } elsif ($maxvals[0] == $maxval){ push @maxposes, $i; push @maxvals, $_[$i]; } else{ @maxposes = (); @maxvals = (); push @maxposes, $i; push @maxvals, $_[$i]; } } } return $maxpos if scalar(@maxposes) < 2; return (@maxposes); } sub array_smallest_number_arrayPosition{ return "NA" if scalar(@_) == 0; return 0 if scalar(@_) == 1; my $minpos = 0; my @minposes = (); my @minvals = (); my $minval = array_largest_number(@_); my $maxval = array_smallest_number(@_); #print "starting with $maxval, ending with $minval\n"; for my $i (0 ... $#_){ if ($_[$i] < $minval){ $minval = $_[$i]; $minpos = $i; } if ($_[$i] == $minval){ $minval = $_[$i]; if (scalar(@minposes) == 0){ push @minposes, $i; push @minvals, $_[$i]; } elsif ($minvals[0] == $minval){ push @minposes, $i; push @minvals, $_[$i]; } else{ @minposes = (); @minvals = (); push @minposes, $i; push @minvals, $_[$i]; } } } #print "minposes=@minposes\n"; return $minpos if scalar(@minposes) < 2; return (@minposes); } sub basic_stats{ my @arr = @_; # print " array_smallest_number= ", array_smallest_number(@arr)," array_largest_number= ", array_largest_number(@arr), " array_mean= ",array_mean(@arr),"\n"; return ":"; } #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx sub maftoAxt_multispecies { my $printer = 0; # print "in maftoAxt_multispecies : got @_\n"; my $fname=$_[0]; open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n"; my $treedefinition = $_[1]; open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n"; my $counter = 0; my $exactspeciesset = $_[3]; my @exactspeciesset_unarranged = split(/,/,$exactspeciesset); $treedefinition=~s/[\)\(, ]/\t/g; my @species=split(/\t+/,$treedefinition); my @exactspecies=(); foreach my $spec (@species){ foreach my $espec (@exactspeciesset_unarranged){ push @exactspecies, $spec if $spec eq $espec; } } # print "exactspecies=@exactspecies\n"; ########### my $select = 2; #select = 1 if all species need sequences to be present for each block otherwise, it is 0 #select = 2 only the allowed set make up the alignment. use the removeset # information to detect alignmenets that have other important genomes aligned. ########### my @allowedset = (); @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0; @allowedset = join("_",0,@species) if $select == 1; #print "species = @species , allowedset =",join("\n", @allowedset) ," \n"; @allowedset = join("_",0,@exactspecies) if $select == 2; #print "allowedset = @allowedset and exactspecies = @exactspecies\n"; my $start = 0; my @sequences = (); my @titles = (); my $species_counter = "0"; my $countermatch = 0; my $outsideSpecies=0; while(my $line = <IN>){ next if $line =~ /^#/; next if $line =~ /^i/; chomp $line; #print "$line"; my @fields = split(/\s+/,$line); chomp $line; if ($line =~ /^a /){ $start = 1; } if ($line =~ /^s /){ # print "fields1 = $fields[1] , start = $start\n"; foreach my $sp (@species){ if ($fields[1] =~ /$sp/){ $species_counter = $species_counter."_".$sp; push(@sequences, $fields[6]); my @sp_info = split(/\./,$fields[1]); my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]); push(@titles, $title); } } } if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){ my $arranged = reorderSpecies($species_counter, @species); my $stopper = 1; my $arrno = 0; foreach my $set (@allowedset){ if ($arranged eq $set){ # print "$arranged == $set\n"; $stopper = 0; last; } $arrno++; } if ($stopper == 0) { # print " accepted\n"; @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged; @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged; my $filteredseq = filter_gaps(@sequences); if ($filteredseq ne "SHORT"){ $counter++; print OUT join (" ",$counter, @titles), "\n"; print OUT $filteredseq, "\n"; print OUT "\n"; $countermatch++; } # my @filtered_seq = split(/\t/,filter_gaps(@sequences) ); } else{#print "\n"; } @sequences = (); @titles = (); $start = 0;$species_counter = "0"; next; } } # print "countermatch = $countermatch\n"; } sub reorderSpecies{ my @inarr=@_; my $currSpecies = shift (@inarr); my $ordered_species = 0; my @species=@inarr; foreach my $order (@species){ $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/; } return $ordered_species; } sub filter_gaps{ my @sequences = @_; # print "sequences sent are @sequences\n"; my $seq_length = length($sequences[0]); my $seq_no = scalar(@sequences); my $allgaps = (); for (1 ... $seq_no){ $allgaps = $allgaps."-"; } my @seq_array = (); my $seq_counter = 0; foreach my $seq (@sequences){ # my @sequence = split(/\s*/,$seq); $seq_array[$seq_counter] = [split(/\s*/,$seq)]; # push @seq_array, [@sequence]; $seq_counter++; } my $g = 0; while ( $g < $seq_length){ last if (!exists $seq_array[0][$g]); my $bases = (); for my $u (0 ... $#seq_array){ $bases = $bases.$seq_array[$u][$g]; } # print $bases, "\n"; if ($bases eq $allgaps){ # print "bases are $bases, position is $g \n"; for my $seq (@seq_array){ splice(@$seq , $g, 1); } } else { $g++; } } my @outs = (); foreach my $seq (@seq_array){ push(@outs, join("",@$seq)); } return "SHORT" if length($outs[0]) <=100; return (join("\n", @outs)); } sub allowedSetOfSpecies{ my @allowed_species = split(/_/,$_[0]); unshift @allowed_species, 0; # print "allowed set = @allowed_species \n"; my @output = (); for (0 ... scalar(@allowed_species) - 4){ push(@output, join("_",@allowed_species)); pop @allowed_species; } return join(";",reverse(@output)); } sub orderInfo{ my @info = split(/;/,$_[0]); # print "info = @info"; my @old = split(/_/,$_[1]); my @new = split(/_/,$_[2]); shift @old; shift @new; my @outinfo = (); foreach my $spe (@new){ for my $no (0 ... $#old){ if ($spe eq $old[$no]){ push(@outinfo, $info[$no]); } } } # print "outinfo = @outinfo \n"; return join(";", @outinfo); } #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx sub printarr { print ">::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n"; foreach my $line (@_) {print "$line\n";} print "::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::<\n"; }