comparison multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl @ 0:275433d3a395 draft

Uploaded tool tarball.
author devteam
date Wed, 25 Sep 2013 11:26:57 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:275433d3a395
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4 use Term::ANSIColor;
5 use File::Basename;
6 use IO::Handle;
7 use Cwd;
8 use File::Path;
9 use vars qw($distance @thresholds @tags $species_set @allspecies $printer $treeSpeciesNum $focalspec $mergestarts $mergeends $mergemicros $interrtypecord $microscanned $interrcord $interr_poscord $no_of_interruptionscord $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species $gapcord $prinkter);
10 use File::Path qw(make_path remove_tree);
11 use File::Temp qw/ tempfile tempdir /;
12 my $tdir = tempdir( CLEANUP => 1 );
13 chdir $tdir;
14 my $dir = getcwd;
15 #print "dir = $dir\n";
16
17 #$ENV{'PATH'} .= ':' . dirname($0);
18 my $date = `date`;
19
20 my ($mafile, $mafile_sputt, $orthfile, $threshold_array, $allspeciesin, $tree_definition_all, $separation) = @ARGV;
21 if (!$mafile or !$mafile_sputt or !$orthfile or !$threshold_array or !$separation or !$tree_definition_all or !$allspeciesin) { die "missing arguments\n"; }
22
23 $tree_definition_all =~ s/\s+//g;
24 $threshold_array =~ s/\s+//g;
25 $allspeciesin =~ s/\s+//g;
26 #-------------------------------------------------------------------------------
27 # WHICH SPUTNIK USED?
28 my $sputnikpath = ();
29 $sputnikpath = "sputnik_lowthresh_MATCH_MIN_SCORE3" ;
30 #$sputnikpath = "/Users/ydk/work/rhesus_microsat/codes/./sputnik_Mac-PowerPC";
31 #print "sputnik_Mac-PowerPC non-existant\n" if !-e $sputnikpath;
32 #exit if !-e $sputnikpath;
33 #$sputnikpath = "bx-sputnik" ;
34 #print "ARGV input = @ARGV\n";
35 #print "ARGV input :\n mafile=$mafile\n orthfile=$orthfile\n threshold_array=$threshold_array\n species_set=$species_set\n tree_definition=$tree_definition\n separation=$separation\n";
36 #-------------------------------------------------------------------------------
37 # RUNFILE
38 #-------------------------------------------------------------------------------
39 $distance = 1; #bp
40 $distance++;
41 my @tree_definitions=MakeTrees($tree_definition_all);
42 my $allspeciesset = $tree_definition_all;
43 $allspeciesset =~ s/[\(\) ]+//g;
44 @allspecies = split(/,/,$allspeciesset);
45
46 my @outputfiles = ();
47 my $round = 0;
48 #my $tdir = tempdir( CLEANUP => 0 );
49 #chdir $tdir;
50
51 foreach my $tree_definition (@tree_definitions){
52 my @commas = ($tree_definition =~ /,/g) ;
53 #print "commas = @commas\n"; <STDIN>;
54 next if scalar(@commas) <= 1;
55 #print "species_set = $species_set\n";
56 $treeSpeciesNum = scalar(@commas) + 1;
57 $species_set = $tree_definition;
58 $species_set =~ s/[\)\( ;]+//g;
59 #print "species_set = $species_set\n"; <STDIN>;
60
61 $round++;
62 #-------------------------------------------------------------------------------
63 # MICROSATELLITE THRESHOLD SETTINGS (LENGTH, BP)
64 $threshold_array=~ s/,/_/g;
65 my @thresharr = split("_",$threshold_array);
66 @thresholds=@thresharr;
67 #my $threshold_array = join("_",($mono_threshold, $di_threshold, $tri_threshold, $tetra_threshold));
68 #print "current dit=$dir\n";
69 #-------------------------------------------------------------------------------
70 # CREATE AXT FILES IN FORWARD AND REVERSE ORDERS IF NECESSARY
71 my @chrfiles=();
72
73 #my $mafile = "/Users/ydk/work/rhesus_microsat/results/galay/align.txt"; #$ARGV[0];
74 my $chromt=int(rand(10000));
75 my $p_chr=$chromt;
76
77
78 my @exactspeciesset_unarranged = split(/,/,$species_set);
79 $tree_definition=~s/[\)\(, ]/\t/g;
80 my @treespecies=split(/\t+/,$tree_definition);
81 my @exactspecies=();
82
83 foreach my $spec (@treespecies){
84 foreach my $espec (@exactspeciesset_unarranged){
85 push @exactspecies, $spec if $spec eq $espec;
86 }
87 }
88 #print "exactspecies=@exactspecies\n";
89 $focalspec = $exactspecies[0];
90 my $arranged_species_set=join(".",@exactspecies);
91 my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt");
92 my $chr_name_sputt = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt_sputt");
93 #print "sending to maftoAxt_multispecies: $mafile, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n";
94 maftoAxt_multispecies($mafile, $tree_definition, $chr_name, $species_set);
95 maftoAxt_multispecies($mafile_sputt, $tree_definition, $chr_name_sputt, $species_set);
96 #print "done maf to axt conversion\n";
97 my $reverse_chr_name = join(".",("chr".$p_chr."r"),$arranged_species_set, "net", "axt");
98 artificial_axdata_inverter ($chr_name, $reverse_chr_name);
99 #print "reverse_chr_name=$reverse_chr_name\n";
100 #-------------------------------------------------------------------------------
101 # FIND THE CORRESPONDING CHIMP CHROMOSOME FROM FILE ORTp_chrS.TXT
102 foreach my $direct ("reverse_direction","forward_direction"){
103 $p_chr=$chromt;
104 #print "direction = $direct\n";
105 $p_chr = $p_chr."r" if $direct eq "reverse_direction";
106 $p_chr = $p_chr if $direct eq "forward_direction";
107 my $config = $species_set;
108 $config=~s/,/./g;
109 my @orgs = split(/\./,$arranged_species_set);
110 #print "ORGS= @orgs\n";
111 my @tag=@orgs;
112
113
114 my $tags = join(",", @tag);
115 my @tags=@tag;
116 chomp $p_chr;
117 $tags = join("_", split(/,/, $tags));
118 my $pchr = "chr".$p_chr;
119
120 my $ptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1])."-".$threshold_array;
121 my @sp_tags = ();
122
123 # print "$ptag _ orthfile\n"; <STDIN>;
124 #print "orgs=@orgs, pchr=$pchr, hence, ptag = $ptag\n";
125 foreach my $sp (@tag){
126 push(@sp_tags, ($sp.".".$ptag));
127 }
128
129 my $preptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1]);
130 my @presp_tags = ();
131
132 foreach my $sp (@tag){
133 push(@presp_tags, ($sp.".".$preptag));
134 }
135
136 my $resultdir = "";
137 my $orthdir = "";
138 my $filtereddir = "";
139 my $pipedir = "";
140
141 my @title_queries = ();
142 push(@title_queries, "^[0-9]+");
143 my $sep="\\s";
144 for my $or (0 ... $#orgs){
145 my $title = join($sep, ($orgs[$or], "[A-Za-z_]+[0-9a-zA-Z]+", "[0-9]+", "[0-9]+", "[\\-\\+]"));
146 #$title =~ s/chr\\+\\s+\+/chr/g;
147 push(@title_queries, $title);
148 }
149 my $title_query = join($sep, @title_queries);
150 #print "title_queries=@title_queries\n";
151 #print "query = >$title_query<\n";
152 #print "orgs = @orgs\n";
153 #-------------------------------------------------------------------------------
154 # GET AXTNET FILES, EDIT THEM AND SPLIT THEM INTO HUMAN AND CHIMP INPUT FILES
155 my $t1input = $pchr.".".$arranged_species_set.".net.axt";
156
157 my @t1outputs = ();
158
159 foreach my $sp (@presp_tags){
160 push(@t1outputs, $sp."_gap_op");
161 }
162
163
164
165 multi_species_t1($t1input,$tags,(join(",", @t1outputs)), $title_query);
166 #print "t1outputs=@t1outputs\n";
167 #print "done t1\n"; <STDIN>;
168 #-------------------------------------------------------------------------------
169 #START T2.PL
170
171 my $stag = (); my $tag1 = (); my $tag2 = (); my $schrs = ();
172
173 for my $t (0 ... scalar(@tags)-1){
174 multi_species_t2($t1outputs[$t], $tag[$t]);
175 }
176 #-------------------------------------------------------------------------------
177 #START T2.2.PL
178
179 my @temp_tags = @tag;
180
181 foreach my $sp (@presp_tags){
182 my $t2input = $sp."_nogap_op_unrand";
183 multi_species_t2_2($t2input, shift(@temp_tags));
184 }
185 undef (@temp_tags);
186
187 #-------------------------------------------------------------------------------
188 #START SPUTNIK
189
190 my @jobIDs = ();
191 @temp_tags = @tag;
192 my @sput_filelist = ();
193
194 foreach my $sp (@presp_tags){
195 #print "sp = $sp\n";
196 my $sputnikoutput = $pipedir.$sp."_sput_op0";
197 my $sputnikinput = $pipedir.$sp."_nogap_op_unrand";
198 push(@sput_filelist, $sputnikinput);
199 my $sputnikcommand = $sputnikpath." ".$sputnikinput." > ".$sputnikoutput;
200 # print "$sputnikcommand\n";
201 my @sputnikcommand_system = $sputnikcommand;
202 system(@sputnikcommand_system);
203 }
204
205 #-------------------------------------------------------------------------------
206 #START SPUTNIK OUTPUT CORRECTOR
207
208 foreach my $sp (@presp_tags){
209 my $corroutput = $pipedir.$sp."_sput_op1";
210 my $corrinput = $pipedir.$sp."_sput_op0";
211 sputnikoutput_corrector($corrinput,$corroutput);
212
213 my $t4output = $pipedir.$sp."_sput_op2";
214 multi_species_t4($corroutput,$t4output);
215
216 my $t5output = $pipedir.$sp."_sput_op3";
217 multi_species_t5($t4output,$t5output);
218 #print "done t5.pl for $sp\n";
219
220 my $t6output = $pipedir.$sp."_sput_op4";
221 multi_species_t6($t5output,$t6output,scalar(@orgs));
222 }
223 #-------------------------------------------------------------------------------
224 #START T9.PL FOR T10.PL AND FOR INTERRUPTED HUNTING
225
226 foreach my $sp (@presp_tags){
227 my $t9output = $pipedir.$sp."_gap_op_unrand_match";
228 my $t9sequence = $pipedir.$sp."_gap_op_unrand2";
229 my $t9micro = $pipedir.$sp."_sput_op4";
230 t9($t9micro,$t9sequence,$t9output);
231
232 my $t9output2 = $pipedir.$sp."_nogap_op_unrand2_match";
233 my $t9sequence2 = $pipedir.$sp."_nogap_op_unrand2";
234 t9($t9micro,$t9sequence2,$t9output2);
235 }
236 #print "done both t9.pl for all orgs\n";
237
238 #-------------------------------------------------------------------------------
239 # FIND COMPOUND MICROSATELLITES
240
241 @jobIDs = ();
242 my $species_counter = 0;
243
244 foreach my $sp (@presp_tags){
245 my $simple_microsats=$pipedir.$sp."_sput_op4_simple";
246 my $compound_microsats=$pipedir.$sp."_sput_op4_compound";
247 my $input_micro = $pipedir.$sp."_sput_op4";
248 my $input_seq = $pipedir.$sp."_nogap_op_unrand2_match";
249 multiSpecies_compound_microsat_hunter3($input_micro,$input_seq,$simple_microsats,$compound_microsats,$orgs[$species_counter], scalar(@sp_tags), $threshold_array );
250 $species_counter++;
251 }
252
253 #-------------------------------------------------------------------------------
254 # READING AND FILTERING SIMPLE MICROSATELLITES
255 my $spcounter2=0;
256 foreach my $sp (@sp_tags){
257 my $presp = $presp_tags[$spcounter2];
258 $spcounter2++;
259 my $simple_microsats=$pipedir.$presp."_sput_op4_simple";
260 my $simple_filterout = $pipedir.$sp."_sput_op4_simple_filtered";
261 my $simple_residue = $pipedir.$sp."_sput_op4_simple_residue";
262 multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
263 }
264
265 #-------------------------------------------------------------------------------
266 # ANALYZE COMPOUND MICROSATELLITES FOR BEING INTERRUPTED MICROSATS
267
268 $species_counter = 0;
269 foreach my $sp (@sp_tags){
270 my $presp = $presp_tags[$species_counter];
271 my $compound_microsats = $pipedir.$presp."_sput_op4_compound";
272 my $analyzed_simple_microsats=$pipedir.$presp."_sput_op4_compound_interrupted";
273 my $analyzed_compound_microsats=$pipedir.$presp."_sput_op4_compound_pure";
274 my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
275 multiSpecies_compound_microsat_analyzer($compound_microsats,$seq_file,$analyzed_simple_microsats,$analyzed_compound_microsats,$orgs[$species_counter], scalar(@sp_tags));
276 $species_counter++;
277 }
278 #-------------------------------------------------------------------------------
279 # REANALYZE COMPOUND MICROSATELLITES FOR PRESENCE OF SIMPLE ONES WITHIN THEM..
280 $species_counter = 0;
281
282 foreach my $sp (@sp_tags){
283 my $presp = $presp_tags[$species_counter];
284 my $compound_microsats = $pipedir.$presp."_sput_op4_compound_pure";
285 my $compound_interrupted = $pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
286 my $compound_compound = $pipedir.$presp."_sput_op4_compound_compound";
287 my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
288 multiSpecies_compoundClarifyer($compound_microsats,$seq_file,$compound_interrupted,$compound_compound,$orgs[$species_counter], scalar(@sp_tags), "2_4_6_8", "3_4_6_8", "2_4_6_8");
289 $species_counter++;
290 }
291 #-------------------------------------------------------------------------------
292 # READING AND FILTERING SIMPLE AND COMPOUND MICROSATELLITES
293 $species_counter = 0;
294
295 foreach my $sp (@sp_tags){
296 my $presp = $presp_tags[$species_counter];
297
298 my $simple_microsats=$pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
299 my $simple_filterout = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
300 my $simple_residue = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_residue";
301 multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
302
303 my $simple_microsats2 = $pipedir.$presp."_sput_op4_compound_interrupted";
304 my $simple_filterout2 = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
305 my $simple_residue2 = $pipedir.$sp."_sput_op4_compound_interrupted_residue";
306 multiSpecies_filtering_interrupted_microsats($simple_microsats2, $simple_filterout2, $simple_residue2,$threshold_array,$threshold_array,scalar(@sp_tags));
307
308 my $compound_microsats=$pipedir.$presp."_sput_op4_compound_compound";
309 my $compound_filterout = $pipedir.$sp."_sput_op4_compound_compound_filtered";
310 my $compound_residue = $pipedir.$sp."_sput_op4_compound_compound_residue";
311 multispecies_filtering_compound_microsats($compound_microsats, $compound_filterout, $compound_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
312 $species_counter++;
313 }
314 #print "done filtering both simple and compound microsatellites \n";
315
316 #-------------------------------------------------------------------------------
317
318 my @combinedarray = ();
319 my @combinedarray_indicators = ("mononucleotide", "dinucleotide", "trinucleotide", "tetranucleotide");
320 my @combinedarray_tags = ("mono", "di", "tri", "tetra");
321 $species_counter = 0;
322
323 foreach my $sp (@sp_tags){
324 my $simple_interrupted = $pipedir.$sp."_simple_analyzed_simple";
325 push @{$combinedarray[$species_counter]}, $pipedir.$sp."_simple_analyzed_simple_mono", $pipedir.$sp."_simple_analyzed_simple_di", $pipedir.$sp."_simple_analyzed_simple_tri", $pipedir.$sp."_simple_analyzed_simple_tetra";
326 $species_counter++;
327 }
328
329 #-------------------------------------------------------------------------------
330 # PUT TOGETHER THE INTERRUPTED AND SIMPLE MICROSATELLITES BASED ON THEIR MOTIF SIZE FOR FURTHER EXTENTION
331 my $sp_counter = 0;
332 foreach my $sp (@sp_tags){
333 my $analyzed_simple = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
334 my $clarifyed_simple = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
335 my $simple = $pipedir.$sp."_sput_op4_simple_filtered";
336 my $simple_analyzed_simple = $pipedir.$sp."_simple_analyzed_simple";
337 `cat $analyzed_simple $clarifyed_simple $simple > $simple_analyzed_simple`;
338 for my $i (0 ... 3){
339 `grep "$combinedarray_indicators[$i]" $simple_analyzed_simple > $combinedarray[$sp_counter][$i]`;
340 }
341 $sp_counter++;
342 }
343 #print "\ndone grouping interrupted & simple microsats based on their motif size for further extention\n";
344
345 #-------------------------------------------------------------------------------
346 # BREAK CHROMOSOME INTO PARTS OF CERTAIN NO. CONTIGS EACH, FOR FUTURE SEARCHING OF INTERRUPTED MICROSATELLITES
347 # ESPECIALLY DI, TRI AND TETRANUCLEOTIDE MICROSATELLITES
348 @temp_tags = @sp_tags;
349 my $increment = 1000000;
350 my @splist = ();
351 my $targetdir = $pipedir;
352 $species_counter=0;
353
354 foreach my $sp (@sp_tags){
355 my $presp = $presp_tags[$species_counter];
356 $species_counter++;
357 my $localtag = shift @temp_tags;
358 my $locallist = $targetdir.$localtag."_".$p_chr."_list";
359 push(@splist, $locallist);
360 my $input = $pipedir.$presp."_nogap_op_unrand2_match";
361 chromosome_unrand_breaker($input,$targetdir,$locallist,$increment, $localtag, $pchr);
362 }
363
364
365 my @unionarray = ();
366 #print "splist=@splist\n";
367 #-------------------------------------------------------------------------------
368 # FIND INTERRUPTED MICROSATELLITES
369
370 $species_counter = 0;
371
372 for my $i (0 .. $#combinedarray){
373
374 @jobIDs = ();
375 open (JLIST1, "$splist[$i]") or die "Cannot open file $splist[$i]: $!";
376
377 while (my $sp1 = <JLIST1>){
378 #print "$splist[$i]: sp1=$sp1\n";
379 chomp $sp1;
380
381 for my $j (0 ... $#combinedarray_tags){
382 my $interr = $sp1."_interr_".$combinedarray_tags[$j];
383 my $simple = $sp1."_simple_".$combinedarray_tags[$j];
384 push @{$unionarray[$i]}, $interr, $simple;
385 multiSpecies_interruptedMicrosatHunter($combinedarray[$i][$j],$sp1,$interr ,$simple, $orgs[$species_counter], scalar(@sp_tags), "3_4_6_8");
386 }
387 }
388 $species_counter++;
389 }
390 close JLIST1;
391 #-------------------------------------------------------------------------------
392 # REUNION AND ZIPPING BEFORE T10.PL
393
394 my @allarray = ();
395
396 for my $i (0 ... $#sp_tags){
397 my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats";
398 unlink $localfile if -e $localfile;
399 push(@allarray, $localfile);
400
401 my $unfiltered_localfile= $localfile."_unfiltered";
402 my $residue_localfile= $localfile."_residue";
403
404 unlink $unfiltered_localfile;
405 #unlink $unfiltered_localfile;
406 for my $j (0 ... $#{$unionarray[$i]}){
407 #print "listing files for species $i and list number $j= \n$unionarray[$i][$j] \n";
408 `cat $unionarray[$i][$j] >> $unfiltered_localfile`;
409 unlink $unionarray[$i][$j];
410 }
411
412 multiSpecies_filtering_interrupted_microsats($unfiltered_localfile, $localfile, $residue_localfile,$threshold_array,$threshold_array,scalar(@sp_tags) );
413 my $analyzed_compound = $pipedir.$sp_tags[$i]."_sput_op4_compound_compound_filtered";
414 my $simple_residue = $pipedir.$sp_tags[$i]."_sput_op4_simple_residue";
415 my $compound_residue = $pipedir.$sp_tags[$i]."_sput_op4_compound_residue";
416
417 `cat $analyzed_compound >> $localfile`;
418 }
419 #-------------------------------------------------------------------------------
420 # MERGING MICROSATELLITES THAT ARE VERY CLOSE TO EACH OTHER, INCLUDING THOSE FOUND BY SEARCHING IN 2 OPPOSIT DIRECTIONS
421
422 my $toescape=0;
423
424
425 for my $i (0 ... $#sp_tags){
426 my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats";
427 $localfile =~ /$focalspec\-(chr[0-9a-zA-Z]+)\./;
428 my $direction = $1;
429 #print "localfile = $localfile , direction = $direction\n";
430 # `gzip $reverse_chr_name` if $direction =~ /chr[0-9a-zA-Z]+r/ && $switchboard{"deleting_processFiles"} != 1;
431 $toescape =1 if $direction =~ /chr[0-9a-zA-Z]+r/;
432 last if $direction =~ /chr[0-9a-zA-Z]+r/;
433 my $nogap_sequence = $pipedir.$presp_tags[$i]."_nogap_op_unrand2_match";
434 my $gap_sequence = $pipedir.$presp_tags[$i]."_gap_op_unrand_match";
435 my $reverselocal = $localfile;
436 $reverselocal =~ s/\-chr([0-9a-zA-Z]+)\./-chr$1r./g;
437 merge_interruptedMicrosats($nogap_sequence,$localfile, $reverselocal ,scalar(@sp_tags));
438 #-------------------------------------------------------------------------------
439 my $forward_separate = $localfile."_separate";
440 my $reverse_separate = $reverselocal."_separate";
441 my $diff = $forward_separate."_diff";
442 my $miss = $forward_separate."_miss";
443 my $common = $forward_separate."_common";
444 forward_reverse_sputoutput_comparer($nogap_sequence,$forward_separate, $reverse_separate, $diff, $miss, $common ,scalar(@sp_tags));
445 #-------------------------------------------------------------------------------
446 my $symmetrical_file = $localfile."_symmetrical";
447 my $merged_file = $localfile."_merged";
448 #print "cating: $merged_file $common into -> $symmetrical_file \n";
449 `cat $merged_file $common > $symmetrical_file`;
450 #-------------------------------------------------------------------------------
451 my $t10output = $symmetrical_file."_fin_hit_all_2";
452 new_multispecies_t10($gap_sequence, $symmetrical_file, $t10output, join(".", @orgs));
453 #-------------------------------------------------------------------------------
454 }
455 next if $toescape == 1;
456 #------------------------------------------------------------------------------------------------
457 # BRINGING IT ALL TOGETHER: FINDING ORTHOLOGOUS MICROSATELLITES AMONG THE SPECIES
458
459
460 my @micros_array = ();
461 my $sampletag = ();
462 for my $i (0 ... $#sp_tags){
463 my $finhitFile = $pipedir.$sp_tags[$i]."_allmicrosats_symmetrical_fin_hit_all_2";
464 push(@micros_array, $finhitFile);
465 $sampletag = $sp_tags[$i];
466 }
467 #$sampletag =~ s/^([A-Z]+\.)/ORTH_/;
468 #$sampletag = $sampletag."_monoThresh-".$mono_threshold."bp";
469 my $orthfiletemp = $ptag."_orthfile";
470 my $orthanswer = multiSpecies_orthFinder4($t1input, join(":",@micros_array), $orthfiletemp, join(":", @orgs), $separation);
471
472 my $maskedorthfiletemp = $ptag."_orthfile_masked";
473 qualityFilter ($orthfiletemp, $chr_name_sputt, $maskedorthfiletemp);
474
475 push @outputfiles , $maskedorthfiletemp;
476 }
477 $date = `date`;
478 }
479
480 `cat @outputfiles > $orthfile`;
481
482 my $rootdir = $dir;
483 $rootdir =~ s/\/[A-Za-z0-9\-_]+$//;
484 chdir $rootdir;
485 remove_tree($dir);
486
487 #print "date = $date\n";
488 #remove_tree($tdir);
489 #------------------------------------------------------------------------------------------------
490 #------------------------------------------------------------------------------------------------
491 #------------------------------------------------------------------------------------------------
492 #------------------------------------------------------------------------------------------------
493
494 #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
495
496 sub maftoAxt_multispecies {
497 #print "in maftoAxt_multispecies : got @_\n";
498 my $fname=$_[0];
499 open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n";
500 my $treedefinition = $_[1];
501 open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n";
502 my $counter = 0;
503 my $exactspeciesset = $_[3];
504 my @exactspeciesset_unarranged = split(/,/,$exactspeciesset);
505
506 $treedefinition=~s/[\)\(, ]/\t/g;
507 my @species=split(/\t+/,$treedefinition);
508 my @exactspecies=();
509
510 foreach my $spec (@species){
511 foreach my $espec (@exactspeciesset_unarranged){
512 push @exactspecies, $spec if $spec eq $espec;
513 }
514 }
515 #print "exactspecies=@exactspecies\n";
516
517 ###########
518 my $select = 2;
519 #select = 1 if all species need sequences to be present for each block otherwise, it is 0
520 #select = 2 only the allowed set make up the alignment. use the removeset
521 # information to detect alignmenets that have other important genomes aligned.
522 ###########
523 my @allowedset = ();
524 @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0;
525 @allowedset = join("_",0,@species) if $select == 1;
526 #print "species = @species , allowedset =",join("\n", @allowedset) ," \n";
527 @allowedset = join("_",0,@exactspecies) if $select == 2;
528 #print "allowedset = @allowedset and exactspecies = @exactspecies\n";
529
530 my $start = 0;
531 my @sequences = ();
532 my @titles = ();
533 my $species_counter = "0";
534 my $countermatch = 0;
535 my $outsideSpecies=0;
536
537 while(my $line = <IN>){
538 # print $line;
539 next if $line =~ /^#/;
540 next if $line =~ /^i/;
541 chomp $line;
542 my @fields = split(/\s+/,$line);
543 chomp $line;
544 if ($line =~ /^a /){
545 $start = 1;
546 }
547
548 if ($line =~ /^s /){
549
550 foreach my $sp (@allspecies){
551 # print "checking species $sp\n";
552 if ($fields[1] =~ /$sp/){
553 $species_counter = $species_counter."_".$sp;
554 push(@sequences, $fields[6]);
555 my @sp_info = split(/\./,$fields[1]);
556 my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]);
557 push(@titles, $title);
558 # print "species_counter = $species_counter\n";
559 }
560 }
561 }
562
563 if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){
564 # print "species_counter = $species_counter\n";
565 my $arranged = reorderSpecies($species_counter, @allspecies);
566 my $stopper = 1;
567 my $arrno = 0;
568
569 # print "checking if ", scalar(@sequences), " match @exactspecies allowedset=@allowedset\n";
570 if (scalar(@sequences) == scalar(@exactspecies)){
571 foreach my $set (@allowedset){
572 # print "testing $arranged against $set\n";
573 if ($arranged eq $set){
574 $stopper = 0; last;
575 }
576 $arrno++;
577 }
578 }
579 else{
580 $stopper = 1;
581 }
582
583
584 if ($stopper == 0) {
585 @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged;
586 @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged;
587 my $filteredseq = filter_gaps(@sequences);
588
589 if ($filteredseq ne "SHORT"){
590 #print "printing"; <STDIN>;
591 $counter++;
592 print OUT join (" ",$counter, @titles), "\n";
593 print OUT $filteredseq, "\n";
594 print OUT "\n";
595 $countermatch++;
596 }
597 }
598 else{ #print "nexting\n";<STDIN>;
599 }
600
601 @sequences = (); @titles = (); $start = 0;$species_counter = "0";
602 next;
603
604 }
605 }
606 # print "countermatch = $countermatch\n";
607 }
608
609 sub reorderSpecies{
610 my @inarr=@_;
611 my $currSpecies = shift (@inarr);
612 my $ordered_species = 0;
613 my @species=@inarr;
614 #print "species = @species\n";
615 foreach my $order (@species){
616 $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/;
617 }
618 return $ordered_species;
619
620 }
621
622 sub filter_gaps{
623 my @sequences = @_;
624 # print "sequences sent are @sequences\n";
625 my $seq_length = length($sequences[0]);
626 my $seq_no = scalar(@sequences);
627 my $allgaps = ();
628 for (1 ... $seq_no){
629 $allgaps = $allgaps."-";
630 }
631
632 my @seq_array = ();
633 my $seq_counter = 0;
634 foreach my $seq (@sequences){
635 # my @sequence = split(/\s*/,$seq);
636 $seq_array[$seq_counter] = [split(/\s*/,$seq)];
637 # push @seq_array, [@sequence];
638 $seq_counter++;
639 }
640 my $g = 0;
641 while ( $g < $seq_length){
642 last if (!exists $seq_array[0][$g]);
643 my $bases = ();
644 for my $u (0 ... $#seq_array){
645 $bases = $bases.$seq_array[$u][$g];
646 }
647 # print $bases, "\n";
648 if ($bases eq $allgaps){
649 # print "bases are $bases, position is $g \n";
650 for my $seq (@seq_array){
651 splice(@$seq , $g, 1);
652 }
653 }
654 else {
655 $g++;
656 }
657 }
658
659 my @outs = ();
660
661 foreach my $seq (@seq_array){
662 push(@outs, join("",@$seq));
663 }
664 return "SHORT" if length($outs[0]) <=100;
665 return (join("\n", @outs));
666 }
667
668
669 sub allowedSetOfSpecies{
670 my @allowed_species = split(/_/,$_[0]);
671 unshift @allowed_species, 0;
672 # print "allowed set = @allowed_species \n";
673 my @output = ();
674 for (0 ... scalar(@allowed_species) - 4){
675 push(@output, join("_",@allowed_species));
676 pop @allowed_species;
677 }
678 return join(";",reverse(@output));
679
680 }
681
682
683 sub orderInfo{
684 my @info = split(/;/,$_[0]);
685 # print "info = @info";
686 my @old = split(/_/,$_[1]);
687 my @new = split(/_/,$_[2]);
688 shift @old; shift @new;
689 my @outinfo = ();
690 foreach my $spe (@new){
691 for my $no (0 ... $#old){
692 if ($spe eq $old[$no]){
693 push(@outinfo, $info[$no]);
694 }
695 }
696 }
697 # print "outinfo = @outinfo \n";
698 return join(";", @outinfo);
699 }
700
701 #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
702
703 #xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx
704 sub artificial_axdata_inverter{
705 open(IN,"<$_[0]") or die "Cannot open file $_[0]: $!";
706 open(OUT,">$_[1]") or die "Cannot open file $_[1]: $!";
707 my $linecounter=0;
708 while (my $line = <IN>){
709 $linecounter++;
710 #print "$linecounter\n";
711 chomp $line;
712 my $final_line = $line;
713 my $trycounter = 0;
714 if ($line =~ /^[a-zA-Z\-]/){
715 # while ($final_line eq $line){
716 my @fields = split(/\s*/,$line);
717
718 $final_line = join("",reverse(@fields));
719 # print colored ['red'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
720 # $trycounter++;
721 # print "trying again....$trycounter : $final_line\n" if $final_line eq $line;
722 # }
723 }
724
725 # print colored ['yellow'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
726 if ($line =~ /^[0-9]/){
727 $line =~ s/chr([A-Z0-9a-b]+)/chr$1r/g;
728 $final_line = $line;
729 }
730 print OUT $final_line,"\n";
731 #print "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
732 }
733 close OUT;
734 }
735 #xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx
736
737
738 #xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx
739
740 sub multi_species_t1 {
741
742 my $input1 = $_[0];
743 #print "@_\n"; <STDIN>;
744 my @tags = split(/_/, $_[1]);
745 my @outputs = split(/,/, $_[2]);
746 my $title_query = $_[3];
747 my @handles = ();
748
749 open(FILEB,"<$input1")or die "Cannot open file: $input1 $!";
750 my $i = 0;
751 foreach my $path (@outputs){
752 $handles[$i] = IO::Handle->new();
753 open ($handles[$i], ">$path") or die "Can't open $path : $!";
754 $i++;
755 }
756
757 my $curdef;
758 my $start = 0;
759
760 while (my $line = <FILEB> ) {
761 if ($line =~ /^\d/){
762 $line =~ s/ +/\t/g;
763 my @fields = split(/\s+/, $line);
764 if (($line =~ /$title_query/)){
765 my $title = $line;
766 my $counter = 0;
767 foreach my $tag (@tags){
768 $line = <FILEB>;
769 print {$handles[$counter]} ">",$tag,"\t",$title, " ",$line;
770 $counter++;
771 }
772 }
773 else{
774 foreach my $tag (@tags){
775 my $tine = <FILEB>;
776 }
777 }
778
779 }
780 }
781
782 foreach my $hand (@handles){
783 $hand->close();
784 }
785
786 close FILEB;
787 }
788
789 #xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx
790
791 #xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx
792
793 sub multi_species_t2{
794
795 my $input = $_[0];
796 my $species = $_[1];
797 my $output1 = $input."_unr";
798
799 #------------------------------------------------------------------------------------------
800 open (FILEF1, "<$input") or die "Cannot open file $input :$!";
801 open (FILEF2, ">$output1") or die "Cannot open file $output1 :$!";
802
803 my $line1 = <FILEF1>;
804
805 while($line1){
806 {
807 # chomp($line);
808 if ($line1 =~ (m/^\>$species/)){
809 chomp($line1);
810 print FILEF2 $line1;
811 $line1 = <FILEF1>;
812 chomp($line1);
813 print FILEF2 "\t", $line1,"\n";
814 }
815 }
816 $line1 = <FILEF1>;
817 }
818
819 close FILEF1;
820 close FILEF2;
821 #------------------------------------------------------------------------------------------
822
823 my $output2 = $output1."and";
824 my $output3 = $output1."and2";
825 open(IN,"<$output1");
826 open (FILEF3, ">$output2");
827 open (FILEF4, ">$output3");
828
829
830 while (<IN>){
831 my $line = $_;
832 chomp($line);
833 my @fields=split (/\t/, $line);
834 # print $line,"\n"; <STDIN>;
835 if($line !~ /random/){
836 print FILEF3 join ("\t",@fields[0 ... scalar(@fields)-2]), "\n", $fields[scalar(@fields)-1], "\n";
837 print FILEF4 join ("\t",@fields[0 ... scalar(@fields)-2]), "\t", $fields[scalar(@fields)-1], "\n";
838 }
839 }
840
841
842 close IN;
843 close FILEF3;
844 close FILEF4;
845 unlink $output1;
846
847 #------------------------------------------------------------------------------------------
848 # OLD T3.PL RUDIMENT
849
850 my $t3output = $output2;
851 $t3output =~ s/gap_op_unrand/nogap_op_unrand/g;
852
853 open(IN,"<$output2");
854 open(OUTA,">$t3output");
855
856
857 while (<IN>){
858 s/-//g unless /^>/;
859 print OUTA;
860 }
861
862 close IN;
863 close OUTA;
864 #------------------------------------------------------------------------------------------
865 }
866 #xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx
867
868
869 #xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxxmulti_species_t2_2 xxxxxxx
870 sub multi_species_t2_2{
871 #print "IN multi_species_t2_2 : @_\n";
872 my $input = $_[0];
873 my $species = $_[1];
874 my $output1 = $input."2";
875
876
877 open (FILEF1, "<$input");
878 open (FILEF2, ">$output1");
879
880 my $line1 = <FILEF1>;
881
882 while($line1){
883 {
884 # chomp($line);
885 if ($line1 =~ (m/^\>$species/)){
886 chomp($line1);
887 print FILEF2 $line1;
888 $line1 = <FILEF1>;
889 chomp($line1);
890 print FILEF2 "\t", $line1,"\n";
891 }
892 }
893 $line1 = <FILEF1>;
894 }
895
896 close FILEF1;
897 close FILEF2;
898 }
899
900 #xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx
901
902
903 #xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx
904 sub sputnikoutput_corrector{
905 my $input = $_[0];
906 my $output = $_[1];
907 open(IN,"<$input") or die "Cannot open file $input :$!";
908 open(OUT,">$output") or die "Cannot open file $output :$!";
909 my $tine;
910 while (my $line=<IN>){
911 if($line =~/length /){
912 $tine = $line;
913 $tine =~ s/\s+/\t/g;
914 my @fields = split(/\t/,$tine);
915 if ($fields[6] > 60){
916 print OUT $line;
917 $line = <IN>;
918
919 while (($line !~ /nucleotide/) && ($line !~ /^>/)){
920 chomp $line;
921 print OUT $line;
922 $line = <IN>;
923 }
924 print OUT "\n";
925 print OUT $line;
926 }
927 else{
928 print OUT $line;
929 }
930 }
931 else{
932 print OUT $line;
933 }
934 }
935 close IN;
936 close OUT;
937 }
938 #xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx
939
940
941 #xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx
942 sub multi_species_t4{
943 # print "multi_species_t4 : @_\n";
944 my $input = $_[0];
945 my $output = $_[1];
946 open (FILEA, "<$input");
947 open (FILEB, ">$output");
948
949 my $line = <FILEA>;
950
951 while ($line) {
952 # chomp $line;
953 if ($line =~ />/) {
954 chomp $line;
955 print FILEB $line, "\n";
956 }
957
958
959 if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){
960 chomp $line;
961 print FILEB $line, " " ;
962 $line = <FILEA>;
963 chomp $line;
964 print FILEB $line,"\n";
965 }
966
967 $line = <FILEA>;
968 }
969
970
971 close FILEA;
972 close FILEB;
973
974 }
975
976 #xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx
977
978
979 #xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx
980 sub multi_species_t5{
981
982 my $input = $_[0];
983 my $output = $_[1];
984
985 open(FILEB,"<$input");
986 open(FILEC,">$output");
987
988 my $curdef;
989
990 while (my $line = <FILEB> ) {
991
992 if ($line =~ /^>/){
993 chomp $line;
994 $curdef = $line;
995 next;
996 }
997
998 if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){
999 print FILEC $curdef," ",$line;
1000 }
1001
1002 }
1003
1004
1005 close FILEB;
1006 close FILEC;
1007
1008 }
1009 #xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx
1010
1011
1012 #xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx
1013 sub multi_species_t6{
1014 my $input = $_[0];
1015 my $output = $_[1];
1016 my $focalstrand=$_[3];
1017 # print "inpput = @_\n";
1018 open (FILE, "<$input");
1019 open (FILE_MICRO, ">$output");
1020 my $linecounter=0;
1021 while (my $line = <FILE>){
1022 $linecounter++;
1023 chomp $line;
1024 #print "line = $line\n";
1025 #MONO#
1026 $line =~ /$focalspec\s[a-zA-Z]+[0-9a-zA-Z]+\s[0-9]+\s[0-9]+\s([+\-])/;
1027 my $strand=$1;
1028 my $no_of_species = ($line =~ s/\s+[+\-]\s+/ /g);
1029 #print "line = $line\n";
1030 my $specfieldsend = 2 + ($no_of_species*4) - 1;
1031 my @fields = split(/\s+/, $line);
1032 my @speciesdata = @fields[0 ... $specfieldsend];
1033 $line =~ /([a-z]+nucleotide)\s([0-9]+)\s:\s([0-9]+)/;
1034 my ($tide, $start, $end) = ($1, $2, $3);
1035 #print "no_of_species=$no_of_species.. speciesdata = @speciesdata and ($tide, $start, $end)\n";
1036 if($line =~ /mononucleotide/){
1037 print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], mono($fields[$#fields]),),"\n";
1038 }
1039 #DI#
1040 elsif($line =~ /dinucleotide/){
1041 print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], di($fields[$#fields]),),"\n";
1042 }
1043 #TRI#
1044 elsif($line =~ /trinucleotide/ ){
1045 print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tri($fields[$#fields]),),"\n";
1046 }
1047 #TETRA#
1048 elsif($line =~ /tetranucleotide/){
1049 print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tetra($fields[$#fields]),),"\n";
1050 }
1051 #PENTA#
1052 elsif($line =~ /pentanucleotide/){
1053 #print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], penta($fields[$#fields]),),"\n";
1054 }
1055 else{
1056 # print "not: @fields\n";
1057 }
1058 }
1059 # print "linecounter=$linecounter\n";
1060 close FILE;
1061 close FILE_MICRO;
1062 }
1063
1064 sub mono {
1065 my $st = $_[0];
1066 my $tp = unpack "A1"x(length($st)/1),$st;
1067 my $var1 = substr($tp, 0, 1);
1068 return join ("\t", $var1);
1069 }
1070 sub di {
1071 my $st = $_[0];
1072 my $tp = unpack "A2"x(length($st)/2),$st;
1073 my $var1 = substr($tp, 0, 2);
1074 return join ("\t", $var1);
1075 }
1076 sub tri {
1077 my $st = $_[0];
1078 my $tp = unpack "A3"x(length($st)/3),$st;
1079 my $var1 = substr($tp, 0, 3);
1080 return join ("\t", $var1);
1081 }
1082 sub tetra {
1083 my $st = $_[0];
1084 my $tp = unpack "A4"x(length($st)/4),$st;
1085 my $var1 = substr($tp, 0, 4);
1086 return join ("\t", $var1);
1087 }
1088 sub penta {
1089 my $st = $_[0];
1090 my $tp = unpack "A5"x(length($st)/5),$st;
1091 my $var1 = substr($tp, 0, 5);
1092 return join ("\t", $var1);
1093 }
1094
1095 #xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx
1096
1097
1098 #xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx
1099 sub t9{
1100 my $input1 = $_[0];
1101 my $input2 = $_[1];
1102 my $output = $_[2];
1103
1104
1105 open(IN1,"<$input1") if -e $input1;
1106 open(IN2,"<$input2") or die "cannot open file $_[1] : $!";
1107 open(OUT,">$output") or die "cannot open file $_[2] : $!";
1108
1109
1110 my %seen = ();
1111 my $prevkey = 0;
1112
1113 if (-e $input1){
1114 while (my $line = <IN1>){
1115 chomp($line);
1116 my @fields = split(/\t/,$line);
1117 my $key1 = join ("_K10K1_",@fields[0,1,3,4,5]);
1118 # print "key in t9 = $key1\n";
1119 $seen{$key1}++ if ($prevkey ne $key1) ;
1120 $prevkey = $key1;
1121 }
1122 # print "done first hash\n";
1123 close IN1;
1124 }
1125
1126 while (my $line = <IN2>){
1127 # print $line, "**\n";
1128 if (-e $input1){
1129 chomp($line);
1130 my @fields = split(/\t/,$line);
1131 my $key2 = join ("_K10K1_",@fields[0,1,3,4,5]);
1132 if (exists $seen{$key2}){
1133 print OUT "$line\n" ;
1134 delete $seen{$key2};
1135 }
1136 }
1137 else {
1138 print OUT "$line\n" ;
1139 # print "$line\n" ;
1140 }
1141 }
1142
1143 close IN2;
1144 close OUT;
1145 }
1146 #xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx
1147
1148
1149 #xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx
1150
1151
1152 sub multiSpecies_compound_microsat_hunter3{
1153
1154 my $input1 = $_[0]; ###### the *_sput_op4_ii file
1155 my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2"
1156 my $output1 = $_[2]; ###### plain microsatellite file
1157 my $output2 = $_[3]; ###### compound microsatellite file
1158 my $org = $_[4]; ###### 1 or 2
1159 $no_of_species = $_[5];
1160 #print "IN multiSpecies_compound_microsat_hunter3: @_\n";
1161 #my @tags = split(/\t/,$info);
1162 sub compoundify;
1163 open(IN,"<$input1") or die "Cannot open file $input1 $!";
1164 open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
1165 open(OUT,">$output1") or die "Cannot open file $output1 $!";
1166 open(OUT2,">$output2") or die "Cannot open file $output2 $!";
1167 $infocord = 2 + (4*$no_of_species) - 1;
1168 $startcord = 2 + (4*$no_of_species) + 2 - 1;
1169 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
1170 $endcord = 2 + (4*$no_of_species) + 4 - 1;
1171 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
1172 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
1173 my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
1174
1175 my @thresholds = ("0");
1176 push(@thresholds, split(/_/,$_[6]));
1177 sub thresholdCheck;
1178 my %micros = ();
1179 while (my $line = <IN>){
1180 # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n";
1181 next if $line =~ /\t\t/;
1182 if ($line =~ /^>[A-Za-z0-9_]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
1183 my $key = join("\t",$1, $2, $3, $4, $5);
1184 # print $key, "#-#-#-#-#-#-#-#\n";
1185 push (@{$micros{$key}},$line);
1186 }
1187 else{
1188 }
1189 }
1190 close IN;
1191 my @deletedlines = ();
1192
1193 my $linecount = 0;
1194
1195 while(my $sine = <SEQ>){
1196 my %microstart=();
1197 my %microend=();
1198
1199 my @sields = split(/\t/,$sine);
1200
1201 my $key = ();
1202
1203 if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
1204 $key = join("\t",$1, $2, $3, $4, $5);
1205 # print $key, "<-<-<-<-<-<-<-<\n";
1206 }
1207 else{
1208 }
1209
1210 if (exists $micros{$key}){
1211 $linecount++;
1212 my @microstring = @{$micros{$key}};
1213 my @tempmicrostring = @{$micros{$key}};
1214
1215 foreach my $line (@tempmicrostring){
1216 my @fields = split(/\t/,$line);
1217 my $start = $fields[$startcord];
1218 my $end = $fields[$endcord];
1219 push (@{$microstart{$start}},$line);
1220 push (@{$microend{$end}},$line);
1221 }
1222 my $firstflag = 'down';
1223 while( my $line =shift(@microstring)){
1224 # print "-----------\nline = $line ";
1225 chomp $line;
1226 my @fields = split(/\t/,$line);
1227 my $start = $fields[$startcord];
1228 my $end = $fields[$endcord];
1229 my $startmicro = $line;
1230 my $endmicro = $line;
1231
1232 # print "fields=@fields, start = $start end=$end, startcord=$startcord, endcord=$endcord\n";
1233
1234 delete ($microstart{$start});
1235 delete ($microend{$end});
1236 my $flag = 'down';
1237 my $startflag = 'down';
1238 my $endflag = 'down';
1239 my $prestart = $start - $distance;
1240 my $postend = $end + $distance;
1241 my @compoundlines = ();
1242 my %compoundhash = ();
1243 push (@compoundlines, $line);
1244 push (@{$compoundhash{$line}},$line);
1245 my $startrank = 1;
1246 my $endrank = 1;
1247
1248 while( ($startflag eq "down") || ($endflag eq "down") ){
1249 if ((($prestart < 0) && $firstflag eq "up") || (($postend > length($sields[$sequencepos])) && $firstflag eq "up") ) {
1250 # print "coming to the end of sequence,prestart = $prestart & post end = $postend and sequence length =", length($sields[$sequencepos])," so exiting\n";
1251 last;
1252 }
1253
1254 $firstflag = "up";
1255 if ($startflag eq "down"){
1256 for my $i ($prestart ... $start){
1257
1258 if(exists $microend{$i}){
1259 chomp $microend{$i}[0];
1260 if(exists $compoundhash{$microend{$i}[0]}) {next;}
1261 # print "sending from microend $startmicro, $microend{$i}[0] |||\n";
1262 if (identityMatch_thresholdCheck($startmicro, $microend{$i}[0], $startrank) eq "proceed"){
1263 push(@compoundlines, $microend{$i}[0]);
1264 # print "accepted\n";
1265 my @tields = split(/\t/,$microend{$i}[0]);
1266 $startmicro = $microend{$i}[0];
1267 chomp $startmicro;
1268 $start = $tields[$startcord];
1269 $flag = 'down';
1270 $startrank++;
1271 # print "startcompund = $microend{$i}[0]\n";
1272 delete $microend{$i};
1273 delete $microstart{$start};
1274 $startflag = 'down';
1275 $prestart = $start - $distance;
1276 last;
1277 }
1278 else{
1279 $flag = 'up';
1280 $startflag = 'up';
1281 }
1282 }
1283 else{
1284 $flag = 'up';
1285 $startflag = 'up';
1286 }
1287 }
1288 }
1289
1290 $endrank = $startrank;
1291
1292 if ($endflag eq "down"){
1293 for my $i ($end ... $postend){
1294
1295 if(exists $microstart{$i} ){
1296 chomp $microstart{$i}[0];
1297 if(exists $compoundhash{$microstart{$i}[0]}) {next;}
1298 # print "sending from microstart $endmicro, $microstart{$i}[0] |||\n";
1299
1300 if(identityMatch_thresholdCheck($endmicro,$microstart{$i}[0], $endrank) eq "proceed"){
1301 push(@compoundlines, $microstart{$i}[0]);
1302 # print "accepted\n";
1303 my @tields = split(/\t/,$microstart{$i}[0]);
1304 $end = $tields[$endcord]-0;
1305 $endmicro = $microstart{$i}[0];
1306 $endrank++;
1307 chomp $endmicro;
1308 $flag = 'down';
1309 # print "endcompund = $microstart{$i}[0]\n";
1310 delete $microstart{$i};
1311 delete $microend{$end};
1312 shift @microstring;
1313 $postend = $end + $distance;
1314 $endflag = 'down';
1315 last;
1316 }
1317 else{
1318 $flag = 'up';
1319 $endflag = 'up';
1320 }
1321 }
1322 else{
1323 $flag = 'up';
1324 $endflag = 'up';
1325 }
1326 }
1327 }
1328 # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n";
1329 } #end while( $flag eq "down")
1330 # print "compoundlines = @compoundlines \n";
1331 if (scalar (@compoundlines) == 1){
1332 print OUT $line,"\n";
1333 }
1334 if (scalar (@compoundlines) > 1){
1335 my $compoundline = compoundify(\@compoundlines, $sields[$sequencepos]);
1336 # print $compoundline,"\n";
1337 print OUT2 $compoundline,"\n";
1338 }
1339 } #end foreach my $line (@microstring){
1340 } #if (exists $micros{$key}){
1341
1342
1343 }
1344
1345 close OUT;
1346 close OUT2;
1347 }
1348
1349
1350 #------------------------------------------------------------------------------------------------
1351 sub compoundify{
1352 my ($compoundlines, $sequence) = @_;
1353 # print "\nfound to compound : @$compoundlines and$sequence \n";
1354 my $noOfComps = @$compoundlines;
1355 # print "Number of elements in hash is $noOfComps\n";
1356 my @starts;
1357 my @ends;
1358 foreach my $line (@$compoundlines){
1359 # print "compoundify.. line = $line \n";
1360 chomp $line;
1361 my @fields = split(/\t/,$line);
1362 my $start = $fields[$startcord];
1363 my $end = $fields[$endcord];
1364 # print "start = $start, end = $end \n";
1365 push(@starts, $start);
1366 push(@ends,$end);
1367 }
1368 my @temp = @$compoundlines;
1369 my $startline=$temp[0];
1370 my @mields = split(/\t/,$startline);
1371 my $startcoord = $mields[$startcord];
1372 my $startgapsign=$mields[$endcord];
1373 my @startsorted = sort { $a <=> $b } @starts;
1374 my @endsorted = sort { $a <=> $b } @ends;
1375 my @intervals;
1376 for my $end (0 ... (scalar(@endsorted)-2)){
1377 my $interval = substr($sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1));
1378 push(@intervals,$interval);
1379 # print "interval = $interval =\n";
1380 # print "substr(sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1))\n";
1381 }
1382 push(@intervals,"");
1383 my $compoundmicrosat=();
1384 my $multiunit="";
1385 foreach my $line (@$compoundlines){
1386 my @fields = split(/\t/,$line);
1387 my $component="[".$fields[$microsatcord]."]".shift(@intervals);
1388 $compoundmicrosat=$compoundmicrosat.$component;
1389 $multiunit=$multiunit."[".$fields[$motifcord]."]";
1390 # print "multiunit = $multiunit\n";
1391 }
1392 my $compoundcopy = $compoundmicrosat;
1393 $compoundcopy =~ s/\[|\]//g;
1394 my $compoundlength = $mields[$startcord] + length($compoundcopy) - 1;
1395
1396
1397 my $compoundline = join("\t",(@mields[0 ... $infocord], "compound",@mields[$startcord ... $startcord+1],$compoundlength,$compoundmicrosat, $multiunit));
1398 return $compoundline;
1399 }
1400
1401 #------------------------------------------------------------------------------------------------
1402
1403 sub identityMatch_thresholdCheck{
1404 my $line1 = $_[0];
1405 my $line2 = $_[1];
1406 my $rank = $_[2];
1407 my @lields1 = split(/\t/,$line1);
1408 my @lields2 = split(/\t/,$line2);
1409 # print "recieved $line1 && $line2\n motif comparison: ", length($lields1[$motifcord])," : ",length($lields2[$motifcord]),"\n";
1410
1411 if (length($lields1[$motifcord]) == length($lields2[$motifcord])){
1412 my $probe = $lields1[$motifcord].$lields1[$motifcord];
1413 #print "$probe :: $lields2[$motifcord]\n";
1414 return "proceed" if $probe =~ /$lields2[$motifcord]/;
1415 #print "line recieved\n";
1416 if ($rank ==1){
1417 return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed";
1418 }
1419 else {
1420 return "proceed" if thresholdCheck($line2) eq "proceed";
1421 return "stop";
1422 }
1423 }
1424 else{
1425 if ($rank ==1){
1426 return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed";
1427 }
1428 else {
1429 return "proceed" if thresholdCheck($line2) eq "proceed";
1430 return "stop";
1431 }
1432 }
1433 return "stop";
1434 }
1435 #------------------------------------------------------------------------------------------------
1436
1437 sub thresholdCheck{
1438 my @checkthresholds=(0,@thresholds);
1439 #print "IN thresholdCheck: @_\n";
1440 my $line = $_[0];
1441 my @lields = split(/\t/,$line);
1442 return "proceed" if length($lields[$microsatcord]) >= $checkthresholds[length($lields[$motifcord])];
1443 return "stop";
1444 }
1445 #xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx
1446
1447
1448 #xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx
1449
1450 sub multiSpecies_filtering_interrupted_microsats{
1451 # print "IN multiSpecies_filtering_interrupted_microsats: @_\n";
1452 my $unfiltered = $_[0];
1453 my $filtered = $_[1];
1454 my $residue = $_[2];
1455 my $no_of_species = $_[5];
1456 open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!";
1457 open(FIL,">$filtered") or die "Cannot open file $filtered: $!";
1458 open(RES,">$residue") or die "Cannot open file $residue: $!";
1459
1460 $infocord = 2 + (4*$no_of_species) - 1;
1461 $startcord = 2 + (4*$no_of_species) + 2 - 1;
1462 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
1463 $endcord = 2 + (4*$no_of_species) + 4 - 1;
1464 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
1465 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
1466
1467
1468 my @sub_thresholds = (0);
1469
1470 push(@sub_thresholds, split(/_/,$_[3]));
1471 my @thresholds = (0);
1472
1473 push(@thresholds, split(/_/,$_[4]));
1474
1475 while (my $line = <UNF>) {
1476 next if $line !~ /[a-z]/;
1477 #print $line;
1478 chomp $line;
1479 my @fields = split(/\t/,$line);
1480 my $motif = $fields[$motifcord];
1481 my $realmotif = $motif;
1482 #print "motif = $motif\n";
1483 if ($motif =~ /^\[/){
1484 $motif =~ s/^\[//g;
1485 my @motifs = split(/\]/,$motif);
1486 $realmotif = $motifs[0];
1487 }
1488 # print "realmotif = $realmotif";
1489 my $motif_size = length($realmotif);
1490
1491 my $microsat = $fields[$microsatcord];
1492 # print "microsat = $microsat\n";
1493 $microsat =~ s/^\[|\]$//sg;
1494 my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat);
1495
1496 $microsat = join("",@microsats);
1497 if (length($microsat) < $thresholds[$motif_size]) {
1498 # print length($microsat)," < ",$thresholds[$motif_size],"\n";
1499 print RES $line,"\n"; next;
1500 }
1501 my @lengths = ();
1502 foreach my $mic (@microsats){
1503 push(@lengths, length($mic));
1504 }
1505 if (largest_microsat(@lengths) < $sub_thresholds[$motif_size]) {
1506 # print largest_microsat(@lengths)," < ",$sub_thresholds[$motif_size],"\n";
1507 print RES $line,"\n"; next;}
1508 else {print FIL $line,"\n"; next;
1509 }
1510 }
1511 close FIL;
1512 close RES;
1513
1514 }
1515
1516 sub largest_microsat{
1517 my $counter = 0;
1518 my($max) = shift(@_);
1519 foreach my $temp (@_) {
1520 #print "finding largest array: $maxcounter \n";
1521 if($temp > $max){
1522 $max = $temp;
1523 }
1524 }
1525 return($max);
1526 }
1527
1528 #xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx
1529
1530
1531 #xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx
1532 sub multiSpecies_compound_microsat_analyzer{
1533 ####### PARAMETER ########
1534 ##########################
1535
1536 my $input1 = $_[0]; ###### the *_sput_op4_ii file
1537 my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
1538 my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format
1539 my $output2 = $_[3]; ###### the pure compound microsatellites
1540 my $org = $_[4];
1541 my $no_of_species = $_[5];
1542 # print "IN multiSpecies_compound_microsat_analyzer: $input1\n $input2\n $output1\n $output2\n $org\n $no_of_species\n";
1543 $infocord = 2 + (4*$no_of_species) - 1;
1544 $typecord = 2 + (4*$no_of_species) + 1 - 1;
1545 $startcord = 2 + (4*$no_of_species) + 2 - 1;
1546 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
1547 $endcord = 2 + (4*$no_of_species) + 4 - 1;
1548 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
1549 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
1550
1551 open(IN,"<$input1") or die "Cannot open file $input1 $!";
1552 open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
1553
1554 open(OUT,">$output1") or die "Cannot open file $output1 $!";
1555 open(OUT2,">$output2") or die "Cannot open file $output2 $!";
1556
1557
1558 # print "opened files \n";
1559 my %micros = ();
1560 my $keycounter=0;
1561 my $linecounter=0;
1562 while (my $line = <IN>){
1563 $linecounter++;
1564 if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
1565 my $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12);
1566 push (@{$micros{$key}},$line);
1567 $keycounter++;
1568 }
1569 else{
1570 # print "no key\n";
1571 }
1572 }
1573 close IN;
1574 my @deletedlines = ();
1575 # print "done hash . linecounter=$linecounter, keycounter=$keycounter\n";
1576 #---------------------------------------------------------------------------------------------------
1577 # NOW READING THE SEQUENCE FILE
1578 my $keyfound=0;
1579 my $keyexists=0;
1580 my $inter=0;
1581 my $pure=0;
1582
1583 while(my $sine = <SEQ>){
1584 my %microstart=();
1585 my %microend=();
1586 my @sields = split(/\t/,$sine);
1587 my $key = 0;
1588 if ($sine =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
1589 $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12);
1590 # print $sine;
1591 # print $key;
1592 $keyfound++;
1593 }
1594 else{
1595
1596 }
1597 #<STDIN> if !defined $key;
1598
1599 if (exists $micros{$key}){
1600 $keyexists++;
1601 my @microstring = @{$micros{$key}};
1602
1603 my @filteredmicrostring;
1604
1605 foreach my $line (@microstring){
1606 chomp $line;
1607 my $copy_line = $line;
1608 my @fields = split(/\t/,$line);
1609 my $start = $fields[$startcord];
1610 my $end = $fields[$endcord];
1611 # FOR COMPOUND MICROSATELLITES
1612 if ($fields[$typecord] eq "compound"){
1613 $line = compound_microsat_analyser($line);
1614 if ($line eq "NULL") {
1615 print OUT2 "$copy_line\n";
1616 $pure++;
1617 next;
1618 }
1619 else{
1620 print OUT "$line\n";
1621 $inter++;
1622 next;
1623 }
1624 }
1625 }
1626
1627 } #if (exists $micros{$key}){
1628 }
1629 close OUT;
1630 close OUT2;
1631 # print "keyfound=$keyfound, keyexists=$keyexists, pure=$pure, inter=$inter\n";
1632 }
1633
1634 sub compound_microsat_analyser{
1635 my $line = $_[0];
1636 my @fields = split(/\t/,$line);
1637 my $motifline = $fields[$motifcord];
1638 my $microsat = $fields[$microsatcord];
1639 $motifline =~ s/^\[|\]$//g;
1640 $microsat =~ s/^\[|\]$//g;
1641 $microsat =~ s/-//g;
1642 my @interruptions = ();
1643 my @motields = split(/\]\[/,$motifline);
1644 my @microields = split(/\][a-zA-Z|-]*\[/,$microsat);
1645 my @inields = split(/[.*]/,$microsat);
1646 shift @inields;
1647 my @motifcount = scalar(@motields);
1648 my $prevmotif = $motields[0];
1649 my $prevmicro = $microields[0];
1650 my $prevphase = substr($microields[0],-(length($motields[0])),length($motields[0]));
1651 my $localflag = 'down';
1652 my @infoarray = ();
1653
1654 for my $l (1 ... (scalar(@motields)-1)){
1655 my $probe = $prevmotif.$prevmotif;
1656 if (length $prevmotif != length $motields[$l]) {$localflag = "up"; last;}
1657
1658 if ($probe =~ /$motields[$l]/i){
1659 my $curr_endphase = substr($microields[$l],-length($motields[$l]),length($motields[$l]));
1660 my $curr_startphase = substr($microields[$l],0,length($motields[$l]));
1661 if ($curr_startphase =~ /$prevphase/i) {
1662 $infoarray[$l-1] = "insertion";
1663 }
1664 else {
1665 $infoarray[$l-1] = "indel/substitution";
1666 }
1667
1668 $prevmotif = $motields[$l]; $prevmicro = $microields[$l]; $prevphase = $curr_endphase;
1669 next;
1670 }
1671 else {$localflag = "up"; last;}
1672 }
1673 if ($localflag eq 'up') {return "NULL";}
1674
1675 if (length($prevmotif) == 1) {$fields[$typecord] = "mononucleotide";}
1676 if (length($prevmotif) == 2) {$fields[$typecord] = "dinucleotide";}
1677 if (length($prevmotif) == 3) {$fields[$typecord] = "trinucleotide";}
1678 if (length($prevmotif) == 4) {$fields[$typecord] = "tetranucleotide";}
1679 if (length($prevmotif) == 5) {$fields[$typecord] = "pentanucleotide";}
1680
1681 @microields = split(/[\[|\]]/,$microsat);
1682 my @microsats = ();
1683 my @positions = ();
1684 my $lengthtracker = 0;
1685
1686 for my $i (0 ... (scalar(@microields ) - 1)){
1687 if ($i%2 == 0){
1688 push(@microsats,$microields[$i]);
1689 $lengthtracker = $lengthtracker + length($microields[$i]);
1690
1691 }
1692 else{
1693 push(@interruptions,$microields[$i]);
1694 push(@positions, $lengthtracker+1);
1695 $lengthtracker = $lengthtracker + length($microields[$i]);
1696 }
1697
1698 }
1699 my $returnline = join("\t",(join("\t",@fields),join(",",(@infoarray)),join(",",(@interruptions)),join(",",(@positions)),scalar(@interruptions)));
1700 return($returnline);
1701 }
1702
1703 #xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx
1704
1705
1706 #xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx
1707
1708 sub multiSpecies_compoundClarifyer{
1709 # print "IN multiSpecies_compoundClarifyer: @_\n";
1710 my $input1 = $_[0]; ###### the *_sput_compound
1711 my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
1712 my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format
1713 my $output2 = $_[3]; ###### compound file
1714 my $org = $_[4];
1715 my $no_of_species = $_[5];
1716 @thresholds = "0";
1717 push(@thresholds, split(/_/,$_[6]));
1718
1719
1720 $infocord = 2 + (4*$no_of_species) - 1;
1721 $typecord = 2 + (4*$no_of_species) + 1 - 1;
1722 $startcord = 2 + (4*$no_of_species) + 2 - 1;
1723 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
1724 $endcord = 2 + (4*$no_of_species) + 4 - 1;
1725 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
1726 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
1727 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
1728
1729 $interr_poscord = $motifcord + 3;
1730 $no_of_interruptionscord = $motifcord + 4;
1731 $interrcord = $motifcord + 2;
1732 $interrtypecord = $motifcord + 1;
1733
1734
1735 open(IN,"<$input1") or die "Cannot open file $input1 $!";
1736 open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
1737
1738 open(INT,">$output1") or die "Cannot open file $output2 $!";
1739 open(COMP,">$output2") or die "Cannot open file $output2 $!";
1740 #open(CH,">changed") or die "Cannot open file changed $!";
1741
1742 # print "opened files \n";
1743 my $linecounter = 0;
1744 my $microcounter = 0;
1745
1746 my %micros = ();
1747 while (my $line = <IN>){
1748 # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
1749 $linecounter++;
1750 if ($line =~ /($focalspec)\s+([0-9a-zA-Z_\-]+)\s+([0-9]+)\s+([0-9]+)/ ) {
1751 my $key = join("\t",$1, $2, $3, $4);
1752 # print $key, "#-#-#-#-#-#-#-#\n";
1753 # print "key = $key\n";
1754 push (@{$micros{$key}},$line);
1755 $microcounter++;
1756 }
1757 else {#print $line," key not made\n"; <STDIN>;
1758 }
1759 }
1760 # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
1761 close IN;
1762 my @deletedlines = ();
1763 # print "done hash \n";
1764 $linecounter = 0;
1765 #---------------------------------------------------------------------------------------------------
1766 # NOW READING THE SEQUENCE FILE
1767 my @microsat_types = qw(_ mononucleotide dinucleotide trinucleotide tetranucleotide);
1768 $printer = 0;
1769
1770 while(my $sine = <SEQ>){
1771 my %microstart=();
1772 my %microend=();
1773 my @sields = split(/\t/,$sine);
1774 my $key = ();
1775
1776 # print "sine = $sine. focalspec = $focalspec \n"; #<STDIN>;
1777
1778 if ($sine =~ /($focalspec)\s+([0-9a-zA-Z_\-]+)\s+([0-9]+)\s+([0-9]+)/ ) {
1779
1780 # if ($sine =~ /([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s/ ) {
1781 $key = join("\t",$1, $2, $3, $4);
1782 # print "key = $key\n";
1783 }
1784 else{
1785 # print "no key in $sine\nfor pattern ([a-z0-9A-Z]+) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) / \n";
1786 }
1787
1788 if (exists $micros{$key}){
1789 my @microstring = @{$micros{$key}};
1790 delete $micros{$key};
1791
1792 foreach my $line (@microstring){
1793 # print "#---------#---------#---------#---------#---------#---------#---------#---------\n" if $printer == 1;
1794 # print "microsat = $line" if $printer == 1;
1795 $linecounter++;
1796 my $copy_line = $line;
1797 my @mields = split(/\t/,$line);
1798 my @fields = @mields;
1799 my $start = $fields[$startcord];
1800 my $end = $fields[$endcord];
1801 my $microsat = $fields[$microsatcord];
1802 my $motifline = $fields[$motifcord];
1803 my $microsatcopy = $microsat;
1804 my $positioner = $microsat;
1805 $positioner =~ s/[a-zA-Z|-]/_/g;
1806 $microsatcopy =~ s/^\[|\]$//gs;
1807 chomp $microsatcopy;
1808 my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy);
1809 my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat);
1810 my $absolutstart = 1; my $absolutend = $absolutstart + ($end-$start);
1811 # print "absolut: start = $absolutstart, end = $absolutend\n" if $printer == 1;
1812 shift @inields;
1813 #print "inields =@inields<\n";
1814 $motifline =~ s/^\[|\]$//gs;
1815 chomp $motifline;
1816 #print "microsat = $microsat, its copy = $microsatcopy motifline = $motifline<\n";
1817 my @motields = split(/\]\[/,$motifline);
1818 my $seq = $microsatcopy;
1819 $seq =~ s/\[|\]//g;
1820 my $seqlen = length($seq);
1821 $seq = " ".$seq;
1822
1823 my $longestmotif_no = longest_array_element(@motields);
1824 my $shortestmotif_no = shortest_array_element(@motields);
1825 #print "shortest motif = $motields[$shortestmotif_no], longest motif = $motields[$longestmotif_no] \n";
1826
1827 my $search = $motields[$longestmotif_no].$motields[$longestmotif_no];
1828 if ((length($motields[$longestmotif_no]) == length($motields[$shortestmotif_no])) && ($search !~ /$motields[$shortestmotif_no]/) ){
1829 print COMP $line;
1830 next;
1831 }
1832
1833 my @shortestmotif_nos = ();
1834 for my $m (0 ... $#motields){
1835 push(@shortestmotif_nos, $m) if (length($motields[$m]) == length($motields[$shortestmotif_no]) );
1836 }
1837 ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------
1838 my $newleft =();
1839 my $leftstopper = 0; my $rightstopper = 0;
1840 foreach my $shortmotif_no (@shortestmotif_nos){
1841 next if $shortmotif_no == 0;
1842 my $last_left = $shortmotif_no; #$#motields;
1843 my $last_hitter = 0;
1844 for (my $i =($shortmotif_no-1); $i>=0; $i--){
1845 my $search = $motields[$shortmotif_no];
1846 if (length($motields[$shortmotif_no]) == 1){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;}
1847 if( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){
1848 $last_hitter = 1;
1849 $last_left = $i+1; last;
1850 }
1851 my $probe = $motields[$i];
1852 if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];}
1853
1854 if ($probe !~ /$search/){
1855 $last_hitter = 1;
1856 $last_left = $i+1;
1857 # print "hit the last match: before $microields[$i]..last left = $last_left.. exiting.\n";
1858 last;
1859 }
1860 $last_left--;$last_hitter = 1;
1861 # print "passed tests, last left = $last_left\n";
1862 }
1863 # print "comparing whether $last_left < $shortmotif_no, lasthit = $last_hitter\n";
1864 if (($last_left) < $shortmotif_no && $last_hitter == 1) {$leftstopper=0; last;}
1865 else {$leftstopper = 1;
1866 # print "leftstopper = 1\n";
1867 }
1868 }
1869
1870 ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------
1871 my $newright =();
1872 foreach my $shortmotif_no (@shortestmotif_nos){
1873 next if $shortmotif_no == $#motields;
1874 my $last_right = $shortmotif_no;# -1;
1875 for my $i ($shortmotif_no+1 ... $#motields){
1876 my $search = $motields[$shortmotif_no];
1877 if (length($motields[$shortmotif_no]) == 1 ){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;}
1878 if ( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){
1879 $last_right = $i-1; last;
1880 }
1881 my $probe = $motields[$i];
1882 if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];}
1883 if ( $probe !~ /$search/){
1884 $last_right = $i-1; last;
1885 }
1886 $last_right++;
1887 }
1888 if (($last_right) > $shortmotif_no) {$rightstopper=0; last;# print "rightstopper = 0\n";
1889 }
1890 else {$rightstopper = 1;
1891 }
1892 }
1893
1894
1895 if ($rightstopper == 1 && $leftstopper == 1){
1896 print COMP $line;
1897 # print "rightstopper == 1 && leftstopper == 1\n" if $printer == 1;
1898 next;
1899 }
1900
1901 # print "pased initial testing phase \n" if $printer == 1;
1902 my @outputs = ();
1903 my @orig_starts = ();
1904 my @orig_ends = ();
1905 for my $mic (0 ... $#microields){
1906 my $miclen = length($microields[$mic]);
1907 my $microleftlen = 0;
1908 #print "\nmic = $mic\n";
1909 if($mic > 0){
1910 for my $submin (0 ... $mic-1){
1911 my $interval = ();
1912 if (!exists $inields[$submin]) {$interval = "";}
1913 else {$interval = $inields[$submin];}
1914 #print "inield =$interval< and microield =$microields[$submin]<\n ";
1915 $microleftlen = $microleftlen + length($microields[$submin]) + length($interval);
1916 }
1917 }
1918 push(@orig_starts,($microleftlen+1));
1919 push(@orig_ends, ($microleftlen+1 + $miclen -1));
1920 }
1921
1922 ############# F I N A L L Y S T U D Y I N G S E Q U E N C E S #########@@@@#########@@@@#########@@@@#########@@@@#########@@@@
1923
1924
1925 for my $mic (0 ... $#microields){
1926 my $miclen = length($microields[$mic]);
1927 my $microleftlen = 0;
1928 if($mic > 0){
1929 for my $submin (0 ... $mic-1){
1930 # if(!exists $inields[$submin]) {$inields[$submin] = "";}
1931 my $interval = ();
1932 if (!exists $inields[$submin]) {$interval = "";}
1933 else {$interval = $inields[$submin];}
1934 #print "inield =$interval< and microield =$microields[$submin]<\n ";
1935 $microleftlen = $microleftlen + length($microields[$submin]) + length($interval);
1936 }
1937 }
1938 $fields[$startcord] = $microleftlen+1;
1939 $fields[$endcord] = $fields[$startcord] + $miclen -1;
1940 $fields[$typecord] = $microsat_types[length($motields[$mic])];
1941 $fields[$microsatcord] = $microields[$mic];
1942 $fields[$motifcord] = $motields[$mic];
1943 my $templine = join("\t", (@fields[0 .. $motifcord]) );
1944 my $orig_templine = join("\t", (@fields[0 .. $motifcord]) );
1945 my $newline;
1946 my $lefter = 1; my $righter = 1;
1947 if ( $fields[$startcord] < 2){$lefter = 0;}
1948 if ($fields[$endcord] == $seqlen){$righter = 0;}
1949
1950 while($lefter == 1){
1951 $newline = left_extender($templine, $seq,$org);
1952 # print "returned line from left extender= $newline \n" if $printer == 1;
1953 if ($newline eq $templine){$templine = $newline; last;}
1954 else {$templine = $newline;}
1955
1956 if (left_extention_permission_giver($templine) eq "no") {last;}
1957 }
1958 while($righter == 1){
1959 $newline = right_extender($templine, $seq,$org);
1960 # print "returned line from right extender= $newline \n" if $printer == 1;
1961 if ($newline eq $templine){$templine = $newline; last;}
1962 else {$templine = $newline;}
1963 if (right_extention_permission_giver($templine) eq "no") {last;}
1964 }
1965 my @tempfields = split(/\t/,$templine);
1966 $tempfields[$microsatcord] =~ s/\]|\[//g;
1967 $tempfields[$motifcord] =~ s/^\[|\]$//gs;
1968 my @tempmotields = split(/\]\[/,$tempfields[$motifcord]);
1969
1970 if (scalar(@tempmotields) == 1 && $templine eq $orig_templine) {
1971 # print "scalar ( tempmotields) = 1\n" if $printer == 1;
1972 next;
1973 }
1974 my $prevmotif = shift(@tempmotields);
1975 my $stopper = 0;
1976
1977 foreach my $tempmot (@tempmotields){
1978 if (length($tempmot) != length($prevmotif)) {$stopper = 1; last;}
1979 my $search = $prevmotif.$prevmotif;
1980 if ($search !~ /$tempmot/) {$stopper = 1; last;}
1981 $prevmotif = $tempmot;
1982 }
1983 if ( $stopper == 1) {
1984 # print "length tempmot != length prevmotif\n" if $printer == 1;
1985 next;
1986 }
1987 my $lastend = 0;
1988 #----------------------------------------------------------
1989 my $left_captured = (); my $right_captured = ();
1990 my $left_bp = (); my $right_bp = ();
1991 # print "new startcord = $tempfields[$startcord] , new endcord = $tempfields[$endcord].. orig strts = @orig_starts and orig ends = @orig_ends\n";
1992 for my $o (0 ... $#orig_starts){
1993 # print "we are talking abut tempstart:$tempfields[$startcord] >= origstart:$lastend && tempstart:$tempfields[$startcord] <= origend: $orig_ends[$o] \n" if $printer == 1;
1994 # print "we are talking abut tempend:$tempfields[$endcord] >= origstart:$lastend && tempstart:$tempfields[$endcord] >= origend: $orig_ends[$o] \n" if $printer == 1;
1995
1996 if (($tempfields[$startcord] > $lastend) && ($tempfields[$startcord] <= $orig_ends[$o])){ # && ($tempfields[$startcord] != $fields[$startcord])
1997 # print "motif captured on left is $microields[$o] from $microsat\n" if $printer == 1;
1998 $left_captured = $o;
1999 $left_bp = $orig_ends[$o] - $tempfields[$startcord] + 1;
2000 }
2001 elsif ($tempfields[$endcord] > $lastend && $tempfields[$endcord] <= $orig_ends[$o]){ #&& $tempfields[$endcord] != $fields[$endcord])
2002 # print "motif captured on right is $microields[$o] from $microsat\n" if $printer == 1;
2003 $right_captured = $o;
2004 $right_bp = $tempfields[$endcord] - $orig_starts[$o] + 1;
2005 }
2006 $lastend = $orig_ends[$o]
2007 }
2008 # print "leftcaptured = $left_captured, right = $right_captured\n" if $printer==1;
2009 my $leftmotif = (); my $left_trashed = ();
2010 if ($tempfields[$startcord] != $fields[$startcord]) {
2011 $leftmotif = $motields[$left_captured];
2012 # print "$left_captured in @microields: $motields[$left_captured]\n" if $printer == 1;
2013 if ( $left_captured !~ /[0-9]+/) {#print $line,"\n", $templine,"\n";
2014 }
2015 $left_trashed = length($microields[$left_captured]) - $left_bp;
2016 }
2017 my $rightmotif = (); my $right_trashed = ();
2018 if ($tempfields[$endcord] != $fields[$endcord]) {
2019 # print "$right_captured in @microields: $motields[$right_captured]\n" if $printer == 1;
2020 $rightmotif = $motields[$right_captured];
2021 $right_trashed = length($microields[$right_captured]) - $right_bp;
2022 }
2023
2024 ########## P A R A M S #####################@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@
2025 $stopper = 0;
2026 my $deletioner = 0;
2027 #if($tempfields[$startcord] != $fields[$startcord]){
2028 # print "enter left: tempfields,startcord : $tempfields[$startcord] != $absolutstart && left_captured: $left_captured != 0 \n" if $printer==1;
2029 if ($left_captured != 0){
2030 # print "at line 370, going: 0 ... $left_captured-1 \n" if $printer == 1;
2031 for my $e (0 ... $left_captured-1){
2032 if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e]) )){
2033 # print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1;
2034 $deletioner++; last;
2035 }
2036 if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){
2037 # print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1;
2038 $deletioner++; last;
2039 }
2040 if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){
2041 # print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1;
2042 $deletioner++; last;
2043 }
2044 }
2045 }
2046 #}
2047 # print "after left search, deletioner = $deletioner\n" if $printer == 1;
2048 if ($deletioner >= 1) {
2049 # print "deletioner = $deletioner\n" if $printer == 1;
2050 next;
2051 }
2052
2053 $deletioner = 0;
2054
2055 #if($tempfields[$endcord] != $fields[$endcord]){
2056 # print "if tempfields endcord: $tempfields[$endcord] != absolutend: $absolutend\n and $right_captured != $#microields\n" if $printer==1;
2057 if ($right_captured != $#microields){
2058 # print "at line 394, going: $right_captured+1 ... $#microields \n" if $printer == 1;
2059 for my $e ($right_captured+1 ... $#microields){
2060 if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e])) ){
2061 # print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1;
2062 $deletioner++; last;
2063 }
2064 if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){
2065 # print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1;
2066 $deletioner++; last;
2067 }
2068 if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){
2069 # print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1;
2070 $deletioner++; last;
2071 }
2072 }
2073 }
2074 #}
2075 # print "deletioner = $deletioner\n" if $printer == 1;
2076 if ($deletioner >= 1) {
2077 next;
2078 }
2079 my $leftMotifs_notCaptured = ();
2080 my $rightMotifs_notCaptured = ();
2081
2082 if ($tempfields[$startcord] != $fields[$startcord] ){
2083 #print "in left params: (length($leftmotif) == 1 && $tempfields[$startcord] != $fields[$startcord]) ... and .... $left_trashed > (1.5* length($leftmotif]) && ($tempfields[$startcord] != $fields[$startcord])\n";
2084 if (length($leftmotif) == 1 && $left_trashed > 3){
2085 # print "invaded left motif is long mononucleotide" if $printer == 1;
2086 next;
2087
2088 }
2089 elsif ((length($leftmotif) != 1 && $left_trashed > ( thrashallow($leftmotif)) && ($tempfields[$startcord] != $fields[$startcord]) ) ){
2090 # print "invaded left motif too long" if $printer == 1;
2091 next;
2092 }
2093 }
2094 if ($tempfields[$endcord] != $fields[$endcord] ){
2095 #print "in right params: after $tempfields[$endcord] != $fields[$endcord] ..... (length($rightmotif)==1 && $tempfields[$endcord] != $fields[$endcord]) ... and ... $right_trashed > (1.5* length($rightmotif))\n";
2096 if (length($rightmotif)==1 && $right_trashed){
2097 # print "invaded right motif is long mononucleotide" if $printer == 1;
2098 next;
2099
2100 }
2101 elsif (length($rightmotif) !=1 && ($right_trashed > ( thrashallow($rightmotif)) && $tempfields[$endcord] != $fields[$endcord])){
2102 # print "invaded right motif too long" if $printer == 1;
2103 next;
2104
2105 }
2106 }
2107 push @outputs, $templine;
2108 }
2109 if (scalar(@outputs) == 0){ print COMP $line; next;}
2110 # print "outputs are:", join("\n",@outputs),"\n";
2111 if (scalar(@outputs) == 1){
2112 my @oields = split(/\t/,$outputs[0]);
2113 my $start = $oields[$startcord]+$mields[$startcord]-1;
2114 my $end = $start+($oields[$endcord]-$oields[$startcord]);
2115 $oields[$startcord] = $start; $oields[$endcord] = $end;
2116 print INT join("\t",@oields), "\n";
2117 # print CH $line,;
2118 }
2119 if (scalar(@outputs) > 1){
2120 my $motif_min = 10;
2121 my $chosen_one = $outputs[0];
2122 foreach my $micro (@outputs){
2123 my @oields = split(/\t/,$micro);
2124 my $tempmotif = $oields[$motifcord];
2125 $tempmotif =~ s/^\[|\]$//gs;
2126 my @omots = split(/\]\[/, $tempmotif);
2127 # print "motif_min = $motif_min, current motif = $tempmotif\n";
2128 my $start = $oields[$startcord]+$mields[$startcord]-1;
2129 my $end = $start+($oields[$endcord]-$oields[$startcord]);
2130 $oields[$startcord] = $start; $oields[$endcord] = $end;
2131 if(length($omots[0]) < $motif_min) {
2132 $chosen_one = join("\t",@oields);
2133 $motif_min = length($omots[0]);
2134 }
2135 }
2136 print INT $chosen_one, "\n";
2137 # print "chosen one is ".$chosen_one, "\n";
2138 # print CH $line;
2139
2140
2141 }
2142
2143 }
2144
2145 } #if (exists $micros{$key}){
2146 else{
2147 }
2148 }
2149 close INT;
2150 close COMP;
2151 }
2152 sub left_extender{
2153 #print "left extender\n";
2154 my ($line, $seq, $org) = @_;
2155 # print "in left extender... line passed = $line and sequence is $seq\n";
2156 chomp $line;
2157 my @fields = split(/\t/,$line);
2158 my $rstart = $fields[$startcord];
2159 my $microsat = $fields[$microsatcord];
2160 $microsat =~ s/\[|\]//g;
2161 my $rend = $rstart + length($microsat)-1;
2162 $microsat =~ s/-//g;
2163 my $motif = $fields[$motifcord];
2164 my $firstmotif = ();
2165
2166 if ($motif =~ /^\[/){
2167 $motif =~ s/^\[//g;
2168 $motif =~ /([a-zA-Z]+)\].*/;
2169 $firstmotif = $1;
2170 }
2171 else {$firstmotif = $motif;}
2172
2173 #print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n";
2174 my $leftphase = substr($microsat, 0,length($firstmotif));
2175 my $phaser = $leftphase.$leftphase;
2176 my @phase = split(/\s*/,$leftphase);
2177 my @phases;
2178 my @copy_phases = @phases;
2179 my $crawler=0;
2180 for (0 ... (length($leftphase)-1)){
2181 push(@phases, substr($phaser, $crawler, length($leftphase)));
2182 $crawler++;
2183 }
2184
2185 my $start = $rstart;
2186 my $end = $rend;
2187
2188 my $leftseq = substr($seq, 0, $start);
2189 # print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n";
2190 my @extentions = ();
2191 my @trappeds = ();
2192 my @intervalposs = ();
2193 my @trappedposs = ();
2194 my @trappedphases = ();
2195 my @intervals = ();
2196 my $firstmotif_length = length($firstmotif);
2197 foreach my $phase (@phases){
2198 # print "left phase\t",substr($leftseq, -10),"\t$phase\n";
2199 # print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n";
2200 if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){
2201 # print "in left pattern\n";
2202 my $trapped = $1;
2203 my $trappedpos = length($leftseq)-length($trapped);
2204 my $interval = $3;
2205 my $intervalpos = index($trapped, $interval) + 1;
2206 # print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n";
2207
2208 my $extention = substr($trapped, 0, length($trapped)-length($interval));
2209 my $leftpeep = substr($seq, 0, ($start-length($trapped)));
2210 my @passed_overhangs;
2211
2212 for my $i (1 ... length($phase)-1){
2213 my $overhang = substr($phase, -length($phase)+$i);
2214 # print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n";
2215 #TEMPORARY... BETTER METHOD NEEDED
2216 $leftpeep =~ s/-//g;
2217 if ($leftpeep =~ /$overhang$/i){
2218 push(@passed_overhangs,$overhang);
2219 # print "l overhang\n";
2220 }
2221 }
2222
2223 if(scalar(@passed_overhangs)>0){
2224 my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)];
2225 $extention = $overhang.$extention;
2226 $trapped = $overhang.$trapped;
2227 #print "trapped extended to $trapped \n";
2228 $trappedpos = length($leftseq)-length($trapped);
2229 }
2230
2231 push(@extentions,$extention);
2232 # print "extentions = @extentions \n";
2233
2234 push(@trappeds,$trapped );
2235 push(@intervalposs,length($extention)+1);
2236 push(@trappedposs, $trappedpos);
2237 # print "trappeds = @trappeds\n";
2238 push(@trappedphases, substr($extention,0,length($phase)));
2239 push(@intervals, $interval);
2240 }
2241 }
2242 if (scalar(@trappeds == 0)) {return $line;}
2243
2244 my $nikaal = shortest_array_element(@intervals);
2245
2246 if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
2247 $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord];
2248 ##print "new fields 9 = $fields[9]\n";
2249 $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]);
2250
2251 if($fields[$microsatcord] !~ /^\[/i){
2252 $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
2253 }
2254
2255 $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord];
2256
2257 if (exists ($fields[$motifcord+1])){
2258 $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1];
2259 }
2260 else{$fields[$motifcord+1] = "indel/deletion";}
2261 ##print "new fields 14 = $fields[14]\n";
2262
2263 if (exists ($fields[$motifcord+2])){
2264 $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2];
2265 }
2266 else{$fields[$motifcord+2] = $intervals[$nikaal];}
2267 my @seventeen=();
2268 if (exists ($fields[$motifcord+3])){
2269 @seventeen = split(/,/,$fields[$motifcord+3]);
2270 # #print "scalarseventeen =@seventeen<-\n";
2271 for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);}
2272 $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen);
2273 $fields[$motifcord+4] = $fields[$motifcord+4]+1;
2274 }
2275
2276 else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1}
2277
2278 ##print "new fields 16 = $fields[16]\n";
2279 ##print "new fields 17 = $fields[17]\n";
2280
2281
2282 my $returnline = join("\t",@fields);
2283 my $pastline = $returnline;
2284 if ($fields[$microsatcord] =~ /\[/){
2285 $returnline = multiSpecies_compoundClarifyer_merge($returnline);
2286 }
2287 return $returnline;
2288 }
2289 sub right_extender{
2290 my ($line, $seq, $org) = @_;
2291 chomp $line;
2292 my @fields = split(/\t/,$line);
2293 my $rstart = $fields[$startcord];
2294 my $microsat = $fields[$microsatcord];
2295 $microsat =~ s/\[|\]//g;
2296 my $rend = $rstart + length($microsat)-1;
2297 $microsat =~ s/-//g;
2298 my $motif = $fields[$motifcord];
2299 my $temp_lastmotif = ();
2300
2301 if ($motif =~ /\]$/s){
2302 $motif =~ s/\]$//sg;
2303 $motif =~ /.*\[([a-zA-Z]+)/;
2304 $temp_lastmotif = $1;
2305 }
2306 else {$temp_lastmotif = $motif;}
2307 my $lastmotif = substr($microsat,-length($temp_lastmotif));
2308 ##print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n";
2309 my $rightphase = substr($microsat, -length($lastmotif));
2310 my $phaser = $rightphase.$rightphase;
2311 my @phase = split(/\s*/,$rightphase);
2312 my @phases;
2313 my @copy_phases = @phases;
2314 my $crawler=0;
2315 for (0 ... (length($rightphase)-1)){
2316 push(@phases, substr($phaser, $crawler, length($rightphase)));
2317 $crawler++;
2318 }
2319
2320 my $start = $rstart;
2321 my $end = $rend;
2322
2323 my $rightseq = substr($seq, $end+1);
2324 my @extentions = ();
2325 my @trappeds = ();
2326 my @intervalposs = ();
2327 my @trappedposs = ();
2328 my @trappedphases = ();
2329 my @intervals = ();
2330 my $lastmotif_length = length($lastmotif);
2331 foreach my $phase (@phases){
2332 if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){
2333 my $trapped = $1;
2334 my $trappedpos = $end+1;
2335 my $interval = $2;
2336 my $intervalpos = index($trapped, $interval) + 1;
2337
2338 my $extention = substr($trapped, length($interval));
2339 my $rightpeep = substr($seq, ($end+length($trapped))+1);
2340 my @passed_overhangs = "";
2341
2342 #TEMPORARY... BETTER METHOD NEEDED
2343 $rightpeep =~ s/-//g;
2344
2345 for my $i (1 ... length($phase)-1){
2346 my $overhang = substr($phase,0, $i);
2347 # #print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n";
2348 if ($rightpeep =~ /^$overhang/i){
2349 push(@passed_overhangs, $overhang);
2350 # #print "r overhang\n";
2351 }
2352 }
2353 if (scalar(@passed_overhangs) > 0){
2354 my $overhang = @passed_overhangs[longest_array_element(@passed_overhangs)];
2355 $extention = $extention.$overhang;
2356 $trapped = $trapped.$overhang;
2357 # #print "trapped extended to $trapped \n";
2358 }
2359
2360 push(@extentions,$extention);
2361 ##print "extentions = @extentions \n";
2362
2363 push(@trappeds,$trapped );
2364 push(@intervalposs,$intervalpos);
2365 push(@trappedposs, $trappedpos);
2366 # #print "trappeds = @trappeds\n";
2367 push(@trappedphases, substr($extention,0,length($phase)));
2368 push(@intervals, $interval);
2369 }
2370 }
2371 if (scalar(@trappeds == 0)) {return $line;}
2372
2373 # my $nikaal = longest_array_element(@trappeds);
2374 my $nikaal = shortest_array_element(@intervals);
2375
2376 # #print "longest element found = $nikaal \n";
2377
2378 if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
2379 $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]";
2380 ##print "new fields 9 = $fields[9]";
2381 $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]);
2382
2383 ##print "new fields 11 = $fields[11]\n";
2384
2385 if($fields[$microsatcord] !~ /^\[/i){
2386 $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
2387 }
2388
2389 $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]";
2390 ##print "new fields 12 = $fields[12]\n";
2391
2392 ##print "scalar of fields = ",scalar(@fields),"\n";
2393 if (exists ($fields[$motifcord+1])){
2394 # print " print fields = @fields.. scalar=", scalar(@fields),".. motifcord+1 = $motifcord + 1 \n " if !exists $fields[$motifcord+1];
2395 # <STDIN> if !exists $fields[$motifcord+1];
2396 $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion";
2397 }
2398 else{$fields[$motifcord+1] = "indel/deletion";}
2399 ##print "new fields 14 = $fields[14]\n";
2400
2401 if (exists ($fields[$motifcord+2])){
2402 $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal];
2403 }
2404 else{$fields[$motifcord+2] = $intervals[$nikaal];}
2405 ##print "new fields 15 = $fields[15]\n";
2406
2407 my @seventeen=();
2408 if (exists ($fields[$motifcord+3])){
2409 ##print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n";
2410 # print " print fields = @fields\n " if !exists $fields[$motifcord+3];
2411 <STDIN> if !exists $fields[$motifcord+3];
2412 my $currpos = length($microsat)+$intervalposs[$nikaal];
2413 $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos;
2414 $fields[$motifcord+4] = $fields[$motifcord+4]+1;
2415
2416 }
2417
2418 else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1}
2419
2420 ##print "new fields 16 = $fields[16]\n";
2421
2422 ##print "new fields 17 = $fields[17]\n";
2423 my $returnline = join("\t",@fields);
2424 my $pastline = $returnline;
2425 if ($fields[$microsatcord] =~ /\[/){
2426 $returnline = multiSpecies_compoundClarifyer_merge($returnline);
2427 }
2428 #print "finally right-extended line = ",$returnline,"\n";
2429 return $returnline;
2430 }
2431 sub longest_array_element{
2432 my $counter = 0;
2433 my($max) = shift(@_);
2434 my $maxcounter = 0;
2435 foreach my $temp (@_) {
2436 $counter++;
2437 #print "finding largest array: $maxcounter \n" if $prinkter == 1;
2438 if(length($temp) > length($max)){
2439 $max = $temp;
2440 $maxcounter = $counter;
2441 }
2442 }
2443 return($maxcounter);
2444 }
2445 sub shortest_array_element{
2446 my $counter = 0;
2447 my($min) = shift(@_);
2448 my $mincounter = 0;
2449 foreach my $temp (@_) {
2450 $counter++;
2451 #print "finding largest array: $mincounter \n" if $prinkter == 1;
2452 if(length($temp) < length($min)){
2453 $min = $temp;
2454 $mincounter = $counter;
2455 }
2456 }
2457 return($mincounter);
2458 }
2459
2460
2461 sub left_extention_permission_giver{
2462 my @fields = split(/\t/,$_[0]);
2463 my $microsat = $fields[$microsatcord];
2464 $microsat =~ s/(^\[)|-//g;
2465 my $motif = $fields[$motifcord];
2466 my $firstmotif = ();
2467 my $firststretch = ();
2468 my @stretches=();
2469 if ($motif =~ /^\[/){
2470 $motif =~ s/^\[//g;
2471 $motif =~ /([a-zA-Z]+)\].*/;
2472 $firstmotif = $1;
2473 @stretches = split(/\]/,$microsat);
2474 $firststretch = $stretches[0];
2475 ##print "firststretch = $firststretch\n";
2476 }
2477 else {$firstmotif = $motif;$firststretch = $microsat;}
2478
2479 if (length($firststretch) < $thresholds[length($firstmotif)]){
2480 return "no";
2481 }
2482 else {return "yes";}
2483
2484 }
2485 sub right_extention_permission_giver{
2486 my @fields = split(/\t/,$_[0]);
2487 my $microsat = $fields[$microsatcord];
2488 $microsat =~ s/-|(\]$)//sg;
2489 my $motif = $fields[$motifcord];
2490 my $temp_lastmotif = ();
2491 my $laststretch = ();
2492 my @stretches=();
2493
2494
2495 if ($motif =~ /\]/){
2496 $motif =~ s/\]$//gs;
2497 $motif =~ /.*\[([a-zA-Z]+)$/;
2498 $temp_lastmotif = $1;
2499 @stretches = split(/\[/,$microsat);
2500 $laststretch = pop(@stretches);
2501 ##print "last stretch = $laststretch\n";
2502 }
2503 else {$temp_lastmotif = $motif; $laststretch = $microsat;}
2504
2505 if (length($laststretch) < $thresholds[length($temp_lastmotif)]){
2506 return "no";
2507 }
2508 else { return "yes";}
2509
2510
2511 }
2512 sub multiSpecies_compoundClarifyer_merge{
2513 my $line = $_[0];
2514 #print "sent for mering: $line \n";
2515 my @mields = split(/\t/,$line);
2516 my @fields = @mields;
2517 my $microsat = $fields[$microsatcord];
2518 my $motifline = $fields[$motifcord];
2519 my $microsatcopy = $microsat;
2520 $microsatcopy =~ s/^\[|\]$//sg;
2521 my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy);
2522 my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat);
2523 shift @inields;
2524 #print "inields =@inields<\n";
2525 $motifline =~ s/^\[|\]$//sg;
2526 my @motields = split(/\]\[/,$motifline);
2527 my @firstmotifs = ();
2528 my @lastmotifs = ();
2529 for my $i (0 ... $#microields){
2530 $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i]));
2531 $lastmotifs[$i] = substr($microields[$i],-length($motields[$i]));
2532 }
2533 #print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n";
2534 my @mergelist = ();
2535 my @inter_poses = split(/,/,$fields[$interr_poscord]);
2536 my $no_of_interruptions = $fields[$no_of_interruptionscord];
2537 my @interruptions = split(/,/,$fields[$interrcord]);
2538 my @interrtypes = split(/,/,$fields[$interrtypecord]);
2539 my $stopper = 0;
2540 for my $i (0 ... $#motields-1){
2541 #print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n";
2542 if (($lastmotifs[$i] eq $firstmotifs[$i+1]) && !exists $inields[$i]){
2543 $stopper = 1;
2544 push(@mergelist, ($i)."_".($i+1));
2545 }
2546 }
2547
2548 return $line if scalar(@mergelist) == 0;
2549
2550 foreach my $merging (@mergelist){
2551 my @sets = split(/_/, $merging);
2552 my @tempmicro = ();
2553 my @tempmot = ();
2554 for my $i (0 ... $sets[0]-1){
2555 push(@tempmicro, "[".$microields[$i]."]");
2556 push(@tempmicro, $inields[$i]);
2557 push(@tempmot, "[".$motields[$i]."]");
2558 #print "adding pre-motifs number $i\n";
2559 }
2560 my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]";
2561 push (@tempmicro, $pusher);
2562 push(@tempmot, "[".$motields[$sets[0]]."]");
2563 my $outcoming = -2;
2564 for my $i ($sets[1]+1 ... $#microields-1){
2565 push(@tempmicro, "[".$microields[$i]."]");
2566 push(@tempmicro, $inields[$i]);
2567 push(@tempmot, "[".$motields[$i]."]");
2568 #print "adding post-motifs number $i\n";
2569 $outcoming = $i;
2570 }
2571 if ($outcoming != -2){
2572 #print "outcoming = $outcoming \n";
2573 push(@tempmicro, "[".$microields[$outcoming+1 ]."]");
2574 push(@tempmot,"[". $motields[$outcoming+1]."]");
2575 }
2576 $fields[$microsatcord] = join("",@tempmicro);
2577 $fields[$motifcord] = join("",@tempmot);
2578
2579 splice(@interrtypes, $sets[0], 1);
2580 $fields[$interrtypecord] = join(",",@interrtypes);
2581 splice(@interruptions, $sets[0], 1);
2582 $fields[$interrcord] = join(",",@interruptions);
2583 splice(@inter_poses, $sets[0], 1);
2584 $fields[$interr_poscord] = join(",",@inter_poses);
2585 $no_of_interruptions = $no_of_interruptions - 1;
2586 }
2587
2588 if ($no_of_interruptions == 0){
2589 $fields[$microsatcord] =~ s/^\[|\]$//sg;
2590 $fields[$motifcord] =~ s/^\[|\]$//sg;
2591 $line = join("\t", @fields[0 ... $motifcord]);
2592 }
2593 else{
2594 $line = join("\t", @fields);
2595 }
2596 return $line;
2597 }
2598
2599 sub thrashallow{
2600 my $motif = $_[0];
2601 return 4 if length($motif) == 2;
2602 return 6 if length($motif) == 3;
2603 return 8 if length($motif) == 4;
2604
2605 }
2606
2607 #xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx
2608
2609
2610 #xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx
2611 sub multispecies_filtering_compound_microsats{
2612 my $unfiltered = $_[0];
2613 my $filtered = $_[1];
2614 my $residue = $_[2];
2615 my $no_of_species = $_[5];
2616 open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!";
2617 open(FIL,">$filtered") or die "Cannot open file $filtered: $!";
2618 open(RES,">$residue") or die "Cannot open file $residue: $!";
2619
2620 $infocord = 2 + (4*$no_of_species) - 1;
2621 $startcord = 2 + (4*$no_of_species) + 2 - 1;
2622 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
2623 $endcord = 2 + (4*$no_of_species) + 4 - 1;
2624 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
2625 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
2626
2627 my @sub_thresholds = ("0");
2628 push(@sub_thresholds, split(/_/,$_[3]));
2629 my @thresholds = ("0");
2630 push(@thresholds, split(/_/,$_[4]));
2631
2632 while (my $line = <UNF>) {
2633 if ($line !~ /compound/){
2634 print FIL $line,"\n"; next;
2635 }
2636 chomp $line;
2637 my @fields = split(/\t/,$line);
2638 my $motifline = $fields[$motifcord];
2639 $motifline =~ s/^\[|\]$//g;
2640 my @motifs = split(/\]\[/,$motifline);
2641 my $microsat = $fields[$microsatcord];
2642 $microsat =~ s/^\[|\]$|-//g;
2643 my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat);
2644
2645 my $stopper = 0;
2646 for my $i (0 ... $#motifs){
2647 my @common = ();
2648 my $probe = $motifs[$i].$motifs[$i];
2649 my $motif_size = length($motifs[$i]);
2650
2651 for my $j (0 ... $#motifs){
2652 next if length($motifs[$i]) != length($motifs[$j]);
2653 push(@common, length($microsats[$j])) if $probe =~ /$motifs[$j]/i;
2654 }
2655
2656 if (largest_microsat(@common) < $sub_thresholds[$motif_size]) {$stopper = 1; last;}
2657 else {next;}
2658 }
2659
2660 if ($stopper == 1){
2661 print RES $line,"\n";
2662 }
2663 else { print FIL $line,"\n"; }
2664 }
2665 close FIL;
2666 close RES;
2667 }
2668
2669 #xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx
2670
2671
2672 #xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx
2673
2674 sub chromosome_unrand_breaker{
2675 # print "IN chromosome_unrand_breaker: @_\n ";
2676 my $input1 = $_[0]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
2677 my $dir = $_[1]; ###### directory where subsets are put
2678 my $output2 = $_[2]; ###### list of subset files
2679 my $increment = $_[3];
2680 my $info = $_[4];
2681 my $chr = $_[5];
2682 open(SEQ,"<$input1") or die "Cannot open file $input1 $!";
2683
2684 open(OUT,">$output2") or die "Cannot open file $output2 $!";
2685
2686 #---------------------------------------------------------------------------------------------------
2687 # NOW READING THE SEQUENCE FILE
2688
2689 my $seed = 0;
2690 my $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment);
2691 print OUT $subset,"\n";
2692 open(SUB,">$subset");
2693
2694 while(my $sine = <SEQ>){
2695 $seed++;
2696 print SUB $sine;
2697
2698 if ($seed%$increment == 0 ){
2699 close SUB;
2700 $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment);
2701 open(SUB,">$subset");
2702 print SUB $sine;
2703 print OUT $subset,"\n";
2704 # print $subset,"\n";
2705 }
2706 }
2707 close OUT;
2708 close SUB;
2709 }
2710 #xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx
2711
2712
2713 #xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx
2714 sub multiSpecies_interruptedMicrosatHunter{
2715 # print "IN multiSpecies_interruptedMicrosatHunter: @_\n";
2716 my $input1 = $_[0]; ###### the *_sput_op4_ii file
2717 my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
2718 my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format
2719 my $output2 = $_[3]; ###### uninterrupted microsatellite file
2720 my $org = $_[4];
2721 my $no_of_species = $_[5];
2722
2723 my @thresholds = "0";
2724 push(@thresholds, split(/_/,$_[6]));
2725
2726 # print "thresholds = @thresholds \n";
2727 $infocord = 2 + (4*$no_of_species) - 1;
2728 $typecord = 2 + (4*$no_of_species) + 1 - 1;
2729 $startcord = 2 + (4*$no_of_species) + 2 - 1;
2730 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
2731 $endcord = 2 + (4*$no_of_species) + 4 - 1;
2732 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
2733 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
2734 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
2735
2736 $interr_poscord = $motifcord + 3;
2737 $no_of_interruptionscord = $motifcord + 4;
2738 $interrcord = $motifcord + 2;
2739 $interrtypecord = $motifcord + 1;
2740
2741
2742 $prinkter = 0;
2743 # print "prionkytet = $prinkter\n";
2744
2745 open(IN,"<$input1") or die "Cannot open file $input1 $!";
2746 open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
2747
2748 open(INT,">$output1") or die "Cannot open file $output2 $!";
2749 open(UNINT,">$output2") or die "Cannot open file $output2 $!";
2750
2751 # print "opened files !!\n";
2752 my $linecounter = 0;
2753 my $microcounter = 0;
2754
2755 my %micros = ();
2756 while (my $line = <IN>){
2757 # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
2758 $linecounter++;
2759 if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s+([0-9a-zA-Z_]+)\s([0-9]+)\s+([0-9]+)\s/ ) {
2760 my $key = join("\t",$1, $2, $3, $4, $5);
2761 # print $key, "#-#-#-#-#-#-#-#\n" if $prinkter == 1;
2762 push (@{$micros{$key}},$line);
2763 $microcounter++;
2764 }
2765 else {#print $line if $prinkter == 1;
2766 }
2767 }
2768 # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
2769 close IN;
2770 my @deletedlines = ();
2771 # print "done hash \n";
2772 $linecounter = 0;
2773 #---------------------------------------------------------------------------------------------------
2774 # NOW READING THE SEQUENCE FILE
2775 while(my $sine = <SEQ>){
2776 #print $linecounter,"\n" if $linecounter % 1000 == 0;
2777 my %microstart=();
2778 my %microend=();
2779 my @sields = split(/\t/,$sine);
2780 my $key = ();
2781 if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
2782 $key = join("\t",$1, $2, $3, $4, $5);
2783 # print $key, "<-<-<-<-<-<-<-<\n";
2784 }
2785
2786 # $prinkter = 1 if $sine =~ /^>H\t499\t/;
2787
2788 if (exists $micros{$key}){
2789 my @microstring = @{$micros{$key}};
2790 delete $micros{$key};
2791 my @filteredmicrostring;
2792 # print "sequence = $sields[$sequencepos]" if $prinkter == 1;
2793 foreach my $line (@microstring){
2794 $linecounter++;
2795 my $copy_line = $line;
2796 my @fields = split(/\t/,$line);
2797 my $start = $fields[$startcord];
2798 my $end = $fields[$endcord];
2799
2800 # print $line if $prinkter == 1;
2801 #LOOKING FOR LEFTWARD EXTENTION OF MICROSATELLITE
2802 my $newline;
2803 while(1){
2804 # print "\n before left sequence = $sields[$sequencepos]\n" if $prinkter == 1;
2805 if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;}
2806
2807 $newline = multiSpecies_interruptedMicrosatHunter_left_extender($line, $sields[$sequencepos],$org);
2808 if ($newline eq $line){$line = $newline; last;}
2809 else {$line = $newline;}
2810
2811 if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;}
2812 # print "returned line from left extender= $line \n" if $prinkter == 1;
2813 }
2814 while(1){
2815 # print "sequence = $sields[$sequencepos]\n" if $prinkter == 1;
2816 if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;}
2817
2818 $newline = multiSpecies_interruptedMicrosatHunter_right_extender($line, $sields[$sequencepos],$org);
2819 if ($newline eq $line){$line = $newline; last;}
2820 else {$line = $newline;}
2821
2822 if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;}
2823 # print "returned line from right extender= $line \n" if $prinkter == 1;
2824 }
2825 # print "\n>>>>>>>>>>>>>>>>\n In the end, the line is: \n$line\n<<<<<<<<<<<<<<<<\n" if $prinkter == 1;
2826
2827 my @tempfields = split(/\t/,$line);
2828 if ($tempfields[$microsatcord] =~ /\[/){
2829 print INT $line,"\n";
2830 }
2831 else{
2832 print UNINT $line,"\n";
2833 }
2834
2835 if ($line =~ /NULL/){ next; }
2836 push(@filteredmicrostring, $line);
2837 push (@{$microstart{$start}},$line);
2838 push (@{$microend{$end}},$line);
2839 }
2840
2841 my $firstflag = 'down';
2842
2843 } #if (exists $micros{$key}){
2844 }
2845 close INT;
2846 close UNINT;
2847 # print "final number of lines = $linecounter\n";
2848 }
2849
2850 sub multiSpecies_interruptedMicrosatHunter_left_extender{
2851 my ($line, $seq, $org) = @_;
2852 # print "left extender, like passed = $line\n" if $prinkter == 1;
2853 # print "in left extender... line passed = $line and sequence is $seq\n" if $prinkter == 1;
2854 chomp $line;
2855 my @fields = split(/\t/,$line);
2856 my $rstart = $fields[$startcord];
2857 my $microsat = $fields[$microsatcord];
2858 $microsat =~ s/\[|\]//g;
2859 my $rend = $rstart + length($microsat)-1;
2860 $microsat =~ s/-//g;
2861 my $motif = $fields[$motifcord];
2862 my $firstmotif = ();
2863
2864 if ($motif =~ /^\[/){
2865 $motif =~ s/^\[//g;
2866 $motif =~ /([a-zA-Z]+)\].*/;
2867 $firstmotif = $1;
2868 }
2869 else {$firstmotif = $motif;}
2870
2871 # print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n" if $prinkter == 1;
2872 my $leftphase = substr($microsat, 0,length($firstmotif));
2873 my $phaser = $leftphase.$leftphase;
2874 my @phase = split(/\s*/,$leftphase);
2875 my @phases;
2876 my @copy_phases = @phases;
2877 my $crawler=0;
2878 for (0 ... (length($leftphase)-1)){
2879 push(@phases, substr($phaser, $crawler, length($leftphase)));
2880 $crawler++;
2881 }
2882
2883 my $start = $rstart;
2884 my $end = $rend;
2885
2886 my $leftseq = substr($seq, 0, $start);
2887 # print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n" if $prinkter == 1;
2888 my @extentions = ();
2889 my @trappeds = ();
2890 my @intervalposs = ();
2891 my @trappedposs = ();
2892 my @trappedphases = ();
2893 my @intervals = ();
2894 my $firstmotif_length = length($firstmotif);
2895 foreach my $phase (@phases){
2896 # print "left phase\t",substr($leftseq, -10),"\t$phase\n" if $prinkter == 1;
2897 # print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n" if $prinkter == 1;
2898 if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){
2899 # print "in left pattern\n" if $prinkter == 1;
2900 my $trapped = $1;
2901 my $trappedpos = length($leftseq)-length($trapped);
2902 my $interval = $3;
2903 my $intervalpos = index($trapped, $interval) + 1;
2904 # print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n" if $prinkter == 1;
2905
2906 my $extention = substr($trapped, 0, length($trapped)-length($interval));
2907 my $leftpeep = substr($seq, 0, ($start-length($trapped)));
2908 my @passed_overhangs;
2909
2910 for my $i (1 ... length($phase)-1){
2911 my $overhang = substr($phase, -length($phase)+$i);
2912 # print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n" if $prinkter == 1;
2913 #TEMPORARY... BETTER METHOD NEEDED
2914 $leftpeep =~ s/-//g;
2915 if ($leftpeep =~ /$overhang$/i){
2916 push(@passed_overhangs,$overhang);
2917 # print "l overhang\n" if $prinkter == 1;
2918 }
2919 }
2920
2921 if(scalar(@passed_overhangs)>0){
2922 my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)];
2923 $extention = $overhang.$extention;
2924 $trapped = $overhang.$trapped;
2925 # print "trapped extended to $trapped \n" if $prinkter == 1;
2926 $trappedpos = length($leftseq)-length($trapped);
2927 }
2928
2929 push(@extentions,$extention);
2930 # print "extentions = @extentions \n" if $prinkter == 1;
2931
2932 push(@trappeds,$trapped );
2933 push(@intervalposs,length($extention)+1);
2934 push(@trappedposs, $trappedpos);
2935 # print "trappeds = @trappeds\n" if $prinkter == 1;
2936 push(@trappedphases, substr($extention,0,length($phase)));
2937 push(@intervals, $interval);
2938 }
2939 }
2940 if (scalar(@trappeds == 0)) {return $line;}
2941
2942 ############################ my $nikaal = longest_array_element(@trappeds);
2943 my $nikaal = shortest_array_element(@intervals);
2944
2945 # print "longest element found = $nikaal \n" if $prinkter == 1;
2946
2947 if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
2948 $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord];
2949 #print "new fields 9 = $fields[9]\n" if $prinkter == 1;
2950 $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]);
2951
2952 #print "new fields 9 = $fields[9]\n" if $prinkter == 1;
2953
2954 if($fields[$microsatcord] !~ /^\[/i){
2955 $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
2956 }
2957
2958 $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord];
2959 #print "new fields 14 = $fields[12]\n" if $prinkter == 1;
2960
2961 #print "scalar of fields = ",scalar(@fields),"\n" if $prinkter == 1;
2962
2963
2964 if (scalar(@fields) > $motifcord+1){
2965 $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1];
2966 }
2967 else{$fields[$motifcord+1] = "indel/deletion";}
2968 #print "new fields 14 = $fields[14]\n" if $prinkter == 1;
2969
2970 if (scalar(@fields)>$motifcord+2){
2971 $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2];
2972 }
2973 else{$fields[$motifcord+2] = $intervals[$nikaal];}
2974 #print "new fields 15 = $fields[15]\n" if $prinkter == 1;
2975
2976 my @seventeen=();
2977
2978 if (scalar(@fields)>$motifcord+3){
2979 @seventeen = split(/,/,$fields[$motifcord+3]);
2980 # print "scalarseventeen =@seventeen<-\n" if $prinkter == 1;
2981 for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);}
2982 $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen);
2983 $fields[$motifcord+4] = $fields[$motifcord+4]+1;
2984 }
2985
2986 else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1}
2987
2988 #print "new fields 16 = $fields[16]\n" if $prinkter == 1;
2989 #print "new fields 17 = $fields[17]\n" if $prinkter == 1;
2990
2991 # return join("\t",@fields);
2992 my $returnline = join("\t",@fields);
2993 my $pastline = $returnline;
2994 if ($fields[$microsatcord] =~ /\[/){
2995 $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline);
2996 }
2997 # print "finally left-extended line = ",$returnline,"\n" if $prinkter == 1;
2998 return $returnline;
2999 }
3000
3001 sub multiSpecies_interruptedMicrosatHunter_right_extender{
3002 # print "right extender\n" if $prinkter == 1;
3003 my ($line, $seq, $org) = @_;
3004 # print "in right extender... line passed = $line\n" if $prinkter == 1;
3005 # print "line = $line, sequence = ",$seq, "\n" if $prinkter == 1;
3006 chomp $line;
3007 my @fields = split(/\t/,$line);
3008 my $rstart = $fields[$startcord];
3009 my $microsat = $fields[$microsatcord];
3010 $microsat =~ s/\[|\]//g;
3011 my $rend = $rstart + length($microsat)-1;
3012 $microsat =~ s/-//g;
3013 my $motif = $fields[$motifcord];
3014 my $temp_lastmotif = ();
3015
3016 if ($motif =~ /\]$/){
3017 $motif =~ s/\]$//g;
3018 $motif =~ /.*\[([a-zA-Z]+)/;
3019 $temp_lastmotif = $1;
3020 }
3021 else {$temp_lastmotif = $motif;}
3022 my $lastmotif = substr($microsat,-length($temp_lastmotif));
3023 # print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n" if $prinkter == 1;
3024 my $rightphase = substr($microsat, -length($lastmotif));
3025 my $phaser = $rightphase.$rightphase;
3026 my @phase = split(/\s*/,$rightphase);
3027 my @phases;
3028 my @copy_phases = @phases;
3029 my $crawler=0;
3030 for (0 ... (length($rightphase)-1)){
3031 push(@phases, substr($phaser, $crawler, length($rightphase)));
3032 $crawler++;
3033 }
3034
3035 my $start = $rstart;
3036 my $end = $rend;
3037
3038 my $rightseq = substr($seq, $end+1);
3039 # print "length of sequence = " ,length($seq), "the coordinate to start from = ", $end+1, "\n" if $prinkter == 1;
3040 # print "right phases are @phases , end = $end right sequence = ",substr($rightseq,0,10),"\n" if $prinkter == 1;
3041 my @extentions = ();
3042 my @trappeds = ();
3043 my @intervalposs = ();
3044 my @trappedposs = ();
3045 my @trappedphases = ();
3046 my @intervals = ();
3047 my $lastmotif_length = length($lastmotif);
3048 foreach my $phase (@phases){
3049 # print "right phase\t$phase\t",substr($rightseq,0,10),"\n" if $prinkter == 1;
3050 # print "search patter = (([a-zA-Z|-]{0,$lastmotif_length})($phase)+) \n" if $prinkter == 1;
3051 if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){
3052 # print "in right pattern\n" if $prinkter == 1;
3053 my $trapped = $1;
3054 my $trappedpos = $end+1;
3055 my $interval = $2;
3056 my $intervalpos = index($trapped, $interval) + 1;
3057 # print "trapped = $trapped, interval = $interval\n" if $prinkter == 1;
3058
3059 my $extention = substr($trapped, length($interval));
3060 my $rightpeep = substr($seq, ($end+length($trapped))+1);
3061 my @passed_overhangs = "";
3062
3063 #TEMPORARY... BETTER METHOD NEEDED
3064 $rightpeep =~ s/-//g;
3065
3066 for my $i (1 ... length($phase)-1){
3067 my $overhang = substr($phase,0, $i);
3068 # print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n" if $prinkter == 1;
3069 if ($rightpeep =~ /^$overhang/i){
3070 push(@passed_overhangs, $overhang);
3071 # print "r overhang\n" if $prinkter == 1;
3072 }
3073 }
3074 if (scalar(@passed_overhangs) > 0){
3075 my $overhang = @passed_overhangs[longest_array_element(@passed_overhangs)];
3076 $extention = $extention.$overhang;
3077 $trapped = $trapped.$overhang;
3078 # print "trapped extended to $trapped \n" if $prinkter == 1;
3079 }
3080
3081 push(@extentions,$extention);
3082 #print "extentions = @extentions \n" if $prinkter == 1;
3083
3084 push(@trappeds,$trapped );
3085 push(@intervalposs,$intervalpos);
3086 push(@trappedposs, $trappedpos);
3087 # print "trappeds = @trappeds\n" if $prinkter == 1;
3088 push(@trappedphases, substr($extention,0,length($phase)));
3089 push(@intervals, $interval);
3090 }
3091 }
3092 if (scalar(@trappeds == 0)) {return $line;}
3093
3094 ################################### my $nikaal = longest_array_element(@trappeds);
3095 my $nikaal = shortest_array_element(@intervals);
3096
3097 # print "longest element found = $nikaal \n" if $prinkter == 1;
3098
3099 if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
3100 $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]";
3101 $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]);
3102
3103
3104 if($fields[$microsatcord] !~ /^\[/i){
3105 $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
3106 }
3107
3108 $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]";
3109
3110
3111 if (scalar(@fields) > $motifcord+1){
3112 $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion";
3113 }
3114 else{$fields[$motifcord+1] = "indel/deletion";}
3115
3116 if (scalar(@fields)>$motifcord+2){
3117 $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal];
3118 }
3119 else{$fields[$motifcord+2] = $intervals[$nikaal];}
3120
3121 my @seventeen=();
3122 if (scalar(@fields)>$motifcord+3){
3123 #print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n" if $prinkter == 1;
3124 my $currpos = length($microsat)+$intervalposs[$nikaal];
3125 $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos;
3126 $fields[$motifcord+4] = $fields[$motifcord+4]+1;
3127
3128 }
3129
3130 else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1}
3131
3132 # print "finally right-extended line = ",join("\t",@fields),"\n" if $prinkter == 1;
3133 # return join("\t",@fields);
3134
3135 my $returnline = join("\t",@fields);
3136 my $pastline = $returnline;
3137 if ($fields[$microsatcord] =~ /\[/){
3138 $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline);
3139 }
3140 # print "finally right-extended line = ",$returnline,"\n" if $prinkter == 1;
3141 return $returnline;
3142
3143 }
3144
3145 sub multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver{
3146 my @fields = split(/\t/,$_[0]);
3147 my $microsat = $fields[$microsatcord];
3148 $microsat =~ s/(^\[)|-//sg;
3149 my $motif = $fields[$motifcord];
3150 chomp $motif;
3151 # print $motif, "\n" if $motif !~ /^\[/;
3152 my $firstmotif = ();
3153 my $firststretch = ();
3154 my @stretches=();
3155
3156 # print "motif = $motif, microsat = $microsat\n" if $prinkter == 1;
3157 if ($motif =~ /^\[/){
3158 $motif =~ s/^\[//sg;
3159 $motif =~ /([a-zA-Z]+)\].*/;
3160 $firstmotif = $1;
3161 @stretches = split(/\]/,$microsat);
3162 $firststretch = $stretches[0];
3163 #print "firststretch = $firststretch\n" if $prinkter == 1;
3164 }
3165 else {$firstmotif = $motif;$firststretch = $microsat;}
3166 # print "if length:firststretch - length($firststretch) < threshes length :firstmotif ($firstmotif) - $thresholds[length($firstmotif)]\n" if $prinkter == 1;
3167 if (length($firststretch) < $thresholds[length($firstmotif)]){
3168 return "no";
3169 }
3170 else {return "yes";}
3171
3172 }
3173 sub multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver{
3174 my @fields = split(/\t/,$_[0]);
3175 my $microsat = $fields[$microsatcord];
3176 $microsat =~ s/-|(\]$)//sg;
3177 my $motif = $fields[$motifcord];
3178 chomp $motif;
3179 my $temp_lastmotif = ();
3180 my $laststretch = ();
3181 my @stretches=();
3182
3183
3184 if ($motif =~ /\]/){
3185 $motif =~ s/\]$//sg;
3186 $motif =~ /.*\[([a-zA-Z]+)$/;
3187 $temp_lastmotif = $1;
3188 @stretches = split(/\[/,$microsat);
3189 $laststretch = pop(@stretches);
3190 #print "last stretch = $laststretch\n" if $prinkter == 1;
3191 }
3192 else {$temp_lastmotif = $motif; $laststretch = $microsat;}
3193
3194 if (length($laststretch) < $thresholds[length($temp_lastmotif)]){
3195 return "no";
3196 }
3197 else { return "yes";}
3198
3199
3200 }
3201 sub checking_substitutions{
3202
3203 my ($line, $seq, $startprobes, $endprobes) = @_;
3204 #print "sequence = $seq \n" if $prinkter == 1;
3205 #print "COMMAND = \n $line, \n $seq, \n $startprobes \n, $endprobes\n";
3206 # <STDIN>;
3207 my @seqarray = split(/\s*/,$seq);
3208 my @startsubst_probes = split(/\|/,$startprobes);
3209 my @endsubst_probes = split(/\|/,$endprobes);
3210 chomp $line;
3211 my @fields = split(/\t/,$line);
3212 my $start = $fields[11] - $fields[10];
3213 my $end = $fields[13] - $fields[10];
3214 my $motif = $fields[9]; #IN FUTURE, USE THIS AS A PROBE, LIKE MOTIF = $FIELDS[9].$FIELDS[9]
3215 $motif =~ s/\[|\]//g;
3216 my $microsat = $fields[14];
3217 $microsat =~ s/\[|\]//g;
3218 #------------------------------------------------------------------------
3219 # GETTING START AND END PHASES
3220 my $startphase = substr($microsat,0, length($motif));
3221 my $endphase = substr($microsat,-length($motif), length($motif));
3222 #print "start and end phases are - $startphase and $endphase\n";
3223 my $startflag = 'down';
3224 my $endflag = 'down';
3225 my $substitution_distance = length($motif);
3226 my $prestart = $start - $substitution_distance;
3227 my $postend = $end + $substitution_distance;
3228 my @endadds = ();
3229 my @startadds = ();
3230 if (($prestart < 0) || ($postend > scalar(@seqarray))) {
3231 last;
3232 }
3233 #------------------------------------------------------------------------#------------------------------------------------------------------------
3234 # CHECKING FOR SUBSTITUTION PROBES NOW
3235
3236 if ($fields[8] ne "mononucleotide"){
3237 while ($startflag eq "down"){
3238 my $search = join("",@seqarray[$prestart...($start-1)]);
3239 #print "search is from $prestart...($start-1) = $search\n";
3240 foreach my $probe (@startsubst_probes){
3241 #print "\t\tprobe = $probe\n";
3242 if ($search =~ /^$probe/){
3243 #print "\tfound addition to the left - $search \n";
3244 my $copyprobe = $probe;
3245 my $type;
3246 my $subspos = 0;
3247 my $interruption = "";
3248 if ($search eq $startphase) { $type = "NONE";}
3249 else{
3250 $copyprobe =~ s/\[a-zA-Z\]/^/g;
3251 $subspos = index($copyprobe,"^") + 1;
3252 $type = "substitution";
3253 $interruption = substr($search, $subspos,1);
3254 }
3255 my $addinfo = join("\t",$prestart, $start, $search, $type, $interruption, $subspos);
3256 #print "adding information: $addinfo \n";
3257 push(@startadds, $addinfo);
3258 $prestart = $prestart - $substitution_distance;
3259 $start = $start-$substitution_distance;
3260 $startflag = 'down';
3261
3262 last;
3263 }
3264 else{
3265 $startflag = 'up';
3266 }
3267 }
3268 }
3269 #<STDIN>;
3270 while ($endflag eq "down"){
3271 my $search = join("",@seqarray[($end+1)...$postend]);
3272 #print "search is from ($end+1)...$postend] = $search\n";
3273
3274 foreach my $probe (@endsubst_probes){
3275 #print "\t\tprobe = $probe\n";
3276 if ($search =~ /$probe$/){
3277 my $copyprobe = $probe;
3278 my $type;
3279 my $subspos = 0;
3280 my $interruption = "";
3281 if ($search eq $endphase) { $type = "NONE";}
3282 else{
3283 $copyprobe =~ s/\[a-zA-Z\]/^/g;
3284 $subspos = index($copyprobe,"^") + 1;
3285 $type = "substitution";
3286 $interruption = substr($search, $subspos,1);
3287 }
3288 my $addinfo = join("\t",$end, $postend, $search, $type, $interruption, $subspos);
3289 #print "adding information: $addinfo \n";
3290 push(@endadds, $addinfo);
3291 $postend = $postend + $substitution_distance;
3292 $end = $end+$substitution_distance;
3293 push(@endadds, $search);
3294 $endflag = 'down';
3295 last;
3296 }
3297 else{
3298 $endflag = 'up';
3299 }
3300 }
3301 }
3302 #print "startadds = @startadds, endadds = @endadds \n";
3303
3304 }
3305 }
3306 sub microsat_packer{
3307 my $microsat = $_[0];
3308 my $addition = $_[1];
3309
3310
3311
3312 }
3313 sub multiSpecies_interruptedMicrosatHunter_merge{
3314 $prinkter = 0;
3315 # print "~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~\n";
3316 my $line = $_[0];
3317 # print "sent for mering: $line \n" if $prinkter ==1;
3318 my @mields = split(/\t/,$line);
3319 my @fields = @mields;
3320 my $microsat = allCaps($fields[$microsatcord]);
3321 my $motifline = allCaps($fields[$motifcord]);
3322 my $microsatcopy = $microsat;
3323 # print "microsat = $microsat|\n" if $prinkter ==1;
3324 $microsatcopy =~ s/^\[|\]$//sg;
3325 chomp $microsatcopy;
3326 my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy);
3327 my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat);
3328 shift @inields;
3329 # print "inields =",join("|",@inields)," microields = ",join("|",@microields)," and count of microields = ", $#microields,"\n" if $prinkter ==1;
3330 $motifline =~ s/^\[|\]$//sg;
3331 my @motields = split(/\]\[/,$motifline);
3332 my @firstmotifs = ();
3333 my @lastmotifs = ();
3334 for my $i (0 ... $#microields){
3335 $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i]));
3336 $lastmotifs[$i] = substr($microields[$i],-length($motields[$i]));
3337 }
3338 # print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n" if $prinkter ==1;
3339 my @mergelist = ();
3340 my @inter_poses = split(/,/,$fields[$interr_poscord]);
3341 my $no_of_interruptions = $fields[$no_of_interruptionscord];
3342 my @interruptions = split(/,/,$fields[$interrcord]);
3343 my @interrtypes = split(/,/,$fields[$interrtypecord]);
3344 my $stopper = 0;
3345 for my $i (0 ... $#motields-1){
3346 # print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n:$lastmotifs[$i] eq $firstmotifs[$i+1]?\n" if $prinkter ==1;
3347 if ((allCaps($lastmotifs[$i]) eq allCaps($firstmotifs[$i+1])) && (!exists $inields[$i] || $inields[$i] !~ /[a-zA-Z]/)){
3348 $stopper = 1;
3349 push(@mergelist, ($i)."_".($i+1)); #<STDIN> if $prinkter ==1;
3350 }
3351 }
3352
3353 # print "mergelist = @mergelist\n" if $prinkter ==1;
3354 return $line if scalar(@mergelist) == 0;
3355 # print "merging @mergelist\n" if $prinkter ==1;
3356 # <STDIN> if $prinkter ==1;
3357
3358 foreach my $merging (@mergelist){
3359 my @sets = split(/_/, $merging);
3360 # print "sets = @sets\n" if $prinkter ==1;
3361 my @tempmicro = ();
3362 my @tempmot = ();
3363 # print "for loop going from 0 ... ", $sets[0]-1, "\n" if $prinkter ==1;
3364 for my $i (0 ... $sets[0]-1){
3365 # print " adding pre- i = $i adding: microields= $microields[$i]. motields = $motields[$i], inields = |$inields[$i]|\n" if $prinkter ==1;
3366 push(@tempmicro, "[".$microields[$i]."]");
3367 push(@tempmicro, $inields[$i]);
3368 push(@tempmot, "[".$motields[$i]."]");
3369 # print "adding pre-motifs number $i\n" if $prinkter ==1;
3370 # print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1;
3371 }
3372 # print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1;
3373 # print "now pushing ", "[",$microields[$sets[0]]," and ",$microields[$sets[1]],"]\n" if $prinkter ==1;
3374 my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]";
3375 # print "middle is, from @motields - @sets, number 0 which is is\n";
3376 # print ": $motields[$sets[0]]\n";
3377 push (@tempmicro, $pusher);
3378 push(@tempmot, "[".$motields[$sets[0]]."]");
3379 push (@tempmicro, $inields[$sets[1]]) if $sets[1] != $#microields && exists $sets[1] && exists $inields[$sets[1]];
3380 my $outcoming = -2;
3381 # print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1;
3382 # print "for loop going from ",$sets[1]+1, " ... ", $#microields, "\n" if $prinkter ==1;
3383 for my $i ($sets[1]+1 ... $#microields){
3384 # print " adding post- i = $i adding: microields= $microields[$i]. motields = $motields[$i]\n" if $prinkter ==1;
3385 push(@tempmicro, "[".$microields[$i]."]") if exists $microields[$i];
3386 push(@tempmicro, $inields[$i]) unless $i == $#microields || !exists $inields[$i];
3387 push(@tempmot, "[".$motields[$i]."]");
3388 # print "adding post-motifs number $i\n" if $prinkter ==1;
3389 $outcoming = $i;
3390 }
3391 # print "____________________________________________________________________________\n";
3392 $prinkter = 0;
3393 $fields[$microsatcord] = join("",@tempmicro);
3394 $fields[$motifcord] = join("",@tempmot);
3395 # print "tempmot = @tempmot, tempmicro = @tempmicro . microsat = $fields[$microsatcord] and motif = $fields[$motifcord] \n" if $prinkter ==1;
3396
3397 splice(@interrtypes, $sets[0], 1);
3398 $fields[$interrtypecord] = join(",",@interrtypes);
3399 splice(@interruptions, $sets[0], 1);
3400 $fields[$interrcord] = join(",",@interruptions);
3401 splice(@inter_poses, $sets[0], 1);
3402 $fields[$interr_poscord] = join(",",@inter_poses);
3403 $no_of_interruptions = $no_of_interruptions - 1;
3404 }
3405
3406 if ($no_of_interruptions == 0 && $line !~ /compound/){
3407 $fields[$microsatcord] =~ s/^\[|\]$//sg;
3408 $fields[$motifcord] =~ s/^\[|\]$//sg;
3409 $line = join("\t", @fields[0 ... $motifcord]);
3410 }
3411 else{
3412 $line = join("\t", @fields);
3413 }
3414 # print "post merging, the line is $line\n" if $prinkter ==1;
3415 #<STDIN> if $stopper ==1;
3416 return $line;
3417 }
3418 sub interval_asseser{
3419 my $pre_phase = $_[0]; my $post_phase = $_[1]; my $inter = $_[3];
3420 }
3421 #---------------------------------------------------------------------------------------------------
3422 sub allCaps{
3423 my $motif = $_[0];
3424 $motif =~ s/a/A/g;
3425 $motif =~ s/c/C/g;
3426 $motif =~ s/t/T/g;
3427 $motif =~ s/g/G/g;
3428 return $motif;
3429 }
3430
3431
3432 #xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx chromosome_unrand_breamultiSpecies_interruptedMicrosatHunterker xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx
3433
3434
3435 #xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx
3436 sub merge_interruptedMicrosats{
3437 # print "IN merge_interruptedMicrosats: @_\n";
3438 my $input0 = $_[0]; ######looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2"
3439 my $input1 = $_[1]; ###### the *_sput_op4_ii file
3440 my $input2 = $_[2]; ###### the *_sput_op4_ii file
3441 $no_of_species = $_[3];
3442
3443 my $output1 = $_[1]."_separate"; #$_[3]; ###### plain microsatellite file forward
3444 my $output2 = $_[2]."_separate"; ##$_[4]; ###### plain microsatellite file reverse
3445 my $output3 = $_[1]."_merged"; ##$_[5]; ###### plain microsatellite file forward
3446 #my $output4 = $_[2]."_merged"; ##$_[6]; ###### plain microsatellite file reverse
3447 #my $info = $_[4];
3448 #my @tags = split(/\t/,$info);
3449
3450 open(SEQ,"<$input0") or die "Cannot open file $input0 $!";
3451 open(INF,"<$input1") or die "Cannot open file $input1 $!";
3452 open(INR,"<$input2") or die "Cannot open file $input2 $!";
3453 open(OUTF,">$output1") or die "Cannot open file $output1 $!";
3454 open(OUTR,">$output2") or die "Cannot open file $output2 $!";
3455 open(MER,">$output3") or die "Cannot open file $output3 $!";
3456 #open(MERR,">$output4") or die "Cannot open file $output4 $!";
3457
3458
3459
3460 $printer = 0;
3461
3462 # print "files opened \n";
3463 $infocord = 2 + (4*$no_of_species) - 1;
3464 $startcord = 2 + (4*$no_of_species) + 2 - 1;
3465 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
3466 $endcord = 2 + (4*$no_of_species) + 4 - 1;
3467 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
3468 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
3469 $typecord = $infocord + 1;
3470 my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
3471
3472 $interrtypecord = $motifcord + 1;
3473 $interrcord = $motifcord + 2;
3474 $interr_poscord = $motifcord + 3;
3475 $no_of_interruptionscord = $motifcord + 4;
3476 $mergestarts = $no_of_interruptionscord+ 1;
3477 $mergeends = $no_of_interruptionscord+ 2;
3478 $mergemicros = $no_of_interruptionscord+ 3;
3479
3480 # NOW ADDING FORWARD MICROSATELLITES TO HASH
3481 my %fmicros = ();
3482 my $microcounter=0;
3483 my $linecounter = 0;
3484 while (my $line = <INF>){
3485 # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
3486 $linecounter++;
3487 if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
3488 my $key = join("\t",$1, $2, $4, $5);
3489 # print $key, "#-#-#-#-#-#-#-#\n";
3490 push (@{$fmicros{$key}},$line);
3491 $microcounter++;
3492 }
3493 else {
3494 #print $line;
3495 }
3496 }
3497 # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
3498 close INF;
3499 my @deletedlines = ();
3500 # print "done forward hash \n";
3501 $linecounter = 0;
3502 #---------------------------------------------------------------------------------------------------
3503 # NOW ADDING REVERSE MICROSATELLITES TO HASH
3504 my %rmicros = ();
3505 $microcounter=0;
3506 while (my $line = <INR>){
3507 # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
3508 $linecounter++;
3509 if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
3510 my $key = join("\t",$1, $2, $4, $5);
3511 # print $key, "#-#-#-#-#-#-#-#\n";
3512 push (@{$rmicros{$key}},$line);
3513 $microcounter++;
3514 }
3515 else {
3516 #print "cant make key\n";
3517 }
3518 }
3519 # print "number of reverse microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
3520 close INR;
3521 # print "done reverse hash \n";
3522 $linecounter = 0;
3523
3524 #------------------------------------------------------------------------------------------------
3525
3526 while(my $sine = <SEQ>){
3527 #<STDIN> if $sine =~ /16349128/;
3528 next if $sine !~ /[a-zA-Z0-9]/;
3529 # print "-" x 150, "\n" if $printer == 1;
3530 my @sields = split(/\t/,$sine);
3531 my @merged = ();
3532
3533 my $key = ();
3534
3535 if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
3536 $key = join("\t",$1, $2, $4, $5);
3537 # print $key, "<-<-<-<-<-<-<-<\n";
3538 }
3539 # print "key = $key\n";
3540
3541 my @sets1;
3542 my @sets2;
3543 chomp $sields[$sequencepos];
3544 my $rev_sequence = reverse($sields[$sequencepos]);
3545 $rev_sequence =~ s/ //g;
3546 $rev_sequence = " ".$rev_sequence;
3547 next if (!exists $fmicros{$key} && !exists $rmicros{$key});
3548
3549 if (exists $fmicros{$key}){
3550 # print "line no : $linecount\n";
3551 my @raw_microstring = @{$fmicros{$key}};
3552 my %starts = (); my %ends = ();
3553 # print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;}
3554 my @microstring=();
3555 for my $u (0 ... $#raw_microstring){
3556 my @tields = split(/\t/,$raw_microstring[$u]);
3557 next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]};
3558 push(@microstring, $raw_microstring[$u]);
3559 $starts{$tields[$startcord]} = $tields[$startcord];
3560 $ends{$tields[$endcord]} = $tields[$endcord];
3561 }
3562
3563 # print "founf microstring in forward\n: @microstring\n";
3564 chomp @microstring;
3565 my $clusterresult = (find_clusters(@microstring, $sields[$sequencepos]));
3566 @sets1 = split("\=", $clusterresult);
3567 my @temp = split(/_X0X_/,$sets1[0]) ; $microscanned+= scalar(@temp);
3568 # print "sets = ", join("<all\nmerged>", @sets1), "\n<<-sets1\n"; <STDIN>;
3569 } #if (exists $micros{$key}){
3570
3571 if (exists $rmicros{$key}){
3572 # print "line no : $linecount\n";
3573 my @raw_microstring = @{$rmicros{$key}};
3574 my %starts = (); my %ends = ();
3575 # print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;}
3576 my @microstring=();
3577 for my $u (0 ... $#raw_microstring){
3578 my @tields = split(/\t/,$raw_microstring[$u]);
3579 next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]};
3580 push(@microstring, $raw_microstring[$u]);
3581 $starts{$tields[$startcord]} = $tields[$startcord];
3582 $ends{$tields[$endcord]} = $tields[$endcord];
3583 }
3584 # print "founf microstring in reverse\n: @microstring\n"; <STDIN>;
3585 chomp @microstring;
3586 # print "sending reversed sequence\n";
3587 my $clusterresult = (find_clusters(@microstring, $rev_sequence ) );
3588 @sets2 = split("\=", $clusterresult);
3589 my @temp = split(/_X0X_/,$sets2[0]) ; $microscanned+= scalar(@temp);
3590 } #if (exists $micros{$key}){
3591
3592 my @popout1 = ();
3593 my @popout2 = ();
3594 my @forwardset = ();
3595 if (exists $sets2[1] ){
3596 if(exists $sets1[0]) {
3597 push (@popout1, $sets1[0],$sets2[1]);
3598 my @forwardset = split("=", popOuter(@popout1, $rev_sequence ));#
3599 print OUTF join("\n",split("_X0X_", $forwardset[0])), "\n";
3600 my @localmerged = split("_X0X_", $forwardset[1]);
3601 my $sequence = $sields[$sequencepos];
3602 $sequence =~ s/ //g;
3603 # print "\nforwardset = @forwardset\n";
3604 for my $j (0 ... $#localmerged){
3605 $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence));
3606 }
3607
3608 push (@merged, @localmerged);
3609
3610 }
3611 else{
3612 my @localmerged = split("_X0X_", $sets2[1]);
3613 my $sequence = $sields[$sequencepos];
3614 $sequence =~ s/ //g;
3615 for my $j (0 ... $#localmerged){
3616 # print "\nlocalmerged = @localmerged\n";
3617 $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence));
3618 }
3619
3620 push (@merged, @localmerged);
3621 }
3622 }
3623 elsif (exists $sets1[0]){
3624 print OUTF join("\n",split("_X0X_", $sets1[0])), "\n";
3625 }
3626
3627 my @reverseset= ();
3628 if (exists $sets1[1]){
3629 if (exists $sets2[0]){
3630 push (@popout2, $sets2[0],$sets1[1]);
3631 # print "popout2 = @popout2\n";
3632 my @reverseset = split("=", popOuter(@popout2, $sields[$sequencepos]));
3633 #print "reverseset = $reverseset[1] < --- reverseset1\n";
3634 print OUTR join("\n",split("_X0X_", $reverseset[0])), "\n";
3635 push(@merged, (split("_X0X_", $reverseset[1])));
3636 }
3637 else{
3638 push(@merged, (split("_X0X_", $sets1[1])));
3639 }
3640 }
3641 elsif (exists $sets2[0]){
3642 print OUTR join("\n",split("_X0X_", $sets2[0])), "\n";
3643
3644 }
3645
3646 if (scalar @merged > 0){
3647 my @filtered_merged = split("__",(filterDuplicates_merged(@merged)));
3648 print MER join("\n", @filtered_merged),"\n";
3649 }
3650 # <STDIN> if $sine =~ /16349128/;
3651
3652 }
3653 close(SEQ);
3654 close(INF);
3655 close(INR);
3656 close(OUTF);
3657 close(OUTR);
3658 close(MER);
3659
3660 }
3661 sub find_clusters{
3662 my @input = @_;
3663 my $sequence = pop(@input);
3664 $sequence =~ s/ //g;
3665 my @microstring0 = @input;
3666 # print "IN: find_clusters:\n";
3667 my %microstart=();
3668 my %microend=();
3669 my @nonmerged = ();
3670 my @mergedSet = ();
3671 # print "set of microsats = @microstring \n";
3672 my @microstring = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @microstring0;
3673 # print "microstring = ", join("\n",@microstring0) ," \n---->\n", join("\n", @microstring),"\n ,,+." if $printer == 1;
3674 #<STDIN> if $printer == 1;
3675 my @tempmicrostring = @microstring;
3676 foreach my $line (@tempmicrostring){
3677 my @fields = split(/\t/,$line);
3678 my $start = $fields[$startcord];
3679 my $end = $fields[$endcord];
3680 next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/;
3681 # print " starts >>> start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n";
3682 push (@{$microstart{$start}},$line);
3683 push (@{$microend{$end}},$line);
3684 }
3685 my $firstflag = 'down';
3686 while( my $line =shift(@microstring)){
3687 # print "-----------\nline = $line \n" if $printer == 1;
3688 chomp $line;
3689 my @fields = split(/\t/,$line);
3690 my $start = $fields[$startcord];
3691 my $end = $fields[$endcord];
3692 next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/ || $distance !~ /[0-9]+/ ;
3693 my $startmicro = $line;
3694 my $endmicro = $line;
3695 # print "start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n";
3696
3697 delete ($microstart{$start});
3698 delete ($microend{$end});
3699 my $flag = 'down';
3700 my $startflag = 'down';
3701 my $endflag = 'down';
3702 my $prestart = $start - $distance;
3703 my $postend = $end + $distance;
3704 my @compoundlines = ();
3705 my %compoundhash = ();
3706 push (@compoundlines, $line);
3707 push (@{$compoundhash{$line}},$line);
3708 my $startrank = 1;
3709 my $endrank = 1;
3710
3711 while( ($startflag eq "down") || ($endflag eq "down") ){
3712 # print "prestart=$prestart, post end =$postend.. seqlen =", length($sequence)," firstflag = $firstflag \n" if $printer == 1;
3713 if ( (($prestart < 0) && $firstflag eq "up") || (($postend > length($sequence) && $firstflag eq "up")) ){
3714 # print "coming to the end of sequence,post end = $postend and sequence length =", length($sequence)," so exiting\n" if $printer == 1;
3715 last;
3716 }
3717
3718 $firstflag = "up";
3719 if ($startflag eq "down"){
3720 for my $i ($prestart ... $end){
3721 if(exists $microend{$i}){
3722 chomp $microend{$i}[0];
3723 if(exists $compoundhash{$microend{$i}[0]}) {next;}
3724 chomp $microend{$i}[0];
3725 push(@compoundlines, $microend{$i}[0]);
3726 my @tields = split(/\t/,$microend{$i}[0]);
3727 $startmicro = $microend{$i}[0];
3728 chomp $startmicro;
3729 $flag = 'down';
3730 $startrank++;
3731 # print "deleting $microend{$i}[0] and $microstart{$tields[$startcord]}[0]\n" if $printer == 1;
3732 delete $microend{$i};
3733 delete $microstart{$tields[$startcord]};
3734 $end = $tields[$endcord];
3735 $startflag = 'down';
3736 $prestart = $tields[$startcord] - $distance;
3737 last;
3738 }
3739 else{
3740 $flag = 'up';
3741 $startflag = 'up';
3742 }
3743 }
3744 }
3745
3746 if ($endflag eq "down"){
3747
3748 for my $i ($start ... $postend){
3749 # print "$start ----> $i -----> $postend\n" if $printer == 1;
3750 if(exists $microstart{$i} ){
3751 chomp $microstart{$i}[0];
3752 if(exists $compoundhash{$microstart{$i}[0]}) {next;}
3753 chomp $microstart{$i}[0];
3754 push(@compoundlines, $microstart{$i}[0]);
3755 my @tields = split(/\t/,$microstart{$i}[0]);
3756 $endmicro = $microstart{$i}[0];
3757 $endrank++;
3758 chomp $endmicro;
3759 $flag = 'down';
3760 # print "deleting $microend{$tields[$endcord]}[0]\n" if $printer == 1;
3761
3762 delete $microstart{$i} if exists $microstart{$i} ;
3763 delete $microend{$tields[$endcord]} if exists $microend{$tields[$endcord]};
3764 # print "done\n" if $printer == 1;
3765
3766 shift @microstring;
3767 $end = $tields[$endcord];
3768 $postend = $tields[$endcord] + $distance;
3769 $endflag = 'down';
3770 last;
3771 }
3772 else{
3773 $flag = 'up';
3774 $endflag = 'up';
3775 }
3776 # print "out of the if\n" if $printer == 1;
3777 }
3778 # print "out of the for\n" if $printer == 1;
3779
3780 }
3781 # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n";
3782 } #end while( $flag eq "down")
3783 # print "compoundlines = @compoundlines \n" if $printer == 1;
3784
3785 if (scalar (@compoundlines) == 1){
3786 push(@nonmerged, $line);
3787
3788 }
3789 if (scalar (@compoundlines) > 1){
3790 # print "FROM CLUSTERER\n" if $printer == 1;
3791 push(@mergedSet,merge_microsats(@compoundlines, $sequence) );
3792 }
3793 } #end foreach my $line (@microstring){
3794 # print join("\n",@mergedSet),"<-----mergedSet\n" if $printer == 1;
3795 #<STDIN> if scalar(@mergedSet) > 0;
3796 # print "EXIT: find_clusters\n";
3797 return (join("_X0X_",@nonmerged). "=".join("_X0X_",@mergedSet));
3798 }
3799
3800 sub custom {
3801 $a->[$startcord+1] <=> $b->[$startcord+1];
3802 }
3803
3804 sub popOuter {
3805 # print "\nIN: popOuter @_\n"; <STDIN>;
3806 my @all = split ("_X0X_",$_[0]);
3807 # <STDIN> if !defined $_[0];
3808 my @merged = split ("_X0X_",$_[1]);
3809 my $sequence = $_[2];
3810 my $seqlen = length($sequence);
3811 my %microstart=();
3812 my %microend=();
3813 my @mergedSet = ();
3814 my @nonmerged = ();
3815
3816 foreach my $line (@all){
3817 my @fields = split(/\t/,$line);
3818 my $start = $seqlen - $fields[$startcord]+ 1;
3819 my $end = $seqlen - $fields[$endcord] + 1;
3820 push (@{$microstart{$start}},$line);
3821 push (@{$microend{$end}},$line);
3822 }
3823 my $firstflag = 'down';
3824 my %forPopouting = ();
3825
3826 while( my $line =shift(@merged)){
3827 # print "\n MErgedline: $line .. startcord = $startcord ... endcord = $endcord\n" ;
3828 chomp $line;
3829 my @fields = split(/\t/,$line);
3830 my $start = $fields[$startcord];
3831 my $end = $fields[$endcord];
3832 my $startmicro = $line;
3833 my $endmicro = $line;
3834
3835
3836 delete ($microstart{$start});
3837 delete ($microend{$end});
3838 my $flag = 'down';
3839 my $startflag = 'down';
3840 my $endflag = 'down';
3841 my $prestart = $start - $distance;
3842 my $postend = $end + $distance;
3843 my @compoundlines = ();
3844 my %compoundhash = ();
3845 push (@compoundlines, $line);
3846 my $startrank = 1;
3847 my $endrank = 1;
3848
3849 # print "\nstart = $start, end = $end\n";
3850 # <STDIN>;
3851 for my $i ($start ... $end){
3852 if(exists $microend{$i}){
3853 # print "\nmicrosat exists: $microend{$i}[0] microsat exists\n";
3854 chomp $microend{$i}[0];
3855 my @fields = split(/\t/,$microend{$i}[0]);
3856 delete $microstart{$seqlen - $fields[$startcord] + 1};
3857 my $invertseq = $sequence;
3858 $invertseq =~ s/ //g;
3859 push(@compoundlines, invert_microsat($microend{$i}[0] , $invertseq ));
3860 delete $microend{$i};
3861
3862 }
3863
3864 if(exists $microstart{$i} ){
3865 # print "\nmicrosat exists: $microstart{$i}[0] microsat exists\n";
3866
3867 chomp $microstart{$i}[0];
3868 my @fields = split(/\t/,$microstart{$i}[0]);
3869 delete $microend{$seqlen - $fields[$endcord] + 1};
3870 my $invertseq = $sequence;
3871 $invertseq =~ s/ //g;
3872 push(@compoundlines, invert_microsat($microstart{$i}[0], $invertseq) );
3873 delete $microstart{$i};
3874 }
3875 }
3876
3877 if (scalar (@compoundlines) == 1){
3878 push(@mergedSet,join("\t",@compoundlines) );
3879 }
3880 else {
3881 # print "FROM POPOUTER\n" if $printer == 1;
3882 push(@mergedSet, merge_microsats(@compoundlines, $sequence) );
3883 }
3884 }
3885
3886 foreach my $key (sort keys %microstart) {
3887 push(@nonmerged,$microstart{$key}[0]);
3888 }
3889
3890 return (join("_X0X_",@nonmerged). "=".join("_X0X_",@mergedSet) );
3891 }
3892
3893
3894
3895 sub invert_justCoordinates{
3896 my $microsat = $_[0];
3897 # print "IN invert_justCoordinates ... @_\n" ; <STDIN>;
3898 chomp $microsat;
3899 my $seqLength = $_[1];
3900 my @fields = split(/\t/,$microsat);
3901 my $start = $seqLength - $fields[$endcord] + 1;
3902 my $end = $seqLength - $fields[$startcord] + 1;
3903 $fields[$startcord] = $start;
3904 $fields[$endcord] = $end;
3905 $fields[$microsatcord] = reverse_micro($fields[$microsatcord]);
3906 # print "RETURNIG: ", join("\t",@fields), "\n" if $printer == 1;
3907 return join("\t",@fields);
3908 }
3909
3910 sub largest_number{
3911 my $counter = 0;
3912 my($max) = shift(@_);
3913 foreach my $temp (@_) {
3914 #print "finding largest array: $maxcounter \n";
3915 if($temp > $max){
3916 $max = $temp;
3917 }
3918 }
3919 return($max);
3920 }
3921 sub smallest_number{
3922 my $counter = 0;
3923 my($min) = shift(@_);
3924 foreach my $temp (@_) {
3925 #print "finding largest array: $maxcounter \n";
3926 if($temp < $min){
3927 $min = $temp;
3928 }
3929 }
3930 return($min);
3931 }
3932
3933
3934 sub filterDuplicates_merged{
3935 my @merged = @_;
3936 my %revmerged = ();
3937 my @fmerged = ();
3938 foreach my $micro (@merged) {
3939 my @fields = split(/\t/,$micro);
3940 if ($fields[3] =~ /chr[A-Z0-9a-z]+r/){
3941 my $key = join("_K0K_",$fields[1], $fields[$startcord], $fields[$endcord]);
3942 # print "adding ... \n$key\n$micro\n";
3943 push(@{$revmerged{$key}}, $micro);
3944 }
3945 else{
3946 # print "pushing.. $micro\n";
3947 push(@fmerged, $micro);
3948 }
3949 }
3950 # print "\n";
3951 foreach my $micro (@fmerged) {
3952 my @fields = split(/\t/,$micro);
3953 my $key = join("_K0K_",$fields[1], $fields[$startcord], $fields[$endcord]);
3954 # print "searching for key $key\n";
3955 if (exists $revmerged{$key}){
3956 # print "deleting $revmerged{$key}[0]\n";
3957 delete $revmerged{$key};
3958 }
3959 }
3960 foreach my $key (sort keys %revmerged) {
3961 push(@fmerged,$revmerged{$key}[0]);
3962 }
3963 # print "returning ", join("\n", @fmerged),"\n" ;
3964 return join("__", @fmerged);
3965 }
3966
3967 sub invert_microsat{
3968 my $micro = $_[0];
3969 chomp $micro;
3970 if ($micro =~ /chr[A-Z0-9a-z]+r/) { $micro =~ s/chr([0-9a-b]+)r/chr$1/g ;}
3971 else { $micro =~ s/chr([0-9a-b]+)/chr$1r/g ; }
3972 my $sequence = $_[1];
3973 $sequence =~ s/ //g;
3974 my $seqlen = length($sequence);
3975 my @fields = split(/\t/,$micro);
3976 my $start = $seqlen - $fields[$endcord] +1;
3977 my $end = $seqlen - $fields[$startcord] +1;
3978 $fields[$startcord] = $start;
3979 $fields[$endcord] = $end;
3980 $fields[$motifcord] = reverse_micro($fields[$motifcord]);
3981 $fields[$microsatcord] = reverse_micro($fields[$microsatcord]);
3982 if ($fields[$typecord] ne "compound" && exists $fields[$no_of_interruptionscord] ){
3983 my @intertypes = split(/,/,$fields[$interrtypecord]);
3984 my @inters = split(/,/,$fields[$interrcord]);
3985 my @interposes = split(/,/,$fields[$interr_poscord]);
3986 $fields[$interrtypecord] = join(",",reverse(@intertypes));
3987 $fields[$no_of_interruptionscord] = scalar(@interposes);
3988 for my $i (0 ... $fields[$no_of_interruptionscord]-1){
3989 if (exists $inters[$i] && $inters[$i] =~ /[a-zA-Z]/){
3990 $inters[$i] = reverse($inters[$i]);
3991 $interposes[$i] = $interposes[$i] + length($inters[$i]) - 1;
3992 }
3993 else{
3994 $inters[$i] = "";
3995 $interposes[$i] = $interposes[$i] - 1;
3996 }
3997 $interposes[$i] = ($end - $start + 1) - $interposes[$i] + 1;
3998 }
3999
4000 $fields[$interrcord] = join(",",reverse(@inters));
4001 $fields[$interr_poscord] = join(",",reverse(@interposes));
4002 }
4003
4004 my $finalmicrosat = join("\t", @fields);
4005 return $finalmicrosat;
4006
4007 }
4008 sub reverse_micro{
4009 my $micro = reverse($_[0]);
4010 my @strand = split(/\s*/,$micro);
4011 for my $i (0 ... $#strand){
4012 if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;}
4013 if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;}
4014 }
4015 return join("",@strand);
4016 }
4017
4018 #xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx
4019
4020
4021 #xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx
4022
4023 sub forward_reverse_sputoutput_comparer {
4024 # print "IN forward_reverse_sputoutput_comparer: @_\n";
4025 my $input0 = $_[0]; ###### the *nogap_unrand_match file
4026 my $input1 = $_[1]; ###### the real file, *sput* data
4027 my $input2 = $_[2]; ###### the reverse file, *sput* data
4028 my $output1 = $_[3]; ###### microsats different in real file
4029 my $output2 = $_[4]; ###### microsats missing in real file
4030 my $output3 = $_[5]; ###### microsats common among real and reverse file
4031 my $no_of_species = $_[6];
4032
4033 $infocord = 2 + (4*$no_of_species) - 1;
4034 $typecord = 2 + (4*$no_of_species) + 1 - 1;
4035 $startcord = 2 + (4*$no_of_species) + 2 - 1;
4036 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
4037 $endcord = 2 + (4*$no_of_species) + 4 - 1;
4038 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
4039 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
4040 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
4041 $interrtypecord = $motifcord + 1;
4042 $interrcord = $motifcord + 2;
4043 $interr_poscord = $motifcord + 3;
4044 $no_of_interruptionscord = $motifcord + 4;
4045 $mergestarts = $no_of_interruptionscord+ 1;
4046 $mergeends = $no_of_interruptionscord+ 2;
4047 $mergemicros = $no_of_interruptionscord+ 3;
4048
4049
4050 open(SEQ,"<$input0") or die "Cannot open file $input0 $!";
4051 open(INF,"<$input1") or die "Cannot open file $input1 $!";
4052 open(INR,"<$input2") or die "Cannot open file $input2 $!";
4053
4054 open(DIFF,">$output1") or die "Cannot open file $output1 $!";
4055 #open(MISS,">$output2") or die "Cannot open file $output2 $!";
4056 open(SAME,">$output3") or die "Cannot open file $output3 $!";
4057
4058
4059 # print "opened files \n";
4060 my $linecounter = 0;
4061 my $fcounter = 0;
4062 my $rcounter = 0;
4063
4064 $printer = 0;
4065 #---------------------------------------------------------------------------------------------------
4066 # NOW ADDING FORWARD MICROSATELLITES TO HASH
4067 my %fmicros = ();
4068 my $microcounter=0;
4069 while (my $line = <INF>){
4070 $linecounter++;
4071 if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
4072 my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12);
4073 # print $key, "#-#-#-#-#-#-#-#\n";
4074 push (@{$fmicros{$key}},$line);
4075 $microcounter++;
4076 }
4077 else {
4078 #print $line;
4079 }
4080 }
4081 # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
4082 close INF;
4083 my @deletedlines = ();
4084 # print "done forward hash \n";
4085 $linecounter = 0;
4086 #---------------------------------------------------------------------------------------------------
4087 # NOW ADDING REVERSE MICROSATELLITES TO HASH
4088 my %rmicros = ();
4089 $microcounter=0;
4090 while (my $line = <INR>){
4091 $linecounter++;
4092 if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
4093 my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12);
4094 # print $key, "#-#-#-#-#-#-#-#\n";
4095 push (@{$rmicros{$key}},$line);
4096 $microcounter++;
4097 }
4098 else {}
4099 }
4100 # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
4101 close INR;
4102 # print "done reverse hash \n";
4103 $linecounter = 0;
4104 #---------------------------------------------------------------------------------------------------
4105 #---------------------------------------------------------------------------------------------------
4106 # NOW READING THE SEQUENCE FILE
4107 while(my $sine = <SEQ>){
4108 my %microstart=();
4109 my %microend=();
4110 my @sields = split(/\t/,$sine);
4111 my $key = ();
4112 if ($sine =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
4113 $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12);
4114 }
4115 else {
4116 next;
4117 }
4118 $printer = 0;
4119 my $sequence = $sields[$sequencepos];
4120 chomp $sequence;
4121 $sequence =~ s/ //g;
4122 my @localfs = ();
4123 my @localrs = ();
4124
4125 if (exists $fmicros{$key}){
4126 @localfs = @{$fmicros{$key}};
4127 delete $fmicros{$key};
4128 }
4129
4130 my %forwardstarts = ();
4131 my %forwardends = ();
4132
4133 foreach my $f (@localfs){
4134 my @fields = split(/\t/,$f);
4135 push (@{$forwardstarts{$fields[$startcord]}},$f);
4136 push (@{$forwardends{$fields[$endcord]}},$fields[$startcord]);
4137 }
4138
4139 if (exists $rmicros{$key}){
4140 @localrs = @{$rmicros{$key}};
4141 delete $rmicros{$key};
4142 }
4143 else{
4144 }
4145
4146 foreach my $r (@localrs){
4147 chomp $r;
4148 my @rields = split(/\t/,$r);
4149 # print "rields = @rields\n" if $printer == 1;
4150 my $reciprocalstart = length($sequence) - $rields[$endcord] + 1;
4151 my $reciprocalend = length($sequence) - $rields[$startcord] + 1;
4152 # print "reciprocal start = $reciprocalstart = ",length($sequence)," - $rields[$endcord] + 1\n" if $printer == 1;
4153 my $microsat = reverse_micro(all_caps($rields[$microsatcord]));
4154 my @localcollection=();
4155 for my $i ($reciprocalstart+1 ... $reciprocalend-1){
4156 if (exists $forwardstarts{$i}){
4157 push(@localcollection, $forwardstarts{$i}[0] );
4158 delete $forwardstarts{$i};
4159 }
4160 if (exists $forwardends{$i}){
4161 next if !exists $forwardstarts{$forwardends{$i}[0]};
4162 push(@localcollection, $forwardstarts{$forwardends{$i}[0]}[0] );
4163 }
4164 }
4165 if (exists $forwardstarts{$reciprocalstart} && exists $forwardends{$reciprocalend}) {push(@localcollection,$forwardstarts{$reciprocalstart}[0]);}
4166
4167 if (scalar(@localcollection) == 0){
4168 print SAME invert_microsat($r,($sequence) ), "\n";
4169 }
4170
4171 elsif (scalar(@localcollection) == 1){
4172 # print "f microsat = $localcollection[0]\n" if $printer == 1;
4173 my @lields = split(/\t/,$localcollection[0]);
4174 $lields[$microsatcord]=all_caps($lields[$microsatcord]);
4175 # print "comparing: $microsat and $lields[$microsatcord]\n" if $printer == 1;
4176 # print "coordinates are: $lields[$startcord]-$lields[$endcord] and $reciprocalstart-$reciprocalend\n" if $printer == 1;
4177 if ($microsat eq $lields[$microsatcord]){
4178 chomp $localcollection[0];
4179 print SAME $localcollection[0], "\n";
4180 }
4181 if ($microsat ne $lields[$microsatcord]){
4182 chomp $localcollection[0];
4183 my $newmicro = microsatChooser(join("\t",@lields), join("\t",@rields), $sequence);
4184 # print "newmicro = $newmicro\n" if $printer == 1;
4185 if ($newmicro =~ /[a-zA-Z]/){
4186 print SAME $newmicro,"\n";
4187 }
4188 else{
4189 print DIFF join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n";
4190 # print join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n" if $printer == 1;
4191 # print "@rields\n@lields\n" if $printer == 1;
4192 }
4193 }
4194 }
4195 else{
4196 # print "multiple found for $r --> ", join("\t",@localcollection),"\n" if $printer == 1;
4197 }
4198 }
4199 }
4200
4201 close(SEQ);
4202 close(INF);
4203 close(INR);
4204 close(DIFF);
4205 close(SAME);
4206
4207 }
4208 sub all_caps{
4209 my @strand = split(/\s*/,$_[0]);
4210 for my $i (0 ... $#strand){
4211 if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;}
4212 if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;}
4213 if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;}
4214 if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;}
4215 }
4216 return join("",@strand);
4217 }
4218 sub microsatChooser{
4219 my $forward = $_[0];
4220 my $reverse = $_[1];
4221 my $sequence = $_[2];
4222 my $seqLength = length($sequence);
4223 $sequence =~ s/ //g;
4224 my @fields = split(/\t/,$forward);
4225 my @rields = split(/\t/,$reverse);
4226 my $r_start = $seqLength - $rields[$endcord] + 1;
4227 my $r_end = $seqLength - $rields[$startcord] + 1;
4228
4229
4230 my $f_microsat = $fields[$microsatcord];
4231 my $r_microsat = $rields[$microsatcord];
4232
4233 if ($fields[$typecord] =~ /\./ && $rields[$typecord] =~ /\./) {
4234 return $forward if length($f_microsat) >= length($r_microsat);
4235 return invert_microsat($reverse, $sequence) if length($f_microsat) < length($r_microsat);
4236 }
4237 return $forward if all_caps($fields[$motifcord]) eq all_caps($rields[$motifcord]) && $fields[$startcord] == $rields[$startcord] && $fields[$endcord] == $rields[$endcord];
4238
4239 my $f_microsat_copy = $f_microsat;
4240 my $r_microsat_copy = $r_microsat;
4241 $f_microsat_copy =~ s/^\[|\]$//g;
4242 $r_microsat_copy =~ s/^\[|\]$//g;
4243
4244 my @f_microields = split(/\][a-zA-Z]*\[/,$f_microsat_copy);
4245 my @r_microields = split(/\][a-zA-Z]*\[/,$r_microsat_copy);
4246 my @f_intields = split(/\][a-zA-Z]*\[/,$f_microsat_copy);
4247 my @r_intields = split(/\][a-zA-Z]*\[/,$r_microsat_copy);
4248
4249 my $f_motif = $fields[$motifcord];
4250 my $r_motif = $rields[$motifcord];
4251 my $f_motif_copy = $f_motif;
4252 my $r_motif_copy = $r_motif;
4253 $f_motif_copy =~ s/^\[|\]$//g;
4254 $r_motif_copy =~ s/^\[|\]$//g;
4255
4256 my @f_motields = split(/\]\[/,$f_motif_copy);
4257 my @r_motields = split(/\]\[/,$r_motif_copy);
4258
4259 my $f_purestretch = join("",@f_microields);
4260 my $r_purestretch = join("",@r_microields);
4261
4262 if ($fields[$typecord]=~/nucleotide/ && $rields[$typecord]=~/nucleotide/){
4263 # print "now.. studying $forward\n$reverse\n" if $printer == 1;
4264 if ($fields[$typecord] eq $rields[$typecord]){
4265 # print "comparing motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1;
4266
4267 if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){
4268 my $subset_answer = isSubset($forward, $reverse, $seqLength);
4269 # print "subset answer = $subset_answer\n" if $printer == 1;
4270 return $forward if $subset_answer == 1;
4271 return invert_microsat($reverse, $sequence) if $subset_answer == 2;
4272 return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch);
4273 return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch);
4274 return $forward if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) >= length($r_purestretch);
4275 return invert_microsat($reverse, $sequence) if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) < length($r_purestretch);
4276 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence) if $subset_answer == 3 ;
4277 }
4278 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){
4279 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
4280 }
4281 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){
4282 return $forward;
4283 }
4284 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){
4285 return invert_microsat($reverse, $sequence);
4286 }
4287 }
4288 else{
4289 my $fmotlen = ();
4290 my $rmotlen = ();
4291 $fmotlen =1 if $fields[$typecord] eq "mononucleotide";
4292 $fmotlen =2 if $fields[$typecord] eq "dinucleotide";
4293 $fmotlen =3 if $fields[$typecord] eq "trinucleotide";
4294 $fmotlen =4 if $fields[$typecord] eq "tetranucleotide";
4295 $rmotlen =1 if $rields[$typecord] eq "mononucleotide";
4296 $rmotlen =2 if $rields[$typecord] eq "dinucleotide";
4297 $rmotlen =3 if $rields[$typecord] eq "trinucleotide";
4298 $rmotlen =4 if $rields[$typecord] eq "tetranucleotide";
4299
4300 if ($fmotlen < $rmotlen){
4301 if (abs($fields[$startcord] - $r_start) <= $fmotlen || abs($fields[$endcord] - $r_end) <= $fmotlen ){
4302 return $forward;
4303 }
4304 else{
4305 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
4306 }
4307 }
4308 if ($fmotlen > $rmotlen){
4309 if (abs($fields[$startcord] - $r_start) <= $rmotlen || abs($fields[$endcord] - $r_end) <= $rmotlen){
4310 return invert_microsat($reverse, $sequence);
4311 }
4312 else{
4313 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
4314 }
4315 }
4316 }
4317 }
4318 if ($fields[$typecord] eq "compound" && $rields[$typecord] eq "compound"){
4319 # print "comparing compound motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1;
4320 if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){
4321 my $subset_answer = isSubset($forward, $reverse, $seqLength);
4322 # print "subset answer = $subset_answer\n" if $printer == 1;
4323 return $forward if $subset_answer == 1;
4324 return invert_microsat($reverse, $sequence) if $subset_answer == 2;
4325 # print length($f_purestretch) ,">", length($r_purestretch)," \n" if $printer == 1;
4326 return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch);
4327 return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch);
4328 if ($subset_answer == 3){
4329 if ($fields[$startcord] < $r_start || $fields[$endcord] > $r_end){
4330 if (abs($fields[$startcord] - $r_start) < length($f_motields[0]) || abs($fields[$endcord] - $r_end) < length($f_motields[$#f_motields]) ){
4331 return $forward;
4332 }
4333 else{
4334 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
4335 }
4336 }
4337 if ($fields[$startcord] > $r_start || $fields[$endcord] < $r_end){
4338 if (abs($fields[$startcord] - $r_start) < length($r_motields[0]) || abs($fields[$endcord] - $r_end) < length($r_motields[$#r_motields]) ){
4339 return invert_microsat($reverse, $sequence);
4340 }
4341 else{
4342 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
4343 }
4344 }
4345 }
4346 }
4347 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){
4348 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
4349 }
4350 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){
4351 return $forward;
4352 }
4353 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){
4354 return invert_microsat($reverse, $sequence);
4355 }
4356
4357 }
4358
4359 if ($fields[$typecord] eq "compound" && $rields[$typecord] =~ /nucleotide/){
4360 # print "one compound, one nucleotide\n" if $printer == 1;
4361 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
4362 }
4363 if ($fields[$typecord] =~ /nucleotide/ && $rields[$typecord]eq "compound"){
4364 # print "one compound, one nucleotide\n" if $printer == 1;
4365 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
4366 }
4367 }
4368
4369 sub isSubset{
4370 my $forward = $_[0]; my @fields = split(/\t/,$forward);
4371 my $reverse = $_[1]; my @rields = split(/\t/,$reverse);
4372 my $seqLength = $_[2];
4373 my $r_start = $seqLength - $rields[$endcord] + 1;
4374 my $r_end = $seqLength - $rields[$startcord] + 1;
4375 # print "we have $fields[$startcord] -> $fields[$endcord] && $r_start -> $r_end\n" if $printer == 1;
4376 return "0" if $fields[$startcord] == $r_start && $fields[$endcord] == $r_end;
4377 return "1" if $fields[$startcord] <= $r_start && $fields[$endcord] >= $r_end;
4378 return "2" if $r_start <= $fields[$startcord] && $r_end >= $fields[$endcord];
4379 return "3";
4380 }
4381
4382 sub motifBYmotif_match{
4383 my $forward = $_[0];
4384 my $reverse = $_[1];
4385 $forward =~ s/^\[|\]$//g;
4386 $reverse =~ s/^\[|\]$//g;
4387 my @f_motields=split(/\]\[/, $forward);
4388 my @r_motields=split(/\]\[/, $reverse);
4389 my $finalresult = 0;
4390
4391 if (scalar(@f_motields) != scalar(@r_motields)){
4392 my $subresult = 0;
4393 my @mega = (); my @sub = ();
4394 @mega = @f_motields if scalar(@f_motields) > scalar(@r_motields);
4395 @sub = @f_motields if scalar(@f_motields) > scalar(@r_motields);
4396 @mega = @r_motields if scalar(@f_motields) < scalar(@r_motields);
4397 @sub = @r_motields if scalar(@f_motields) < scalar(@r_motields);
4398
4399 for my $i (0 ... $#sub){
4400 my $probe = $sub[$i].$sub[$i];
4401 # print "probing $probe and $mega[$i]\n" if $printer == 1;
4402 if ($probe =~ /$mega[$i]/) {$subresult = 1; }
4403 else {$subresult = 0; last; }
4404 }
4405
4406 return 0 if $subresult == 0;
4407 return 2 if $subresult == 1 && scalar(@f_motields) > scalar(@r_motields); # r is subset of f
4408 return 3 if $subresult == 1 && scalar(@f_motields) < scalar(@r_motields); # ^reverse
4409
4410 }
4411 else{
4412 for my $i (0 ... $#f_motields){
4413 my $probe = $f_motields[$i].$f_motields[$i];
4414 if ($probe =~ /$r_motields[$i]/) {$finalresult = 1 ;}
4415 else {$finalresult = 0 ;last;}
4416 }
4417 }
4418 # print "finalresult = $finalresult\n" if $printer == 1;
4419 return $finalresult;
4420 }
4421
4422 sub merge_microsats{
4423 my @input = @_;
4424 my $sequence = pop(@input);
4425 $sequence =~ s/ //g;
4426 my @seq_string = @input;
4427 # print "IN: merge_microsats\n";
4428 # print "recieved for merging: ", join("\n", @seq_string), "\nsequence = $sequence\n";
4429 my $start;
4430 my $end;
4431 my @micros = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @seq_string;
4432 # print "\nrearranged into @micros \n";
4433 my (@motifs, @microsats, @interruptiontypes, @interruptions, @interrposes, @no_of_interruptions, @types, @starts, @ends, @mergestart, @mergeend, @mergemicro) = ();
4434 my @fields = ();
4435 for my $i (0 ... $#micros){
4436 chomp $micros[$i];
4437 @fields = split(/\t/,$micros[$i]);
4438 push(@types, $fields[$typecord]);
4439 push(@motifs, $fields[$motifcord]);
4440
4441 if (exists $fields[$interrtypecord]){ push(@interruptiontypes, $fields[$interrtypecord]);}
4442 else { push(@interruptiontypes, "NA"); }
4443 if (exists $fields[$interrcord]) {push(@interruptions, $fields[$interrcord]);}
4444 else { push(@interruptions, "NA"); }
4445 if (exists $fields[$interr_poscord]) { push(@interrposes, $fields[$interr_poscord]);}
4446 else { push(@interrposes, "NA"); }
4447 if (exists $fields[$no_of_interruptionscord]) {push(@no_of_interruptions, $fields[$no_of_interruptionscord]);}
4448 else { push(@no_of_interruptions, "NA"); }
4449 if(exists $fields[$mergestarts]) { @mergestart = (@mergestart, split(/\./,$fields[$mergestarts]));}
4450 else { push(@mergestart, $fields[$startcord]); }
4451 if(exists $fields[$mergeends]) { @mergeend = (@mergeend, split(/\./,$fields[$mergeends]));}
4452 else { push(@mergeend, $fields[$endcord]); }
4453 if(exists $fields[$mergemicros]) { push(@mergemicro, $fields[$mergemicros]);}
4454 else { push(@mergemicro, $fields[$microsatcord]); }
4455
4456
4457 }
4458 $start = smallest_number(@mergestart);
4459 $end = largest_number(@mergeend);
4460 my $microsat_entry = "[".substr( $sequence, $start-1, ($end - $start + 1) )."]";
4461 my $microsat = join("\t", @fields[0 ... $infocord], join(".", @types), $start, $fields[$strandcord], $end, $microsat_entry , join(".", @motifs), join(".", @interruptiontypes),join(".", @interruptions),join(".", @interrposes),join(".", @no_of_interruptions), join(".", @mergestart), join(".", @mergeend) , join(".", @mergemicro));
4462 return $microsat;
4463 }
4464
4465 sub slided_microsat{
4466 my $forward = $_[0]; my @fields = split(/\t/,$forward);
4467 my $reverse = $_[1]; my @rields = split(/\t/,$reverse);
4468 my $seqLength = $_[2];
4469 my $r_start = $seqLength - $rields[$endcord] + 1;
4470 my $r_end = $seqLength - $rields[$startcord] + 1;
4471 my $motlen =();
4472 $motlen =1 if $fields[$typecord] eq "mononucleotide";
4473 $motlen =2 if $fields[$typecord] eq "dinucleotide";
4474 $motlen =3 if $fields[$typecord] eq "trinucleotide";
4475 $motlen =4 if $fields[$typecord] eq "tetranucleotide";
4476
4477 if (abs($fields[$startcord] - $r_start) < $motlen || abs($fields[$endcord] - $r_end) < $motlen ) {
4478 return 0;
4479 }
4480 else{
4481 return 1;
4482 }
4483
4484 }
4485
4486 #xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx
4487
4488
4489
4490 #xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx
4491 sub new_multispecies_t10{
4492 my $input1 = $_[0]; #gap_op_unrand_match
4493 my $input2 = $_[1]; #sput
4494 my $output = $_[2]; #output
4495 my $bin = $output."_bin";
4496 my $orgs = join("|",split(/\./,$_[3]));
4497 my @organisms = split(/\./,$_[3]);
4498 my $no_of_species = scalar(@organisms); #3
4499 my $t10info = $output."_info";
4500 $prinkter = 0;
4501
4502 open (MATCH, "<$input1");
4503 open (SPUT, "<$input2");
4504 open (OUT, ">$output");
4505 open (INFO, ">$t10info");
4506
4507
4508 sub microsat_bracketer;
4509 sub custom;
4510 my %seen = ();
4511 $infocord = 2 + (4*$no_of_species) - 1;
4512 $typecord = 2 + (4*$no_of_species) + 1 - 1;
4513 $startcord = 2 + (4*$no_of_species) + 2 - 1;
4514 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
4515 $endcord = 2 + (4*$no_of_species) + 4 - 1;
4516 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
4517 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
4518 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
4519 #---------------------------------------------------------------------------------------------------------------#
4520 # MAKING A HASH FROM SPUT, WITH HASH KEYS GENERATED BELOW AND SEQUENCES STORED AS VALUES #
4521 #---------------------------------------------------------------------------------------------------------------#
4522 my $linecounter = 0;
4523 my $microcounter = 0;
4524 while (my $line = <SPUT>){
4525 chomp $line;
4526 # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n";
4527 next if $line !~ /[0-9a-z]+/;
4528 $linecounter++;
4529 # my $key = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13);
4530 # print $key, "#-#-#-#-#-#-#-#\n";
4531 if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
4532 my $key = join("\t",$1, $2, $3, $4, $5);
4533 # print "key = $key\n" if $prinkter == 1;
4534 push (@{$seen{$key}},$line);
4535 $microcounter++;
4536 }
4537 else {
4538 #print "could not make ker in SPUT : \n$line \n";
4539 }
4540 }
4541 # print "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n";
4542 # print INFO "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n";
4543 close SPUT;
4544
4545 #----------------------------------------------------------------------------------------------------------------
4546
4547 #-------------------------------------------------------------------------------------------------------#
4548 # THE ENTIRE CODE BELOW IS DEVOTED TO GENERATING HASH KEYS FROM MATCH FOLLOWED BY #
4549 # USING THESE HASH KEYS TO CORRESPOND EACH SEQUENCE IN FIRST FILE TO ITS MICROSAT REPEATS IN #
4550 # SECOND FILE FOLLOWED BY #
4551 # FINDING THE EXACT LOCATION OF EACH MICROSAT REPEAT WITHIN EACH SEQUENCE USING THE 'index' FUNCTION #
4552 #-------------------------------------------------------------------------------------------------------#
4553 my $ref = 0;
4554 my $ref2 = 0;
4555 my $ref3 = 0;
4556 my $ref4 = 0;
4557 my $deletes= 0;
4558 my $duplicates = 0;
4559 my $neighbors = 0;
4560 my $tooshort = 0;
4561 my $prevmicrol=();
4562 my $startnotfound = 0;
4563 my $matchkeysformed = 0;
4564 my $keysused = 0;
4565
4566 while (my $line = <MATCH>) {
4567 # print colored ['magenta'], $line if $prinkter == 1;
4568 next if $line !~ /[a-zA-Z0-9]/;
4569 chomp $line;
4570 my @fields2 = split(/\t/,$line);
4571 my $key2 = ();
4572 # $key2 = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13);
4573 if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
4574 $matchkeysformed++;
4575 $key2 = join("\t",$1, $2, $3, $4, $5);
4576 # print "key = $key2 \n" if $prinkter == 1;
4577 }
4578 else{
4579 # print "could not make ker in SEQ : $line\n";
4580 next;
4581 }
4582 my $sequence = $fields2[$sequencepos];
4583 $sequence =~ s/\*/-/g;
4584 my $count = 0;
4585 if (exists $seen{$key2}){
4586 $keysused++;
4587 my @unsorted_raw = @{$seen{$key2}};
4588 delete $seen{$key2};
4589 my @sequencearr = split(/\s*/, $sequence);
4590
4591 # print "sequencearr = @sequencearr\n" if $prinkter == 1;
4592
4593 my $counter;
4594
4595 my %start_database = ();
4596 my %end_database = ();
4597 foreach my $uns (@unsorted_raw){
4598 my @uields = split(/\t/,$uns);
4599 $start_database{$uields[$startcord]} = $uns;
4600 $end_database{$uields[$endcord]} = $uns;
4601 }
4602
4603 my @unsorted = ();
4604 my %starts = (); my %ends = ();
4605 # print colored ['yellow'],"unsorted, unfiltered microats = \n" if $prinkter == 1; foreach (@unsorted_raw) {print colored ['blue'],$_,"\n" if $prinkter == 1;}
4606 for my $u (0 ... $#unsorted_raw){
4607 my @tields = split(/\t/,$unsorted_raw[$u]);
4608 next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]};
4609 push(@unsorted, $unsorted_raw[$u]);
4610 $starts{$tields[$startcord]} = $unsorted_raw[$u];
4611 # print "in starts : $tields[$startcord] -> $unsorted_raw[$u]\n" if $prinkter == 1;
4612 }
4613
4614 my $basecounter= 0;
4615 my $gapcounter = 0;
4616 my $poscounter = 0;
4617
4618 for my $s (@sequencearr){
4619
4620 $poscounter++;
4621 if ($s eq "-"){
4622 $gapcounter++; next;
4623 }
4624 else{
4625 $basecounter++;
4626 }
4627
4628
4629 #print "s = $s, poscounter = $poscounter, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1;
4630 #print "s = $s, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1;
4631 #print "s = $s, gapcpunter = $gapcounter\n" if $prinkter == 1;
4632
4633 if (exists $starts{$basecounter}){
4634 my $locus = $starts{$basecounter};
4635 # print "locus identified = $locus\n" if $prinkter == 1;
4636 my @fields3 = split(/\t/,$locus);
4637 my $start = $fields3[$startcord];
4638 my $end = $fields3[$endcord];
4639 my $motif = $fields3[$motifcord];
4640 my $microsat = $fields3[$microsatcord];
4641 my @leftbracketpos = ();
4642 my @rightbracketpos = ();
4643 my $bracket_picker = 'no';
4644 my $leftbrackets=();
4645 my $rightbrackets = ();
4646 my $micro_cpy = $microsat;
4647 # print "microsat = $microsat\n" if $prinkter == 1;
4648 while($microsat =~ m/\[/g) {push(@leftbracketpos, (pos($microsat))); $leftbrackets = join("__",@leftbracketpos);$bracket_picker='yes';}
4649 while($microsat =~ m/\]/g) {push(@rightbracketpos, (pos($microsat))); $rightbrackets = join("__",@rightbracketpos);}
4650 $microsat =~ s/[\[\]\-\*]//g;
4651 # print "microsat = $microsat\n" if $prinkter == 1;
4652 my $human_search = join '-*', split //, $microsat;
4653 my $temp = substr($sequence, $poscounter-1);
4654 # print "with poscounter = $poscounter\n" if $prinkter == 1;
4655 my $search_result = ();
4656 my $posnow = ();
4657
4658 # print "for $line, temp $temp or human_search $human_search not defined\n" if !defined $temp || !defined $human_search;
4659 # <STDIN> if !defined $temp || !defined $human_search;
4660
4661 while ($temp =~ /($human_search)/gi){
4662 $search_result = $1;
4663 # $posnow = pos($temp);
4664 last;
4665 }
4666
4667 my @gapspos = ();
4668 next if !defined $search_result;
4669
4670 while($search_result =~ m/-/g) {push(@gapspos, (pos($search_result))); }
4671 my $gaps = join("__",@gapspos);
4672
4673 my $final_microsat = $search_result;
4674 if ($bracket_picker eq "yes"){
4675 $final_microsat = microsat_bracketer($search_result, $gaps,$leftbrackets,$rightbrackets);
4676 }
4677
4678 my $outsentence = join("\t",join ("\t",@fields3[0 ... $infocord]),$fields3[$typecord],$fields3[$motifcord],$gapcounter,$poscounter,$fields3[$strandcord],$poscounter + length($search_result) -1 ,$final_microsat);
4679
4680 if ($bracket_picker eq "yes") {
4681 $outsentence = $outsentence."\t".join("\t",@fields3[($motifcord+1) ... $#fields3]);
4682 }
4683 print OUT $outsentence,"\n";
4684 }
4685 }
4686 }
4687 }
4688 my $unusedkeys = scalar(keys %seen);
4689 # print INFO "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n";
4690 # print INFO "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n";
4691 # print INFO "microsatellites too short = $tooshort\n";
4692 # print INFO "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n";
4693
4694 #print "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n";
4695 #print "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n";
4696 #print "microsatellites too short = $tooshort\n";
4697 #print "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n";
4698 #print "unused keys = \n",join("\n", (keys %seen)),"\n";
4699 close (MATCH);
4700 close (SPUT);
4701 close (OUT);
4702 close (INFO);
4703 }
4704
4705 sub microsat_bracketer{
4706 # print "in bracketer: @_\n";
4707 my ($microsat, $gapspos, $leftbracketpos, $rightbracketpos) = @_;
4708 my @gaps = split(/__/,$gapspos);
4709 my @lefts = split(/__/,$leftbracketpos);
4710 my @rights = split(/__/,$rightbracketpos);
4711 my @new=();
4712 my $pure = $microsat;
4713 $pure =~ s/-//g;
4714 my $off = 0;
4715 my $finallength = length($microsat) + scalar(@lefts)+scalar(@rights);
4716 push(@gaps, 0);
4717 push(@lefts,0);
4718 push(@rights,0);
4719
4720 for my $i (1 ... $finallength){
4721 # print "1 current i = >$i<>, right = >$rights[0]< gap = $gaps[0] left = >$lefts[0]< and $rights[0] == $i\n";
4722 if($rights[0] == $i){
4723 # print "pushed a ]\n";
4724 push(@new, "]");
4725 shift(@rights);
4726 push(@rights,0);
4727 for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;}
4728 next;
4729 }
4730 if($gaps[0] == $i){
4731 # print "pushed a -\n";
4732 push(@new, "-");
4733 shift(@gaps);
4734 push(@gaps, 0);
4735 for my $j (0 ... scalar(@rights)-1) {$rights[$j]++;}
4736 for my $j (0 ... scalar(@lefts)-1) {$lefts[$j]++;}
4737
4738 next;
4739 }
4740 if($lefts[0] == $i){
4741 # print "pushed a [\n";
4742 push(@new, "[");
4743 shift(@lefts);
4744 push(@lefts,0);
4745 for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;}
4746 next;
4747 }
4748 else{
4749 my $pushed = substr($pure,$off,1);
4750 $off++;
4751 push(@new,$pushed );
4752 # print "pushed an alphabet, now new = @new, pushed = $pushed\n";
4753 next;
4754 }
4755 }
4756 my $returnmicrosat = join("",@new);
4757 # print "final microsat = $returnmicrosat \n";
4758 return($returnmicrosat);
4759 }
4760
4761 #xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx
4762
4763
4764 #xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx
4765 sub multiSpecies_orthFinder4{
4766 #print "IN multiSpecies_orthFinder4: @_\n";
4767 my @handles = ();
4768 #1 SEPT 30TH 2008
4769 #2 THIS CODE (multiSpecies_orthFinder4.pl) IS BEING MADE SO THAT IN THE REMOVAL OF MICROSATELLITES THAT ARE CLOSER TO EACH OTHER
4770 #3 THAN 50 BP (HE 50BP RADIUS OF EXCLUSION), WE ARE LOOKING ACCROSS ALIGNMENT BLOCKS.. AND NOT JUST LOOKING WITHIN THE ALIGNMENT BLOCKS. THIS WILL
4771 #4 POTENTIALLY REMOVE EVEN MORE MICROSATELLITES THAN BEFORE, BUT THIS WILL RESCUE THOSE MICROSATELLITES THAT WERE LOST
4772 #5 DUE TO OUR PREVIOUS REQUIREMENT FROM VERSION 3, THAT MICROSATELLITES THAT ARE CLOSER TO THE BOUNDARY THAN 25 BP NEED TO BE REMOVED
4773 #6 SUCH A REQUIREMENT WAS A CRUDE WAY TO IMPOSE THE ABOVE 50 BP RADIUS OF EXCLUSION ACCROSS THE ALIGNMENT BLOCKS WITHOUT ACTUALLY
4774 #7 CHECKING COORDINATES OF THE EXCLUDED MICROSATELLITES.
4775 #8 IN ORDER TO TAKE CARE OF THE CASES WHERE MICROSATELLITES ARE PRELIOUSLY CLOSE TO ENDS OF THE ALIGNMENT BLOCKS, WE IMPOSE HERE
4776 #9 A NEW REQUIREMENT THAT FOR A MICROSATELLITE TO BE CONSIDERED, ALL THE SPECIES NEED TO HAVE AT LEAST 10 BP OF NON-MICROSATELLITE SEQUENCE
4777 #10 ON EITHER SIDE OF IT.. GAPLESS. THIS INFORMATION IS STORED IN THE VARIABLE: $FLANK_SUPPORT. THIS PART, INSTEAD OF BEING INCLUDED IN
4778 #11 THIS CODE, WILL BE INCLUDED IN A NEW CODE THAT WE WILL BE WRITING AS PART OF THE PIPELINE: multiSpecies_microsatSetSelector.pl
4779
4780 #1 trial run:
4781 #2 perl ../../../codes/multiSpecies_orthFinder4.pl /gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr22.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt H.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:C.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:O.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:R.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:M.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2 orth22 hg18:panTro2:ponAbe2:rheMac2:calJac1 50
4782
4783 $prinkter=0;
4784
4785 #############
4786 my $CLUSTER_DIST = $_[4];
4787 #############
4788
4789
4790 my $aligns = $_[0];
4791 my @micros = split(/:/, $_[1]);
4792 my $orth = $_[2];
4793 #my $not_orth = "notorth";
4794 @tags = split(/:/, $_[3]);
4795
4796 $no_of_species=scalar(@tags);
4797 my $junkfile = $orth."_junk";
4798 #open(JUNK,">$junkfile");
4799
4800 #my $info = $output1."_info";
4801 #print "inputs are : \n"; foreach(@micros){print $_,"\n";}
4802 #print "info = @_\n";
4803
4804
4805 open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!";
4806 open (ORTH, ">$orth");
4807 my $output=$orth."_out";
4808 open (OUTP, ">$output");
4809
4810
4811 #open (NORTH, ">$not_orth");
4812 #open (INF, ">$info");
4813 my $i = 0;
4814 foreach my $path (@micros){
4815 $handles[$i] = IO::Handle->new();
4816 open ($handles[$i], "<$path") or die "Can't open microsat file $path : $!";
4817 $i++;
4818 }
4819
4820 #print "Opened files\n";
4821
4822
4823 $infocord = 2 + (4*$no_of_species) - 1;
4824 $typecord = 2 + (4*$no_of_species) + 1 - 1;
4825 $motifcord = $typecord + 1;
4826 $gapcord = $motifcord+1;
4827 $startcord = $gapcord + 1;
4828 $strandcord = $startcord + 1;
4829 $endcord = $strandcord + 1;
4830 $microsatcord = $endcord + 1;
4831 $sequencepos = 2 + (4*$no_of_species) + 1 -1 ;
4832 #$sequencepos = 17;
4833 # GENERATING HASHES CONTAINING CHIMP AND HUMAN DATA FROM ABOVE FILES
4834 #----------------------------------------------------------------------------------------------------------------
4835 my @hasharr = ();
4836 foreach my $path (@micros){
4837 open(READ, "<$path") or die "Cannot open file $path :$!";
4838 my %single_hash = ();
4839 my $key = ();
4840 my $counter = 0;
4841 while (my $line = <READ>){
4842 $counter++;
4843 # print $line;
4844 chomp $line;
4845 my @fields1 = split(/\t/,$line);
4846 if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
4847 $key = join("\t",$1, $2, $4, $5);
4848
4849 # print "key = : $key\n" if $prinkter == 1;
4850
4851 # print $line if $prinkter == 1;
4852 push (@{$single_hash{$key}},$line);
4853 }
4854 else{
4855 # print "microsat line incompatible\n";
4856 }
4857 }
4858 push @hasharr, {%single_hash};
4859 # print "@{$single_hash{$key}} \n";
4860 # print "done $path: counter = $counter\n" if $prinkter == 1;
4861 close READ;
4862 }
4863 # print "Done hashes\n";
4864 #----------------------------------------------------------------------------------------------------------------
4865 my $question=();
4866 #----------------------------------------------------------------------------------------------------------------
4867 my @contigstarts = ();
4868 my @contigends = ();
4869
4870 my %contigclusters = ();
4871 my %contigclustersFirstStartOnly=();
4872 my %contigclustersLastEndOnly=();
4873 my %contigclustersLastEndLengthOnly=();
4874 my %contigclustersFirstStartLengthOnly=();
4875 my %contigpath=();
4876 my $dotcounter = 0;
4877 while (my $line = <BO>){
4878 # print "x" x 60, "\n" if $prinkter == 1;
4879 $dotcounter++;
4880 # print "." if $dotcounter % 100 ==0;
4881 # print "\n" if $dotcounter % 5000 ==0;
4882 next if $line !~ /^[0-9]+/;
4883 # print $line if $prinkter == 1;
4884 chomp $line;
4885 my @fields2 = split(/\t/,$line);
4886 my $key2 = ();
4887 if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
4888 $key2 = join("\t",$1, $2, $4, $5);
4889 }
4890 else {
4891 # print "seq line $line incompatible\n" if $prinkter == 1;
4892 next;}
4893
4894
4895
4896
4897
4898
4899 my @sequences = ();
4900 for (0 ... $#tags){
4901 my $seq = <BO>;
4902 # print $seq;
4903 chomp $seq;
4904 push(@sequences , " ".$seq);
4905 }
4906 my @origsequences = @sequences;
4907 my $seqcopy = $sequences[0];
4908 my @strings = ();
4909 $seqcopy =~ s/[a-zA-Z]|-/x/g;
4910 my @string = split(/\s*/,$seqcopy);
4911
4912 for my $s (0 ... $#tags){
4913 $sequences[$s] =~ s/-//g;
4914 $sequences[$s] =~ s/[a-zA-Z]/x/g;
4915 # print "length of sequence = ",length($sequences[$s]),"\n";
4916 my @tempstring = split(/\s*/,$sequences[$s]);
4917 push(@strings, [@tempstring])
4918
4919 }
4920
4921 my @species_list = ();
4922 my @micro_count = 0;
4923 my @starthash = ();
4924 my $stopper = 1;
4925 my @endhash = ();
4926
4927 my @currentcontigstarts=();
4928 my @currentcontigends=();
4929 my @currentcontigchrs=();
4930
4931 for my $i (0 ... $#tags){
4932 # print "searching for : if exists hasharr: $i : $tags[$i] : $key2 \n" if $prinkter == 1;
4933 my @temparr = ();
4934
4935 if (exists $hasharr[$i]{$key2}){
4936 @temparr = @{$hasharr[$i]{$key2}};
4937
4938 $line =~ /$tags[$i]\s([a-zA-Z0-9_]+)\s([0-9]+)\s([0-9]+)/;
4939 ## print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9_])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1;
4940 # print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1;
4941 my $startkey = $1."_SK0SK_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1;
4942 my $endkey = $1."_EK0EK_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1;
4943 $contigstarts[$i]{$startkey}= $key2;
4944 $contigends[$i]{$endkey}= $key2;
4945 # print "confirming existance: \n" if $prinkter == 1;
4946 # print "present \n" if exists $contigends[$i]{$endkey} && $prinkter == 1;
4947 # print "absent \n" if !exists $contigends[$i]{$endkey} && $prinkter == 1;
4948 $currentcontigchrs[$i]=$1;
4949 $currentcontigstarts[$i]=$2;
4950 $currentcontigends[$i]=$3;
4951
4952 } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"}
4953 else {
4954 push (@starthash, {0 => "0"});
4955 push (@endhash, {0 => "0"});
4956 $currentcontigchrs[$i] = 0;
4957 next;
4958 }
4959 $stopper = 0;
4960 # print "exists: @temparr\n" if $prinkter == 1;
4961 push(@micro_count, scalar(@temparr));
4962 push(@species_list, [@temparr]);
4963 my @tempstart = (); my @tempend = ();
4964 my %localends = ();
4965 my %localhash = ();
4966 # print "---------------------------\n";
4967
4968 foreach my $templine (@temparr){
4969 # print "templine = $templine\n" if $prinkter == 1;
4970 my @tields = split(/\t/,$templine);
4971 my $start = $tields[$startcord]; # - $tields[$gapcord];
4972 my $end = $tields[$endcord]; #- $tields[$gapcord];
4973 my $realstart = $tields[$startcord]- $tields[$gapcord];
4974 my $gapsinmicrosat = ($tields[$microsatcord] =~ s/-/-/g);
4975 $gapsinmicrosat = 0 if $gapsinmicrosat !~ /[0-9]+/;
4976 # print "infocord = $infocord typecord = $typecord motifcord = $motifcord gapcord = $gapcord startcord = $startcord strandcord = $strandcord endcord = $endcord microsatcord = $microsatcord sequencepos = $sequencepos\n";
4977 my $realend = $tields[$endcord]- $tields[$gapcord]- $gapsinmicrosat;
4978 # print "real start = $realstart, realend = $realend \n";
4979 for my $pos ($realstart ... $realend){ $strings[$i][$pos] = $strings[$i][$pos].",".$i.":".$start."-".$end;}
4980 push(@tempstart, $start);
4981 push(@tempend, $end);
4982 $localhash{$start."-".$end} = $templine;
4983 }
4984 push @starthash, {%localhash};
4985 my $foundclusters =findClusters(join("!",@{$strings[$i]}), $CLUSTER_DIST);
4986
4987 # print "foundclusters = $foundclusters\n";
4988
4989 my @clusters = split(/_/,$foundclusters);
4990
4991 my $clustno = 0;
4992
4993 foreach my $cluster (@clusters) {
4994 my @constituenst = split(/,/,$cluster);
4995 # print "clusters returned: @constituenst\n" if $prinkter == 1;
4996 }
4997
4998 @string = split("_S0S_",stringPainter(join("_C0C_",@string),$foundclusters));
4999
5000
5001 }
5002 next if $stopper == 1;
5003
5004 # print colored ['blue'],"FINAL:\n" if $prinkter == 1;
5005 my $finalclusters =findClusters(join("!",@string), 1);
5006 # print "finalclusters = $finalclusters\n";
5007 # print colored ['blue'],"----------------------\n" if $prinkter == 1;
5008 my @clusters = split(/,/,$finalclusters);
5009 # print "@string\n" if $prinkter == 1;
5010 # print "@clusters\n" if $prinkter == 1;
5011 # print "------------------------------------------------------------------\n" if $prinkter == 1;
5012
5013 my $clustno = 0;
5014
5015 # foreach my $cluster (@clusters) {
5016 # my @constituenst = split(/,/,$cluster);
5017 # print "clusters returned: @constituenst\n";
5018 # }
5019
5020 next if (scalar @clusters == 0);
5021
5022 my @contigcluster=();
5023 my $clusterno=0;
5024 my @contigClusterstarts=();
5025 my @contigClusterends = ();
5026
5027 foreach my $clust (@clusters){
5028 # print "cluster: $clust\n";
5029 $clusterno++;
5030 my @localclust = split(/\./, $clust);
5031 my @result = ();
5032 my @starts = ();
5033 my @ends = ();
5034
5035 for my $i (0 ... $#localclust){
5036 # print "localclust[$i]: $localclust[$i]\n";
5037 my @pattern = split(/:/, $localclust[$i]);
5038 my @cords = split(/-/, $pattern[1]);
5039 push (@starts, $cords[0]);
5040 push (@ends, $cords[1]);
5041 }
5042
5043 my $extremestart = smallest_number(@starts);
5044 my $extremeend = largest_number(@ends);
5045 push(@contigClusterstarts, $extremestart);
5046 push(@contigClusterends, $extremeend);
5047 # print "cluster starts from $extremestart and ends at $extremeend \n" if $prinkter == 1 ;
5048
5049 foreach my $clustparts (@localclust){
5050 my @pattern = split(/:/, $clustparts);
5051 # print "printing from pattern: $pattern[1]: $starthash[$pattern[0]]{$pattern[1]}\n";
5052 push (@result, $starthash[$pattern[0]]{$pattern[1]});
5053 }
5054 push(@contigcluster, join("\t", @result));
5055 # print join("\t", @result),"<-result \n" if $prinkter == 1 ;
5056 }
5057
5058
5059 my $firstclusterstart = smallest_number(@contigClusterstarts);
5060 my $lastclusterend = largest_number(@contigClusterends);
5061
5062
5063 $contigclustersFirstStartOnly{$key2}=$firstclusterstart;
5064 $contigclustersLastEndOnly{$key2} = $lastclusterend;
5065 $contigclusters{$key2}=[ @contigcluster ];
5066 # print "currentcontigchr are @currentcontigchrs , firstclusterstart = $firstclusterstart, lastclusterend = $lastclusterend\n " if $prinkter == 1;
5067 for my $i (0 ... $#tags){
5068 #1 check if there exists adjacent alignment block wrt coordinates of this species.
5069 next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block..
5070 #2 no need to worry about proximity of anything in adjacent block!
5071
5072 #1 BELOW, the following is really to calclate the distance between the end coordinate of the
5073 #2 cluster and the end of the gap-free sequence of each species. this is so that if an
5074 #3 adjacent alignment block is found lateron, the exact distance between the potentially
5075 #4 adjacent microsat clusters can be found here. the exact start coordinate will be used
5076 #5 immediately below.
5077 # print "full sequence = $origsequences[$i] and its length = ",length($origsequences[$i])," \n" if $prinkter == 1;
5078
5079 my $species_startsubstring = substr($origsequences[$i], 0, $firstclusterstart);
5080 my $species_endsubstring = ();
5081
5082 if (length ($origsequences[$i]) <= $lastclusterend+1){ $species_endsubstring = "";}
5083 else{ $species_endsubstring = substr($origsequences[$i], $lastclusterend+1);}
5084
5085 # print "\nnot defined species_endsubstring...\n" if !defined $species_endsubstring && $prinkter == 1;
5086 # print "for species: $tags[$i]: \n" if $prinkter == 1;
5087
5088 $species_startsubstring =~ s/-| //g;
5089 $species_endsubstring =~ s/-| //g;
5090 $contigclustersLastEndLengthOnly{$key2}[$i]=length($species_endsubstring);
5091 $contigclustersFirstStartLengthOnly{$key2}[$i]=length($species_startsubstring);
5092
5093
5094
5095 # print "species_startsubstring = $species_startsubstring, and its length =",length($species_startsubstring)," \n" if $prinkter == 1;
5096 # print "species_endsubstring = $species_endsubstring, and its length =",length($species_endsubstring)," \n" if $prinkter == 1;
5097 # print "attaching to contigclustersLastEndOnly: $key2: $i\n" if $prinkter == 1;
5098
5099 # print "just confirming: $contigclustersLastEndLengthOnly{$key2}[$i] \n" if $prinkter == 1;
5100
5101 }
5102
5103
5104 }
5105 # print "\ndone the job of filling... \n";
5106 #///////////////////////////////////////////////////////////////////////////////////////
5107 #///////////////////////////////////////////////////////////////////////////////////////
5108 #///////////////////////////////////////////////////////////////////////////////////////
5109 #///////////////////////////////////////////////////////////////////////////////////////
5110 $prinkter=0;
5111 open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!";
5112
5113 my %clusteringpaths=();
5114 my %clustersholder=();
5115 my %foundkeys=();
5116 my %clusteringpathsRev=();
5117
5118
5119 my $totalcount=();
5120 my $founkeys_enteredcount=();
5121 my $transfered=0;
5122 my $complete_transfered=0;
5123 my $plain_transfered=0;
5124 my $existing_removed=0;
5125
5126 while (my $line = <BO>){
5127 # print "x" x 60, "\n" if $prinkter == 1;
5128 next if $line !~ /^[0-9]+/;
5129 #print $line;
5130 chomp $line;
5131 my @fields2 = split(/\t/,$line);
5132 my $key2 = ();
5133 if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)/ ) {
5134 $key2 = join("\t",$1, $2, $4, $5);
5135 }
5136
5137 else {
5138 # print "seq line $line incompatible\n";
5139 next;
5140 }
5141 # print "KEY = : $key2\n" if $prinkter == 1;
5142
5143
5144 my @currentcontigstarts=();
5145 my @currentcontigends=();
5146 my @currentcontigchrs=();
5147 my @clusters = ();
5148 my @clusterscopy=();
5149 if (exists $contigclusters{$key2}){
5150 @clusters = @{$contigclusters{$key2}};
5151 @clusterscopy=@clusters;
5152 for my $i (0 ... $#tags){
5153 # print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1;
5154 if ($line =~ /$tags[$i]\s([a-zA-Z0-9_]+)\s([0-9]+)\s([0-9]+)/){
5155 # print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1;
5156 my $startkey = $1."_S0E_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1;
5157 my $endkey = $1."_S0E_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1;
5158 $currentcontigchrs[$i]=$1;
5159 $currentcontigstarts[$i]=$2;
5160 $currentcontigends[$i]=$3;
5161 }
5162 else {
5163 $currentcontigchrs[$i] = 0;
5164 # print "no microsat clusters for $key2\n" if $prinkter == 1; next;
5165 }
5166 }
5167 } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"}
5168
5169 my @sequences = ();
5170 for (0 ... $#tags){
5171 my $seq = <BO>;
5172 # print $seq;
5173 chomp $seq;
5174 push(@sequences , " ".$seq);
5175 }
5176
5177 next if scalar @currentcontigchrs == 0;
5178
5179 # print "contigchrs= @currentcontigchrs \n" if $prinkter == 1;
5180 my %visitedcontigs=();
5181
5182 for my $i (0 ... $#tags){
5183 #1 check if there exists adjacent alignment block wrt coordinates of this species.
5184 next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block..
5185 #2 no need to worry about proximity of anything in adjacent block!
5186 @clusters=@clusterscopy;
5187 #1 BELOW, the following is really to calclate the distance between the end coordinate of the
5188 #2 cluster and the end of the gap-free sequence of each species. this is so that if an
5189 #3 adjacent alignment block is found lateron, the exact distance between the potentially
5190 #4 adjacent microsat clusters can be found here. the exact start coordinate will be used
5191 #5 immediately below.
5192 my $firstclusterstart = $contigclustersFirstStartOnly{$key2};
5193 my $lastclusterend = $contigclustersLastEndOnly{$key2};
5194
5195 my $key3 = $currentcontigchrs[$i]."_S0E_".($currentcontigstarts[$i]);
5196 # print "check if exists $key3 in contigends for $i\n" if $prinkter == 1;
5197
5198 if (exists($contigends[$i]{$key3}) && !exists $visitedcontigs{$contigends[$i]{$key3}}){
5199 $visitedcontigs{$contigends[$i]{$key3}} = $contigends[$i]{$key3}; #1 this array keeps track of adjacent contigs that we have already visited, thus saving computational time and potential redundancies#
5200 # print "just checking the hash visitedcontigs: ",$visitedcontigs{$contigends[$i]{$key3}} ,"\n" if $prinkter == 1;
5201
5202 #1 extract coordinates of the last cluster of this found alignment block
5203 # print "key of the found alignment block = ", $contigends[$i]{$key3},"\n" if $prinkter == 1;
5204 # print "we are trying to mine: contigclustersAllLastEndLengthOnly_raw: $contigends[$i]{$key3}: $i \n" if $prinkter == 1;
5205 # print "EXISTS\n" if exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1;
5206 # print "does NOT EXIST\n" if !exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1;
5207 my @contigclustersAllFirstStartLengthOnly_raw=@{$contigclustersFirstStartLengthOnly{$key2}};
5208 my @contigclustersAllLastEndLengthOnly_raw=@{$contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}};
5209 my @contigclustersAllFirstStartLengthOnly=(); my @contigclustersAllLastEndLengthOnly=();
5210
5211 for my $val (0 ... $#contigclustersAllFirstStartLengthOnly_raw){
5212 # print "val = $val\n" if $prinkter == 1;
5213 if (defined $contigclustersAllFirstStartLengthOnly_raw[$val]){
5214 push(@contigclustersAllFirstStartLengthOnly, $contigclustersAllFirstStartLengthOnly_raw[$val]) if $contigclustersAllFirstStartLengthOnly_raw[$val] =~ /[0-9]+/;
5215 }
5216 }
5217 # print "-----\n" if $prinkter == 1;
5218 for my $val (0 ... $#contigclustersAllLastEndLengthOnly_raw){
5219 # print "val = $val\n" if $prinkter == 1;
5220 if (defined $contigclustersAllLastEndLengthOnly_raw[$val]){
5221 push(@contigclustersAllLastEndLengthOnly, $contigclustersAllLastEndLengthOnly_raw[$val]) if $contigclustersAllLastEndLengthOnly_raw[$val] =~ /[0-9]+/;
5222 }
5223 }
5224
5225
5226 # print "our two arrays are: starts = <@contigclustersAllFirstStartLengthOnly> ......... and ends = <@contigclustersAllLastEndLengthOnly>\n" if $prinkter == 1;
5227 # print "the last cluster's end in that one is: ",smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly)," = ", smallest_number(@contigclustersAllFirstStartLengthOnly)," + ",smallest_number(@contigclustersAllLastEndLengthOnly),"\n" if $prinkter == 1;
5228
5229 # if ($contigclustersFirstStartLengthOnly{$key2}[$i] + $contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}[$i] < 50){
5230 if (smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly) < $CLUSTER_DIST){
5231 my @regurgitate = @{$contigclusters{$contigends[$i]{$key3}}};
5232 $regurgitate[$#regurgitate]=~s/\n//g;
5233 $regurgitate[$#regurgitate] = $regurgitate[$#regurgitate]."\t".shift(@clusters);
5234 delete $contigclusters{$contigends[$i]{$key3}};
5235 $contigclusters{$contigends[$i]{$key3}}=[ @regurgitate ];
5236 delete $contigclusters{$key2};
5237 $contigclusters{$key2}= [ @clusters ] if scalar(@clusters) >0;
5238 $contigclusters{$key2}= [ "" ] if scalar(@clusters) ==0;
5239
5240 if (scalar(@clusters) < 1){
5241 # print "$key2-> $clusteringpaths{$key2} in the loners\n" if exists $foundkeys{$key2};
5242 $clusteringpaths{$key2}=$contigends[$i]{$key3};
5243 $clusteringpathsRev{$contigends[$i]{$key3}}=$key2;
5244 print OUTP "$contigends[$i]{$key3} -> $clusteringpathsRev{$contigends[$i]{$key3}}\n";
5245 # print " clusteringpaths $key2 -> $contigends[$i]{$key3}\n";
5246 $founkeys_enteredcount-- if exists $foundkeys{$key2};
5247 $existing_removed++ if exists $foundkeys{$key2};
5248 # print "$key2->",@{$contigclusters{$key2}},"->>$foundkeys{$key2}\n" if exists $foundkeys{$key2} && $prinkter == 1;
5249 delete $foundkeys{$key2} if exists $foundkeys{$key2};
5250 $complete_transfered++;
5251 }
5252 else{
5253 print OUTP "$key2-> 0 not so lonely\n" if !exists $clusteringpathsRev{$key2};
5254 $clusteringpaths{$key2}=$key2 if !exists $clusteringpaths{$key2};
5255 $clusteringpathsRev{$key2}=0 if !exists $clusteringpathsRev{$key2};
5256
5257 $founkeys_enteredcount++ if !exists $foundkeys{$key2};
5258 $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2};
5259 # print "adding foundkeys entry $foundkeys{$key2}\n";
5260 $transfered++;
5261 }
5262 #$contigclusters{$key2}=[ @contigcluster ];
5263 }
5264 }
5265 else{
5266 # print "adjacent block with species $tags[$i] does not exist\n" if $prinkter == 1;
5267 $plain_transfered++;
5268 print OUTP "$key2-> 0 , going straight\n" if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2};
5269 $clusteringpaths{$key2}=$key2 if exists $contigclusters{$key2} && !exists $clusteringpaths{$key2};
5270 $clusteringpathsRev{$key2}=0 if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2};
5271 $founkeys_enteredcount++ if !exists $foundkeys{$key2} && exists $contigclusters{$key2};
5272 $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2} && exists $contigclusters{$key2};
5273 # print "adding foundkeys entry $foundkeys{$key2}\n";
5274
5275 }
5276 $totalcount++;
5277
5278 }
5279
5280
5281 }
5282 close BO;
5283 #close (NORTH);
5284 #///////////////////////////////////////////////////////////////////////////////////////
5285 #///////////////////////////////////////////////////////////////////////////////////////
5286 #///////////////////////////////////////////////////////////////////////////////////////
5287 #///////////////////////////////////////////////////////////////////////////////////////
5288
5289 my $founkeys_count=();
5290 my $nopath_count=();
5291 my $pathed_count=0;
5292 foreach my $key2 (keys %foundkeys){
5293 #print "x" x 60, "\n";
5294 # print "x" if $dotcounter % 100 ==0;
5295 # print "\n" if $dotcounter % 5000 ==0;
5296 $founkeys_count++;
5297 my $key = $key2;
5298 # print "$key2 -> $clusteringpaths{$key2}\n" if $prinkter == 1;
5299 if ($clusteringpaths{$key} eq $key){
5300 # print "printing hit the alignment block immediately... no path needed\n" if $prinkter == 1;
5301 $nopath_count++;
5302 delete $foundkeys{$key2};
5303 print ORTH join ("\n",@{$contigclusters{$key2}}),"\n";
5304 }
5305 else{
5306 my @pool=();
5307 my $key3=();
5308 $pathed_count++;
5309 # print "going reverse... clusteringpathsRev, $key = $clusteringpathsRev{$key}\n" if exists $clusteringpathsRev{$key} && $prinkter == 1;
5310 # print "going reverse... clusteringpathsRev $key does not exist\n" if !exists $clusteringpathsRev{$key} && $prinkter == 1;
5311 if ($clusteringpathsRev{$key} eq "0") {
5312 next;
5313 }
5314 else{
5315 my $yek3 = $clusteringpathsRev{$key};
5316 my $yek = $key;
5317 # print "caught in the middle of a path, now goin down from $yek to $yek3, which is $clusteringpathsRev{$key} \n" if $prinkter == 1;
5318 while ($yek3 ne "0"){
5319 # print "$yek->$yek3," if $prinkter == 1;
5320 $yek = $yek3;
5321 $yek3 = $clusteringpathsRev{$yek};
5322 }
5323 # print "\nfinally reached the end of path: $yek3, and the next in line is $yek, and its up-route is $clusteringpaths{$yek}\n" if $prinkter == 1;
5324 $key3 = $clusteringpaths{$yek};
5325 $key = $yek;
5326 }
5327
5328 # print "now that we are at bottom of the path, lets start climbing up again\n" if $prinkter == 1;
5329
5330 while($key ne $key3){
5331 # print "KEEY $key->$key3\n" if $prinkter == 1;
5332 # print "our contigcluster = @{$contigclusters{$key}}\n----------\n" if $prinkter == 1;
5333
5334 if (scalar(@{$contigclusters{$key}}) > 0) {push @pool, @{$contigclusters{$key}};
5335 # print "now pool = @pool\n" if $prinkter == 1;
5336 }
5337 delete $foundkeys{$key3};
5338 $key=$key3;
5339 $key3=$clusteringpaths{$key};
5340 }
5341 # print "\nfinally, adding the first element of path: @{$contigclusters{$key}}\n AND printing the contents:\n" if $prinkter == 1;
5342 my @firstcontig= @{$contigclusters{$key}};
5343 delete $foundkeys{$key2} if exists $foundkeys{$key2} ;
5344 delete $foundkeys{$key} if exists $foundkeys{$key};
5345
5346 unshift @pool, pop @firstcontig;
5347 # print join("\t",@pool),"\n" if $prinkter == 1;
5348 print ORTH join ("\n",@firstcontig),"\n" if scalar(@firstcontig) > 0;
5349 print ORTH join ("\t",@pool),"\n";
5350 # join();
5351 }
5352
5353 }
5354 #close (NORTH);
5355 # print "founkeys_entered =$founkeys_enteredcount, plain_transfered=$plain_transfered,existing_removed=$existing_removed,founkeys_count =$founkeys_count, nopath_count =$nopath_count, transfered = $transfered, complete_transfered = $complete_transfered, totalcount = $totalcount, pathed=$pathed_count\n" if $prinkter == 1;
5356 close (BO);
5357 close (ORTH);
5358 close (OUTP);
5359 return 1;
5360
5361 }
5362 sub stringPainter{
5363 my @string = split(/_C0C_/,$_[0]);
5364 # print $_[0], " <- in stringPainter\n";
5365 # print $_[1], " <- in clusters\n";
5366
5367 my @clusters = split(/,/, $_[1]);
5368 for my $i (0 ... $#clusters){
5369 my $cluster = $clusters[$i];
5370 # print "cluster = $cluster\n";
5371 my @parts = split(/\./,$cluster);
5372 my @cord = split(/:|-/,shift(@parts));
5373 my $minstart = $cord[1];
5374 my $maxend = $cord[2];
5375 # print "minstart = $minstart , maxend = $maxend\n";
5376
5377 for my $j (0 ... $#parts){
5378 # print "oing thri $parts[$j]\n";
5379 my @cord = split(/:|-/,$parts[$j]);
5380 $minstart = $cord[1] if $cord[1] < $minstart;
5381 $maxend = $cord[2] if $cord[2] > $maxend;
5382 }
5383 # print "minstart = $minstart , maxend = $maxend\n";
5384 for my $pos ($minstart ... $maxend){ $string[$pos] = $string[$pos].",".$cluster;}
5385
5386
5387 }
5388 # print "@string <-done from function stringPainter\n";
5389 return join("_S0S_",@string);
5390 }
5391
5392 sub findClusters{
5393 my $continue = 0;
5394 my @mapped_clusters = ();
5395 my $clusterdist = $_[1];
5396 my $previous = 'x';
5397 my @localcluster = ();
5398 my $cluster_starts = ();
5399 my $cluster_ends = ();
5400 my $localcluster_start = ();
5401 my $localcluster_end = ();
5402 my @record_cluster = ();
5403 my @string = split(/\!/, $_[0]);
5404 my $zerolength=0;
5405
5406 for my $pos_pos (1 ... $#string){
5407 my $pos = $string[$pos_pos];
5408 # print $pos, "\n";
5409 if ($continue == 0 && $pos eq "x") {next;}
5410
5411 if ($continue == 1 && $pos eq "x" && $zerolength <= $clusterdist){
5412 if ($zerolength == 0) {$localcluster_end = $pos_pos-1};
5413 $zerolength++;
5414 $continue = 1;
5415 }
5416
5417 if ($continue == 1 && $pos eq "x" && $zerolength > $clusterdist) {
5418 $zerolength = 0;
5419 $continue = 0;
5420 my %seen;
5421 my @uniqed = grep !$seen{$_}++, @localcluster;
5422 # print "caught cluster : @uniqed \n";
5423 push(@mapped_clusters, [@uniqed]);
5424 # print "clustered:\n@uniqed\n";
5425 @localcluster = ();
5426 @record_cluster = ();
5427
5428 }
5429
5430 if ($pos ne "x"){
5431 $zerolength = 0;
5432 $continue = 1;
5433 $pos =~ s/x,//g;
5434 my @entries = split(/,/,$pos);
5435 $localcluster_end = 0;
5436 $localcluster_start = 0;
5437 push(@record_cluster,$pos);
5438
5439 if ($continue == 0){
5440 @localcluster = ();
5441 @localcluster = (@localcluster, @entries);
5442 $localcluster_start = $pos_pos;
5443 }
5444
5445 if ($continue == 1 ) {
5446 @localcluster = (@localcluster, @entries);
5447 }
5448 }
5449 }
5450
5451 if (scalar(@localcluster) > 0){
5452 my %seen;
5453 my @uniqed = grep !$seen{$_}++, @localcluster;
5454 # print "caught cluster : @uniqed \n";
5455 push(@mapped_clusters, [@uniqed]);
5456 # print "clustered:\n@uniqed\n";
5457 @localcluster = ();
5458 @record_cluster = ();
5459 }
5460
5461 my @returner = ();
5462
5463 foreach my $clust (@mapped_clusters){
5464 my @localclust = @$clust;
5465 my @result = ();
5466 foreach my $clustparts (@localclust){
5467 push(@result,$clustparts);
5468 }
5469 push(@returner , join(".",@result));
5470 }
5471 # print "returnig: ", join(",",@returner), "\n";
5472 return join(",",@returner);
5473 }
5474 #xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx
5475
5476 #xxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx
5477
5478 sub MakeTrees{
5479 my $tree = $_[0];
5480 my @parts=($tree);
5481 # my @parts=();
5482
5483 while (1){
5484 $tree =~ s/^\(//g;
5485 $tree =~ s/\)$//g;
5486 my @arr = ();
5487
5488 if ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){
5489 @arr = $tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/;
5490 $tree = $2;
5491 push @parts, $tree;
5492 }
5493 elsif ($tree =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){
5494 @arr = $tree =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/;
5495 $tree = $1;
5496 push @parts, $tree;
5497 }
5498 elsif ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){
5499 last;
5500 }
5501 }
5502 return @parts;
5503 }
5504
5505 #xxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx
5506
5507 sub qualityFilter{
5508 my $unmaskedorthfile = $_[0];
5509 my $seqfile = $_[1];
5510 my $maskedorthfile = $_[2];
5511 my $filteredout = $maskedorthfile."_residue";
5512 open (PMORTH, "<$unmaskedorthfile") or die "Cannot open unmaskedorthfile file: $unmaskedorthfile: $!";
5513
5514 my %keyhash = ();
5515
5516 while (my $line = <PMORTH>){
5517 my $key = join("\t", $1,$2,$3,$4) if $line =~ /($focalspec)\s+([a-zA-Z0-9\-_]+)\s+([0-9]+)\s+([0-9]+)/;
5518 push @{$keyhash{$key}}, $line;
5519 }
5520
5521 open (SEQ, "<$seqfile") or die "Cannot open seqfile file: $seqfile: $!";
5522 open (MORTH, ">$maskedorthfile") or die "Cannot open maskedorthfile file: $maskedorthfile: $!";
5523 open (RES, ">$filteredout") or die "Cannot open filteredout file: $filteredout: $!";
5524
5525
5526
5527 while (my $line = <SEQ>){
5528 chomp $line;
5529 if ($line =~ /($focalspec)\s+([a-zA-Z0-9\-_]+)\s+([0-9]+)\s+([0-9]+)/){
5530 my $key = join("\t", $1,$2,$3,$4);
5531 next if !exists $keyhash{$key};
5532 my @orths = @{$keyhash{$key}} if exists $keyhash{$key};
5533 delete $keyhash{$key};
5534
5535 my $sine = <SEQ>;
5536
5537 foreach my $orth (@orths){
5538 #print "-----------------------------------------------------------------\n";
5539 #print $orth;
5540 my $orthcopy = $orth;
5541 $orth =~ s/^>//;
5542 my @parts = split(/>/,$orth);
5543
5544 my @starts = ();
5545 my @ends = ();
5546
5547 foreach my $part (@parts){
5548 my $no_of_species = adjustCoordinates($part);
5549 my @pields = split(/\t/,$part);
5550
5551 # print "pields = @pields .. no_of_species = $no_of_species .. startcord = $pields[$startcord]\n";
5552
5553 push @starts, $pields[$startcord];
5554 push @ends, $pields[$endcord];
5555 }
5556
5557 #print "starts = @starts ... ends = @ends\n";
5558
5559 my $leftend = smallest_number(@starts)-10;
5560 my $rightend = largest_number(@ends)+10;
5561
5562 my $maskarea = substr($sine, $leftend, $rightend-$leftend+1);
5563 print RES $orth if $maskarea =~ /#/;
5564
5565
5566 next if $maskarea =~ /#/;
5567
5568 print MORTH $orthcopy;
5569 }
5570 }
5571 else{
5572 next;
5573 }
5574
5575
5576 }
5577
5578 # print "UNDONE: ", scalar(keys %keyhash),"\n";
5579 # print MORTH "UNDONE: ", scalar(keys %keyhash),"\n";
5580
5581 }
5582
5583 sub adjustCoordinates{
5584 my $line = $_[0];
5585 my $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)|(scaffold[0-9a-zA-Z\._\-]+)|(supercontig[0-9a-zA-Z\._\-]+)/x/ig;
5586 my @got = ($line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)/x/g);
5587 # print "line = $line\n";
5588 $infocord = 2 + (4*$no_of_species) - 1;
5589 $typecord = 2 + (4*$no_of_species) + 1 - 1;
5590 $motifcord = 2 + (4*$no_of_species) + 2 - 1;
5591 $gapcord = $motifcord+1;
5592 $startcord = $gapcord+1;
5593 $strandcord = $startcord+1;
5594 $endcord = $strandcord + 1;
5595 $microsatcord = $endcord + 1;
5596 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
5597 $interr_poscord = $microsatcord + 3;
5598 $no_of_interruptionscord = $microsatcord + 4;
5599 $interrcord = $microsatcord + 2;
5600 # print "$line\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\n" if $line !~ /calJac/i;
5601 return $no_of_species;
5602 }
5603
5604
5605
5606