0
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use Term::ANSIColor;
|
|
5 use File::Basename;
|
|
6 use IO::Handle;
|
|
7 use Cwd;
|
|
8 use File::Path;
|
|
9 use vars qw($distance @thresholds @tags $species_set @allspecies $printer $treeSpeciesNum $focalspec $mergestarts $mergeends $mergemicros $interrtypecord $microscanned $interrcord $interr_poscord $no_of_interruptionscord $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species $gapcord $prinkter);
|
|
10 use File::Path qw(make_path remove_tree);
|
|
11 use File::Temp qw/ tempfile tempdir /;
|
|
12 my $tdir = tempdir( CLEANUP => 1 );
|
|
13 chdir $tdir;
|
|
14 my $dir = getcwd;
|
|
15 #print "dir = $dir\n";
|
|
16
|
|
17 #$ENV{'PATH'} .= ':' . dirname($0);
|
|
18 my $date = `date`;
|
|
19
|
|
20 my ($mafile, $mafile_sputt, $orthfile, $threshold_array, $allspeciesin, $tree_definition_all, $separation) = @ARGV;
|
|
21 if (!$mafile or !$mafile_sputt or !$orthfile or !$threshold_array or !$separation or !$tree_definition_all or !$allspeciesin) { die "missing arguments\n"; }
|
|
22
|
|
23 $tree_definition_all =~ s/\s+//g;
|
|
24 $threshold_array =~ s/\s+//g;
|
|
25 $allspeciesin =~ s/\s+//g;
|
|
26 #-------------------------------------------------------------------------------
|
|
27 # WHICH SPUTNIK USED?
|
|
28 my $sputnikpath = ();
|
|
29 $sputnikpath = "sputnik_lowthresh_MATCH_MIN_SCORE3" ;
|
|
30 #$sputnikpath = "/Users/ydk/work/rhesus_microsat/codes/./sputnik_Mac-PowerPC";
|
|
31 #print "sputnik_Mac-PowerPC non-existant\n" if !-e $sputnikpath;
|
|
32 #exit if !-e $sputnikpath;
|
|
33 #$sputnikpath = "bx-sputnik" ;
|
|
34 #print "ARGV input = @ARGV\n";
|
|
35 #print "ARGV input :\n mafile=$mafile\n orthfile=$orthfile\n threshold_array=$threshold_array\n species_set=$species_set\n tree_definition=$tree_definition\n separation=$separation\n";
|
|
36 #-------------------------------------------------------------------------------
|
|
37 # RUNFILE
|
|
38 #-------------------------------------------------------------------------------
|
|
39 $distance = 1; #bp
|
|
40 $distance++;
|
|
41 my @tree_definitions=MakeTrees($tree_definition_all);
|
|
42 my $allspeciesset = $tree_definition_all;
|
|
43 $allspeciesset =~ s/[\(\) ]+//g;
|
|
44 @allspecies = split(/,/,$allspeciesset);
|
|
45
|
|
46 my @outputfiles = ();
|
|
47 my $round = 0;
|
|
48 #my $tdir = tempdir( CLEANUP => 0 );
|
|
49 #chdir $tdir;
|
|
50
|
|
51 foreach my $tree_definition (@tree_definitions){
|
|
52 my @commas = ($tree_definition =~ /,/g) ;
|
|
53 #print "commas = @commas\n"; <STDIN>;
|
|
54 next if scalar(@commas) <= 1;
|
|
55 #print "species_set = $species_set\n";
|
|
56 $treeSpeciesNum = scalar(@commas) + 1;
|
|
57 $species_set = $tree_definition;
|
|
58 $species_set =~ s/[\)\( ;]+//g;
|
|
59 #print "species_set = $species_set\n"; <STDIN>;
|
|
60
|
|
61 $round++;
|
|
62 #-------------------------------------------------------------------------------
|
|
63 # MICROSATELLITE THRESHOLD SETTINGS (LENGTH, BP)
|
|
64 $threshold_array=~ s/,/_/g;
|
|
65 my @thresharr = split("_",$threshold_array);
|
|
66 @thresholds=@thresharr;
|
|
67 #my $threshold_array = join("_",($mono_threshold, $di_threshold, $tri_threshold, $tetra_threshold));
|
|
68 #print "current dit=$dir\n";
|
|
69 #-------------------------------------------------------------------------------
|
|
70 # CREATE AXT FILES IN FORWARD AND REVERSE ORDERS IF NECESSARY
|
|
71 my @chrfiles=();
|
|
72
|
|
73 #my $mafile = "/Users/ydk/work/rhesus_microsat/results/galay/align.txt"; #$ARGV[0];
|
|
74 my $chromt=int(rand(10000));
|
|
75 my $p_chr=$chromt;
|
|
76
|
|
77
|
|
78 my @exactspeciesset_unarranged = split(/,/,$species_set);
|
|
79 $tree_definition=~s/[\)\(, ]/\t/g;
|
|
80 my @treespecies=split(/\t+/,$tree_definition);
|
|
81 my @exactspecies=();
|
|
82
|
|
83 foreach my $spec (@treespecies){
|
|
84 foreach my $espec (@exactspeciesset_unarranged){
|
|
85 push @exactspecies, $spec if $spec eq $espec;
|
|
86 }
|
|
87 }
|
|
88 #print "exactspecies=@exactspecies\n";
|
|
89 $focalspec = $exactspecies[0];
|
|
90 my $arranged_species_set=join(".",@exactspecies);
|
|
91 my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt");
|
|
92 my $chr_name_sputt = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt_sputt");
|
|
93 #print "sending to maftoAxt_multispecies: $mafile, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n";
|
|
94 maftoAxt_multispecies($mafile, $tree_definition, $chr_name, $species_set);
|
|
95 maftoAxt_multispecies($mafile_sputt, $tree_definition, $chr_name_sputt, $species_set);
|
|
96 #print "done maf to axt conversion\n";
|
|
97 my $reverse_chr_name = join(".",("chr".$p_chr."r"),$arranged_species_set, "net", "axt");
|
|
98 artificial_axdata_inverter ($chr_name, $reverse_chr_name);
|
|
99 #print "reverse_chr_name=$reverse_chr_name\n";
|
|
100 #-------------------------------------------------------------------------------
|
|
101 # FIND THE CORRESPONDING CHIMP CHROMOSOME FROM FILE ORTp_chrS.TXT
|
|
102 foreach my $direct ("reverse_direction","forward_direction"){
|
|
103 $p_chr=$chromt;
|
|
104 #print "direction = $direct\n";
|
|
105 $p_chr = $p_chr."r" if $direct eq "reverse_direction";
|
|
106 $p_chr = $p_chr if $direct eq "forward_direction";
|
|
107 my $config = $species_set;
|
|
108 $config=~s/,/./g;
|
|
109 my @orgs = split(/\./,$arranged_species_set);
|
|
110 #print "ORGS= @orgs\n";
|
|
111 my @tag=@orgs;
|
|
112
|
|
113
|
|
114 my $tags = join(",", @tag);
|
|
115 my @tags=@tag;
|
|
116 chomp $p_chr;
|
|
117 $tags = join("_", split(/,/, $tags));
|
|
118 my $pchr = "chr".$p_chr;
|
|
119
|
|
120 my $ptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1])."-".$threshold_array;
|
|
121 my @sp_tags = ();
|
|
122
|
|
123 # print "$ptag _ orthfile\n"; <STDIN>;
|
|
124 #print "orgs=@orgs, pchr=$pchr, hence, ptag = $ptag\n";
|
|
125 foreach my $sp (@tag){
|
|
126 push(@sp_tags, ($sp.".".$ptag));
|
|
127 }
|
|
128
|
|
129 my $preptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1]);
|
|
130 my @presp_tags = ();
|
|
131
|
|
132 foreach my $sp (@tag){
|
|
133 push(@presp_tags, ($sp.".".$preptag));
|
|
134 }
|
|
135
|
|
136 my $resultdir = "";
|
|
137 my $orthdir = "";
|
|
138 my $filtereddir = "";
|
|
139 my $pipedir = "";
|
|
140
|
|
141 my @title_queries = ();
|
|
142 push(@title_queries, "^[0-9]+");
|
|
143 my $sep="\\s";
|
|
144 for my $or (0 ... $#orgs){
|
|
145 my $title = join($sep, ($orgs[$or], "[A-Za-z_]+[0-9a-zA-Z]+", "[0-9]+", "[0-9]+", "[\\-\\+]"));
|
|
146 #$title =~ s/chr\\+\\s+\+/chr/g;
|
|
147 push(@title_queries, $title);
|
|
148 }
|
|
149 my $title_query = join($sep, @title_queries);
|
|
150 #print "title_queries=@title_queries\n";
|
|
151 #print "query = >$title_query<\n";
|
|
152 #print "orgs = @orgs\n";
|
|
153 #-------------------------------------------------------------------------------
|
|
154 # GET AXTNET FILES, EDIT THEM AND SPLIT THEM INTO HUMAN AND CHIMP INPUT FILES
|
|
155 my $t1input = $pchr.".".$arranged_species_set.".net.axt";
|
|
156
|
|
157 my @t1outputs = ();
|
|
158
|
|
159 foreach my $sp (@presp_tags){
|
|
160 push(@t1outputs, $sp."_gap_op");
|
|
161 }
|
|
162
|
|
163
|
|
164
|
|
165 multi_species_t1($t1input,$tags,(join(",", @t1outputs)), $title_query);
|
|
166 #print "t1outputs=@t1outputs\n";
|
|
167 #print "done t1\n"; <STDIN>;
|
|
168 #-------------------------------------------------------------------------------
|
|
169 #START T2.PL
|
|
170
|
|
171 my $stag = (); my $tag1 = (); my $tag2 = (); my $schrs = ();
|
|
172
|
|
173 for my $t (0 ... scalar(@tags)-1){
|
|
174 multi_species_t2($t1outputs[$t], $tag[$t]);
|
|
175 }
|
|
176 #-------------------------------------------------------------------------------
|
|
177 #START T2.2.PL
|
|
178
|
|
179 my @temp_tags = @tag;
|
|
180
|
|
181 foreach my $sp (@presp_tags){
|
|
182 my $t2input = $sp."_nogap_op_unrand";
|
|
183 multi_species_t2_2($t2input, shift(@temp_tags));
|
|
184 }
|
|
185 undef (@temp_tags);
|
|
186
|
|
187 #-------------------------------------------------------------------------------
|
|
188 #START SPUTNIK
|
|
189
|
|
190 my @jobIDs = ();
|
|
191 @temp_tags = @tag;
|
|
192 my @sput_filelist = ();
|
|
193
|
|
194 foreach my $sp (@presp_tags){
|
|
195 #print "sp = $sp\n";
|
|
196 my $sputnikoutput = $pipedir.$sp."_sput_op0";
|
|
197 my $sputnikinput = $pipedir.$sp."_nogap_op_unrand";
|
|
198 push(@sput_filelist, $sputnikinput);
|
|
199 my $sputnikcommand = $sputnikpath." ".$sputnikinput." > ".$sputnikoutput;
|
|
200 # print "$sputnikcommand\n";
|
|
201 my @sputnikcommand_system = $sputnikcommand;
|
|
202 system(@sputnikcommand_system);
|
|
203 }
|
|
204
|
|
205 #-------------------------------------------------------------------------------
|
|
206 #START SPUTNIK OUTPUT CORRECTOR
|
|
207
|
|
208 foreach my $sp (@presp_tags){
|
|
209 my $corroutput = $pipedir.$sp."_sput_op1";
|
|
210 my $corrinput = $pipedir.$sp."_sput_op0";
|
|
211 sputnikoutput_corrector($corrinput,$corroutput);
|
|
212
|
|
213 my $t4output = $pipedir.$sp."_sput_op2";
|
|
214 multi_species_t4($corroutput,$t4output);
|
|
215
|
|
216 my $t5output = $pipedir.$sp."_sput_op3";
|
|
217 multi_species_t5($t4output,$t5output);
|
|
218 #print "done t5.pl for $sp\n";
|
|
219
|
|
220 my $t6output = $pipedir.$sp."_sput_op4";
|
|
221 multi_species_t6($t5output,$t6output,scalar(@orgs));
|
|
222 }
|
|
223 #-------------------------------------------------------------------------------
|
|
224 #START T9.PL FOR T10.PL AND FOR INTERRUPTED HUNTING
|
|
225
|
|
226 foreach my $sp (@presp_tags){
|
|
227 my $t9output = $pipedir.$sp."_gap_op_unrand_match";
|
|
228 my $t9sequence = $pipedir.$sp."_gap_op_unrand2";
|
|
229 my $t9micro = $pipedir.$sp."_sput_op4";
|
|
230 t9($t9micro,$t9sequence,$t9output);
|
|
231
|
|
232 my $t9output2 = $pipedir.$sp."_nogap_op_unrand2_match";
|
|
233 my $t9sequence2 = $pipedir.$sp."_nogap_op_unrand2";
|
|
234 t9($t9micro,$t9sequence2,$t9output2);
|
|
235 }
|
|
236 #print "done both t9.pl for all orgs\n";
|
|
237
|
|
238 #-------------------------------------------------------------------------------
|
|
239 # FIND COMPOUND MICROSATELLITES
|
|
240
|
|
241 @jobIDs = ();
|
|
242 my $species_counter = 0;
|
|
243
|
|
244 foreach my $sp (@presp_tags){
|
|
245 my $simple_microsats=$pipedir.$sp."_sput_op4_simple";
|
|
246 my $compound_microsats=$pipedir.$sp."_sput_op4_compound";
|
|
247 my $input_micro = $pipedir.$sp."_sput_op4";
|
|
248 my $input_seq = $pipedir.$sp."_nogap_op_unrand2_match";
|
|
249 multiSpecies_compound_microsat_hunter3($input_micro,$input_seq,$simple_microsats,$compound_microsats,$orgs[$species_counter], scalar(@sp_tags), $threshold_array );
|
|
250 $species_counter++;
|
|
251 }
|
|
252
|
|
253 #-------------------------------------------------------------------------------
|
|
254 # READING AND FILTERING SIMPLE MICROSATELLITES
|
|
255 my $spcounter2=0;
|
|
256 foreach my $sp (@sp_tags){
|
|
257 my $presp = $presp_tags[$spcounter2];
|
|
258 $spcounter2++;
|
|
259 my $simple_microsats=$pipedir.$presp."_sput_op4_simple";
|
|
260 my $simple_filterout = $pipedir.$sp."_sput_op4_simple_filtered";
|
|
261 my $simple_residue = $pipedir.$sp."_sput_op4_simple_residue";
|
|
262 multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
|
|
263 }
|
|
264
|
|
265 #-------------------------------------------------------------------------------
|
|
266 # ANALYZE COMPOUND MICROSATELLITES FOR BEING INTERRUPTED MICROSATS
|
|
267
|
|
268 $species_counter = 0;
|
|
269 foreach my $sp (@sp_tags){
|
|
270 my $presp = $presp_tags[$species_counter];
|
|
271 my $compound_microsats = $pipedir.$presp."_sput_op4_compound";
|
|
272 my $analyzed_simple_microsats=$pipedir.$presp."_sput_op4_compound_interrupted";
|
|
273 my $analyzed_compound_microsats=$pipedir.$presp."_sput_op4_compound_pure";
|
|
274 my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
|
|
275 multiSpecies_compound_microsat_analyzer($compound_microsats,$seq_file,$analyzed_simple_microsats,$analyzed_compound_microsats,$orgs[$species_counter], scalar(@sp_tags));
|
|
276 $species_counter++;
|
|
277 }
|
|
278 #-------------------------------------------------------------------------------
|
|
279 # REANALYZE COMPOUND MICROSATELLITES FOR PRESENCE OF SIMPLE ONES WITHIN THEM..
|
|
280 $species_counter = 0;
|
|
281
|
|
282 foreach my $sp (@sp_tags){
|
|
283 my $presp = $presp_tags[$species_counter];
|
|
284 my $compound_microsats = $pipedir.$presp."_sput_op4_compound_pure";
|
|
285 my $compound_interrupted = $pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
|
|
286 my $compound_compound = $pipedir.$presp."_sput_op4_compound_compound";
|
|
287 my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
|
|
288 multiSpecies_compoundClarifyer($compound_microsats,$seq_file,$compound_interrupted,$compound_compound,$orgs[$species_counter], scalar(@sp_tags), "2_4_6_8", "3_4_6_8", "2_4_6_8");
|
|
289 $species_counter++;
|
|
290 }
|
|
291 #-------------------------------------------------------------------------------
|
|
292 # READING AND FILTERING SIMPLE AND COMPOUND MICROSATELLITES
|
|
293 $species_counter = 0;
|
|
294
|
|
295 foreach my $sp (@sp_tags){
|
|
296 my $presp = $presp_tags[$species_counter];
|
|
297
|
|
298 my $simple_microsats=$pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
|
|
299 my $simple_filterout = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
|
|
300 my $simple_residue = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_residue";
|
|
301 multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
|
|
302
|
|
303 my $simple_microsats2 = $pipedir.$presp."_sput_op4_compound_interrupted";
|
|
304 my $simple_filterout2 = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
|
|
305 my $simple_residue2 = $pipedir.$sp."_sput_op4_compound_interrupted_residue";
|
|
306 multiSpecies_filtering_interrupted_microsats($simple_microsats2, $simple_filterout2, $simple_residue2,$threshold_array,$threshold_array,scalar(@sp_tags));
|
|
307
|
|
308 my $compound_microsats=$pipedir.$presp."_sput_op4_compound_compound";
|
|
309 my $compound_filterout = $pipedir.$sp."_sput_op4_compound_compound_filtered";
|
|
310 my $compound_residue = $pipedir.$sp."_sput_op4_compound_compound_residue";
|
|
311 multispecies_filtering_compound_microsats($compound_microsats, $compound_filterout, $compound_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
|
|
312 $species_counter++;
|
|
313 }
|
|
314 #print "done filtering both simple and compound microsatellites \n";
|
|
315
|
|
316 #-------------------------------------------------------------------------------
|
|
317
|
|
318 my @combinedarray = ();
|
|
319 my @combinedarray_indicators = ("mononucleotide", "dinucleotide", "trinucleotide", "tetranucleotide");
|
|
320 my @combinedarray_tags = ("mono", "di", "tri", "tetra");
|
|
321 $species_counter = 0;
|
|
322
|
|
323 foreach my $sp (@sp_tags){
|
|
324 my $simple_interrupted = $pipedir.$sp."_simple_analyzed_simple";
|
|
325 push @{$combinedarray[$species_counter]}, $pipedir.$sp."_simple_analyzed_simple_mono", $pipedir.$sp."_simple_analyzed_simple_di", $pipedir.$sp."_simple_analyzed_simple_tri", $pipedir.$sp."_simple_analyzed_simple_tetra";
|
|
326 $species_counter++;
|
|
327 }
|
|
328
|
|
329 #-------------------------------------------------------------------------------
|
|
330 # PUT TOGETHER THE INTERRUPTED AND SIMPLE MICROSATELLITES BASED ON THEIR MOTIF SIZE FOR FURTHER EXTENTION
|
|
331 my $sp_counter = 0;
|
|
332 foreach my $sp (@sp_tags){
|
|
333 my $analyzed_simple = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
|
|
334 my $clarifyed_simple = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
|
|
335 my $simple = $pipedir.$sp."_sput_op4_simple_filtered";
|
|
336 my $simple_analyzed_simple = $pipedir.$sp."_simple_analyzed_simple";
|
|
337 `cat $analyzed_simple $clarifyed_simple $simple > $simple_analyzed_simple`;
|
|
338 for my $i (0 ... 3){
|
|
339 `grep "$combinedarray_indicators[$i]" $simple_analyzed_simple > $combinedarray[$sp_counter][$i]`;
|
|
340 }
|
|
341 $sp_counter++;
|
|
342 }
|
|
343 #print "\ndone grouping interrupted & simple microsats based on their motif size for further extention\n";
|
|
344
|
|
345 #-------------------------------------------------------------------------------
|
|
346 # BREAK CHROMOSOME INTO PARTS OF CERTAIN NO. CONTIGS EACH, FOR FUTURE SEARCHING OF INTERRUPTED MICROSATELLITES
|
|
347 # ESPECIALLY DI, TRI AND TETRANUCLEOTIDE MICROSATELLITES
|
|
348 @temp_tags = @sp_tags;
|
|
349 my $increment = 1000000;
|
|
350 my @splist = ();
|
|
351 my $targetdir = $pipedir;
|
|
352 $species_counter=0;
|
|
353
|
|
354 foreach my $sp (@sp_tags){
|
|
355 my $presp = $presp_tags[$species_counter];
|
|
356 $species_counter++;
|
|
357 my $localtag = shift @temp_tags;
|
|
358 my $locallist = $targetdir.$localtag."_".$p_chr."_list";
|
|
359 push(@splist, $locallist);
|
|
360 my $input = $pipedir.$presp."_nogap_op_unrand2_match";
|
|
361 chromosome_unrand_breaker($input,$targetdir,$locallist,$increment, $localtag, $pchr);
|
|
362 }
|
|
363
|
|
364
|
|
365 my @unionarray = ();
|
|
366 #print "splist=@splist\n";
|
|
367 #-------------------------------------------------------------------------------
|
|
368 # FIND INTERRUPTED MICROSATELLITES
|
|
369
|
|
370 $species_counter = 0;
|
|
371
|
|
372 for my $i (0 .. $#combinedarray){
|
|
373
|
|
374 @jobIDs = ();
|
|
375 open (JLIST1, "$splist[$i]") or die "Cannot open file $splist[$i]: $!";
|
|
376
|
|
377 while (my $sp1 = <JLIST1>){
|
|
378 #print "$splist[$i]: sp1=$sp1\n";
|
|
379 chomp $sp1;
|
|
380
|
|
381 for my $j (0 ... $#combinedarray_tags){
|
|
382 my $interr = $sp1."_interr_".$combinedarray_tags[$j];
|
|
383 my $simple = $sp1."_simple_".$combinedarray_tags[$j];
|
|
384 push @{$unionarray[$i]}, $interr, $simple;
|
|
385 multiSpecies_interruptedMicrosatHunter($combinedarray[$i][$j],$sp1,$interr ,$simple, $orgs[$species_counter], scalar(@sp_tags), "3_4_6_8");
|
|
386 }
|
|
387 }
|
|
388 $species_counter++;
|
|
389 }
|
|
390 close JLIST1;
|
|
391 #-------------------------------------------------------------------------------
|
|
392 # REUNION AND ZIPPING BEFORE T10.PL
|
|
393
|
|
394 my @allarray = ();
|
|
395
|
|
396 for my $i (0 ... $#sp_tags){
|
|
397 my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats";
|
|
398 unlink $localfile if -e $localfile;
|
|
399 push(@allarray, $localfile);
|
|
400
|
|
401 my $unfiltered_localfile= $localfile."_unfiltered";
|
|
402 my $residue_localfile= $localfile."_residue";
|
|
403
|
|
404 unlink $unfiltered_localfile;
|
|
405 #unlink $unfiltered_localfile;
|
|
406 for my $j (0 ... $#{$unionarray[$i]}){
|
|
407 #print "listing files for species $i and list number $j= \n$unionarray[$i][$j] \n";
|
|
408 `cat $unionarray[$i][$j] >> $unfiltered_localfile`;
|
|
409 unlink $unionarray[$i][$j];
|
|
410 }
|
|
411
|
|
412 multiSpecies_filtering_interrupted_microsats($unfiltered_localfile, $localfile, $residue_localfile,$threshold_array,$threshold_array,scalar(@sp_tags) );
|
|
413 my $analyzed_compound = $pipedir.$sp_tags[$i]."_sput_op4_compound_compound_filtered";
|
|
414 my $simple_residue = $pipedir.$sp_tags[$i]."_sput_op4_simple_residue";
|
|
415 my $compound_residue = $pipedir.$sp_tags[$i]."_sput_op4_compound_residue";
|
|
416
|
|
417 `cat $analyzed_compound >> $localfile`;
|
|
418 }
|
|
419 #-------------------------------------------------------------------------------
|
|
420 # MERGING MICROSATELLITES THAT ARE VERY CLOSE TO EACH OTHER, INCLUDING THOSE FOUND BY SEARCHING IN 2 OPPOSIT DIRECTIONS
|
|
421
|
|
422 my $toescape=0;
|
|
423
|
|
424
|
|
425 for my $i (0 ... $#sp_tags){
|
|
426 my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats";
|
|
427 $localfile =~ /$focalspec\-(chr[0-9a-zA-Z]+)\./;
|
|
428 my $direction = $1;
|
|
429 #print "localfile = $localfile , direction = $direction\n";
|
|
430 # `gzip $reverse_chr_name` if $direction =~ /chr[0-9a-zA-Z]+r/ && $switchboard{"deleting_processFiles"} != 1;
|
|
431 $toescape =1 if $direction =~ /chr[0-9a-zA-Z]+r/;
|
|
432 last if $direction =~ /chr[0-9a-zA-Z]+r/;
|
|
433 my $nogap_sequence = $pipedir.$presp_tags[$i]."_nogap_op_unrand2_match";
|
|
434 my $gap_sequence = $pipedir.$presp_tags[$i]."_gap_op_unrand_match";
|
|
435 my $reverselocal = $localfile;
|
|
436 $reverselocal =~ s/\-chr([0-9a-zA-Z]+)\./-chr$1r./g;
|
|
437 merge_interruptedMicrosats($nogap_sequence,$localfile, $reverselocal ,scalar(@sp_tags));
|
|
438 #-------------------------------------------------------------------------------
|
|
439 my $forward_separate = $localfile."_separate";
|
|
440 my $reverse_separate = $reverselocal."_separate";
|
|
441 my $diff = $forward_separate."_diff";
|
|
442 my $miss = $forward_separate."_miss";
|
|
443 my $common = $forward_separate."_common";
|
|
444 forward_reverse_sputoutput_comparer($nogap_sequence,$forward_separate, $reverse_separate, $diff, $miss, $common ,scalar(@sp_tags));
|
|
445 #-------------------------------------------------------------------------------
|
|
446 my $symmetrical_file = $localfile."_symmetrical";
|
|
447 my $merged_file = $localfile."_merged";
|
|
448 #print "cating: $merged_file $common into -> $symmetrical_file \n";
|
|
449 `cat $merged_file $common > $symmetrical_file`;
|
|
450 #-------------------------------------------------------------------------------
|
|
451 my $t10output = $symmetrical_file."_fin_hit_all_2";
|
|
452 new_multispecies_t10($gap_sequence, $symmetrical_file, $t10output, join(".", @orgs));
|
|
453 #-------------------------------------------------------------------------------
|
|
454 }
|
|
455 next if $toescape == 1;
|
|
456 #------------------------------------------------------------------------------------------------
|
|
457 # BRINGING IT ALL TOGETHER: FINDING ORTHOLOGOUS MICROSATELLITES AMONG THE SPECIES
|
|
458
|
|
459
|
|
460 my @micros_array = ();
|
|
461 my $sampletag = ();
|
|
462 for my $i (0 ... $#sp_tags){
|
|
463 my $finhitFile = $pipedir.$sp_tags[$i]."_allmicrosats_symmetrical_fin_hit_all_2";
|
|
464 push(@micros_array, $finhitFile);
|
|
465 $sampletag = $sp_tags[$i];
|
|
466 }
|
|
467 #$sampletag =~ s/^([A-Z]+\.)/ORTH_/;
|
|
468 #$sampletag = $sampletag."_monoThresh-".$mono_threshold."bp";
|
|
469 my $orthfiletemp = $ptag."_orthfile";
|
|
470 my $orthanswer = multiSpecies_orthFinder4($t1input, join(":",@micros_array), $orthfiletemp, join(":", @orgs), $separation);
|
|
471
|
|
472 my $maskedorthfiletemp = $ptag."_orthfile_masked";
|
|
473 qualityFilter ($orthfiletemp, $chr_name_sputt, $maskedorthfiletemp);
|
|
474
|
|
475 push @outputfiles , $maskedorthfiletemp;
|
|
476 }
|
|
477 $date = `date`;
|
|
478 }
|
|
479
|
|
480 `cat @outputfiles > $orthfile`;
|
|
481
|
|
482 my $rootdir = $dir;
|
|
483 $rootdir =~ s/\/[A-Za-z0-9\-_]+$//;
|
|
484 chdir $rootdir;
|
|
485 remove_tree($dir);
|
|
486
|
|
487 #print "date = $date\n";
|
|
488 #remove_tree($tdir);
|
|
489 #------------------------------------------------------------------------------------------------
|
|
490 #------------------------------------------------------------------------------------------------
|
|
491 #------------------------------------------------------------------------------------------------
|
|
492 #------------------------------------------------------------------------------------------------
|
|
493
|
|
494 #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
|
|
495
|
|
496 sub maftoAxt_multispecies {
|
|
497 #print "in maftoAxt_multispecies : got @_\n";
|
|
498 my $fname=$_[0];
|
|
499 open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n";
|
|
500 my $treedefinition = $_[1];
|
|
501 open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n";
|
|
502 my $counter = 0;
|
|
503 my $exactspeciesset = $_[3];
|
|
504 my @exactspeciesset_unarranged = split(/,/,$exactspeciesset);
|
|
505
|
|
506 $treedefinition=~s/[\)\(, ]/\t/g;
|
|
507 my @species=split(/\t+/,$treedefinition);
|
|
508 my @exactspecies=();
|
|
509
|
|
510 foreach my $spec (@species){
|
|
511 foreach my $espec (@exactspeciesset_unarranged){
|
|
512 push @exactspecies, $spec if $spec eq $espec;
|
|
513 }
|
|
514 }
|
|
515 #print "exactspecies=@exactspecies\n";
|
|
516
|
|
517 ###########
|
|
518 my $select = 2;
|
|
519 #select = 1 if all species need sequences to be present for each block otherwise, it is 0
|
|
520 #select = 2 only the allowed set make up the alignment. use the removeset
|
|
521 # information to detect alignmenets that have other important genomes aligned.
|
|
522 ###########
|
|
523 my @allowedset = ();
|
|
524 @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0;
|
|
525 @allowedset = join("_",0,@species) if $select == 1;
|
|
526 #print "species = @species , allowedset =",join("\n", @allowedset) ," \n";
|
|
527 @allowedset = join("_",0,@exactspecies) if $select == 2;
|
|
528 #print "allowedset = @allowedset and exactspecies = @exactspecies\n";
|
|
529
|
|
530 my $start = 0;
|
|
531 my @sequences = ();
|
|
532 my @titles = ();
|
|
533 my $species_counter = "0";
|
|
534 my $countermatch = 0;
|
|
535 my $outsideSpecies=0;
|
|
536
|
|
537 while(my $line = <IN>){
|
|
538 # print $line;
|
|
539 next if $line =~ /^#/;
|
|
540 next if $line =~ /^i/;
|
|
541 chomp $line;
|
|
542 my @fields = split(/\s+/,$line);
|
|
543 chomp $line;
|
|
544 if ($line =~ /^a /){
|
|
545 $start = 1;
|
|
546 }
|
|
547
|
|
548 if ($line =~ /^s /){
|
|
549
|
|
550 foreach my $sp (@allspecies){
|
|
551 # print "checking species $sp\n";
|
|
552 if ($fields[1] =~ /$sp/){
|
|
553 $species_counter = $species_counter."_".$sp;
|
|
554 push(@sequences, $fields[6]);
|
|
555 my @sp_info = split(/\./,$fields[1]);
|
|
556 my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]);
|
|
557 push(@titles, $title);
|
|
558 # print "species_counter = $species_counter\n";
|
|
559 }
|
|
560 }
|
|
561 }
|
|
562
|
|
563 if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){
|
|
564 # print "species_counter = $species_counter\n";
|
|
565 my $arranged = reorderSpecies($species_counter, @allspecies);
|
|
566 my $stopper = 1;
|
|
567 my $arrno = 0;
|
|
568
|
|
569 # print "checking if ", scalar(@sequences), " match @exactspecies allowedset=@allowedset\n";
|
|
570 if (scalar(@sequences) == scalar(@exactspecies)){
|
|
571 foreach my $set (@allowedset){
|
|
572 # print "testing $arranged against $set\n";
|
|
573 if ($arranged eq $set){
|
|
574 $stopper = 0; last;
|
|
575 }
|
|
576 $arrno++;
|
|
577 }
|
|
578 }
|
|
579 else{
|
|
580 $stopper = 1;
|
|
581 }
|
|
582
|
|
583
|
|
584 if ($stopper == 0) {
|
|
585 @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged;
|
|
586 @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged;
|
|
587 my $filteredseq = filter_gaps(@sequences);
|
|
588
|
|
589 if ($filteredseq ne "SHORT"){
|
|
590 #print "printing"; <STDIN>;
|
|
591 $counter++;
|
|
592 print OUT join (" ",$counter, @titles), "\n";
|
|
593 print OUT $filteredseq, "\n";
|
|
594 print OUT "\n";
|
|
595 $countermatch++;
|
|
596 }
|
|
597 }
|
|
598 else{ #print "nexting\n";<STDIN>;
|
|
599 }
|
|
600
|
|
601 @sequences = (); @titles = (); $start = 0;$species_counter = "0";
|
|
602 next;
|
|
603
|
|
604 }
|
|
605 }
|
|
606 # print "countermatch = $countermatch\n";
|
|
607 }
|
|
608
|
|
609 sub reorderSpecies{
|
|
610 my @inarr=@_;
|
|
611 my $currSpecies = shift (@inarr);
|
|
612 my $ordered_species = 0;
|
|
613 my @species=@inarr;
|
|
614 #print "species = @species\n";
|
|
615 foreach my $order (@species){
|
|
616 $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/;
|
|
617 }
|
|
618 return $ordered_species;
|
|
619
|
|
620 }
|
|
621
|
|
622 sub filter_gaps{
|
|
623 my @sequences = @_;
|
|
624 # print "sequences sent are @sequences\n";
|
|
625 my $seq_length = length($sequences[0]);
|
|
626 my $seq_no = scalar(@sequences);
|
|
627 my $allgaps = ();
|
|
628 for (1 ... $seq_no){
|
|
629 $allgaps = $allgaps."-";
|
|
630 }
|
|
631
|
|
632 my @seq_array = ();
|
|
633 my $seq_counter = 0;
|
|
634 foreach my $seq (@sequences){
|
|
635 # my @sequence = split(/\s*/,$seq);
|
|
636 $seq_array[$seq_counter] = [split(/\s*/,$seq)];
|
|
637 # push @seq_array, [@sequence];
|
|
638 $seq_counter++;
|
|
639 }
|
|
640 my $g = 0;
|
|
641 while ( $g < $seq_length){
|
|
642 last if (!exists $seq_array[0][$g]);
|
|
643 my $bases = ();
|
|
644 for my $u (0 ... $#seq_array){
|
|
645 $bases = $bases.$seq_array[$u][$g];
|
|
646 }
|
|
647 # print $bases, "\n";
|
|
648 if ($bases eq $allgaps){
|
|
649 # print "bases are $bases, position is $g \n";
|
|
650 for my $seq (@seq_array){
|
|
651 splice(@$seq , $g, 1);
|
|
652 }
|
|
653 }
|
|
654 else {
|
|
655 $g++;
|
|
656 }
|
|
657 }
|
|
658
|
|
659 my @outs = ();
|
|
660
|
|
661 foreach my $seq (@seq_array){
|
|
662 push(@outs, join("",@$seq));
|
|
663 }
|
|
664 return "SHORT" if length($outs[0]) <=100;
|
|
665 return (join("\n", @outs));
|
|
666 }
|
|
667
|
|
668
|
|
669 sub allowedSetOfSpecies{
|
|
670 my @allowed_species = split(/_/,$_[0]);
|
|
671 unshift @allowed_species, 0;
|
|
672 # print "allowed set = @allowed_species \n";
|
|
673 my @output = ();
|
|
674 for (0 ... scalar(@allowed_species) - 4){
|
|
675 push(@output, join("_",@allowed_species));
|
|
676 pop @allowed_species;
|
|
677 }
|
|
678 return join(";",reverse(@output));
|
|
679
|
|
680 }
|
|
681
|
|
682
|
|
683 sub orderInfo{
|
|
684 my @info = split(/;/,$_[0]);
|
|
685 # print "info = @info";
|
|
686 my @old = split(/_/,$_[1]);
|
|
687 my @new = split(/_/,$_[2]);
|
|
688 shift @old; shift @new;
|
|
689 my @outinfo = ();
|
|
690 foreach my $spe (@new){
|
|
691 for my $no (0 ... $#old){
|
|
692 if ($spe eq $old[$no]){
|
|
693 push(@outinfo, $info[$no]);
|
|
694 }
|
|
695 }
|
|
696 }
|
|
697 # print "outinfo = @outinfo \n";
|
|
698 return join(";", @outinfo);
|
|
699 }
|
|
700
|
|
701 #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx
|
|
702
|
|
703 #xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx
|
|
704 sub artificial_axdata_inverter{
|
|
705 open(IN,"<$_[0]") or die "Cannot open file $_[0]: $!";
|
|
706 open(OUT,">$_[1]") or die "Cannot open file $_[1]: $!";
|
|
707 my $linecounter=0;
|
|
708 while (my $line = <IN>){
|
|
709 $linecounter++;
|
|
710 #print "$linecounter\n";
|
|
711 chomp $line;
|
|
712 my $final_line = $line;
|
|
713 my $trycounter = 0;
|
|
714 if ($line =~ /^[a-zA-Z\-]/){
|
|
715 # while ($final_line eq $line){
|
|
716 my @fields = split(/\s*/,$line);
|
|
717
|
|
718 $final_line = join("",reverse(@fields));
|
|
719 # print colored ['red'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
|
|
720 # $trycounter++;
|
|
721 # print "trying again....$trycounter : $final_line\n" if $final_line eq $line;
|
|
722 # }
|
|
723 }
|
|
724
|
|
725 # print colored ['yellow'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
|
|
726 if ($line =~ /^[0-9]/){
|
|
727 $line =~ s/chr([A-Z0-9a-b]+)/chr$1r/g;
|
|
728 $final_line = $line;
|
|
729 }
|
|
730 print OUT $final_line,"\n";
|
|
731 #print "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/;
|
|
732 }
|
|
733 close OUT;
|
|
734 }
|
|
735 #xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx
|
|
736
|
|
737
|
|
738 #xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx
|
|
739
|
|
740 sub multi_species_t1 {
|
|
741
|
|
742 my $input1 = $_[0];
|
|
743 #print "@_\n"; <STDIN>;
|
|
744 my @tags = split(/_/, $_[1]);
|
|
745 my @outputs = split(/,/, $_[2]);
|
|
746 my $title_query = $_[3];
|
|
747 my @handles = ();
|
|
748
|
|
749 open(FILEB,"<$input1")or die "Cannot open file: $input1 $!";
|
|
750 my $i = 0;
|
|
751 foreach my $path (@outputs){
|
|
752 $handles[$i] = IO::Handle->new();
|
|
753 open ($handles[$i], ">$path") or die "Can't open $path : $!";
|
|
754 $i++;
|
|
755 }
|
|
756
|
|
757 my $curdef;
|
|
758 my $start = 0;
|
|
759
|
|
760 while (my $line = <FILEB> ) {
|
|
761 if ($line =~ /^\d/){
|
|
762 $line =~ s/ +/\t/g;
|
|
763 my @fields = split(/\s+/, $line);
|
|
764 if (($line =~ /$title_query/)){
|
|
765 my $title = $line;
|
|
766 my $counter = 0;
|
|
767 foreach my $tag (@tags){
|
|
768 $line = <FILEB>;
|
|
769 print {$handles[$counter]} ">",$tag,"\t",$title, " ",$line;
|
|
770 $counter++;
|
|
771 }
|
|
772 }
|
|
773 else{
|
|
774 foreach my $tag (@tags){
|
|
775 my $tine = <FILEB>;
|
|
776 }
|
|
777 }
|
|
778
|
|
779 }
|
|
780 }
|
|
781
|
|
782 foreach my $hand (@handles){
|
|
783 $hand->close();
|
|
784 }
|
|
785
|
|
786 close FILEB;
|
|
787 }
|
|
788
|
|
789 #xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx
|
|
790
|
|
791 #xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx
|
|
792
|
|
793 sub multi_species_t2{
|
|
794
|
|
795 my $input = $_[0];
|
|
796 my $species = $_[1];
|
|
797 my $output1 = $input."_unr";
|
|
798
|
|
799 #------------------------------------------------------------------------------------------
|
|
800 open (FILEF1, "<$input") or die "Cannot open file $input :$!";
|
|
801 open (FILEF2, ">$output1") or die "Cannot open file $output1 :$!";
|
|
802
|
|
803 my $line1 = <FILEF1>;
|
|
804
|
|
805 while($line1){
|
|
806 {
|
|
807 # chomp($line);
|
|
808 if ($line1 =~ (m/^\>$species/)){
|
|
809 chomp($line1);
|
|
810 print FILEF2 $line1;
|
|
811 $line1 = <FILEF1>;
|
|
812 chomp($line1);
|
|
813 print FILEF2 "\t", $line1,"\n";
|
|
814 }
|
|
815 }
|
|
816 $line1 = <FILEF1>;
|
|
817 }
|
|
818
|
|
819 close FILEF1;
|
|
820 close FILEF2;
|
|
821 #------------------------------------------------------------------------------------------
|
|
822
|
|
823 my $output2 = $output1."and";
|
|
824 my $output3 = $output1."and2";
|
|
825 open(IN,"<$output1");
|
|
826 open (FILEF3, ">$output2");
|
|
827 open (FILEF4, ">$output3");
|
|
828
|
|
829
|
|
830 while (<IN>){
|
|
831 my $line = $_;
|
|
832 chomp($line);
|
|
833 my @fields=split (/\t/, $line);
|
|
834 # print $line,"\n"; <STDIN>;
|
|
835 if($line !~ /random/){
|
|
836 print FILEF3 join ("\t",@fields[0 ... scalar(@fields)-2]), "\n", $fields[scalar(@fields)-1], "\n";
|
|
837 print FILEF4 join ("\t",@fields[0 ... scalar(@fields)-2]), "\t", $fields[scalar(@fields)-1], "\n";
|
|
838 }
|
|
839 }
|
|
840
|
|
841
|
|
842 close IN;
|
|
843 close FILEF3;
|
|
844 close FILEF4;
|
|
845 unlink $output1;
|
|
846
|
|
847 #------------------------------------------------------------------------------------------
|
|
848 # OLD T3.PL RUDIMENT
|
|
849
|
|
850 my $t3output = $output2;
|
|
851 $t3output =~ s/gap_op_unrand/nogap_op_unrand/g;
|
|
852
|
|
853 open(IN,"<$output2");
|
|
854 open(OUTA,">$t3output");
|
|
855
|
|
856
|
|
857 while (<IN>){
|
|
858 s/-//g unless /^>/;
|
|
859 print OUTA;
|
|
860 }
|
|
861
|
|
862 close IN;
|
|
863 close OUTA;
|
|
864 #------------------------------------------------------------------------------------------
|
|
865 }
|
|
866 #xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx
|
|
867
|
|
868
|
|
869 #xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxxmulti_species_t2_2 xxxxxxx
|
|
870 sub multi_species_t2_2{
|
|
871 #print "IN multi_species_t2_2 : @_\n";
|
|
872 my $input = $_[0];
|
|
873 my $species = $_[1];
|
|
874 my $output1 = $input."2";
|
|
875
|
|
876
|
|
877 open (FILEF1, "<$input");
|
|
878 open (FILEF2, ">$output1");
|
|
879
|
|
880 my $line1 = <FILEF1>;
|
|
881
|
|
882 while($line1){
|
|
883 {
|
|
884 # chomp($line);
|
|
885 if ($line1 =~ (m/^\>$species/)){
|
|
886 chomp($line1);
|
|
887 print FILEF2 $line1;
|
|
888 $line1 = <FILEF1>;
|
|
889 chomp($line1);
|
|
890 print FILEF2 "\t", $line1,"\n";
|
|
891 }
|
|
892 }
|
|
893 $line1 = <FILEF1>;
|
|
894 }
|
|
895
|
|
896 close FILEF1;
|
|
897 close FILEF2;
|
|
898 }
|
|
899
|
|
900 #xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx
|
|
901
|
|
902
|
|
903 #xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx
|
|
904 sub sputnikoutput_corrector{
|
|
905 my $input = $_[0];
|
|
906 my $output = $_[1];
|
|
907 open(IN,"<$input") or die "Cannot open file $input :$!";
|
|
908 open(OUT,">$output") or die "Cannot open file $output :$!";
|
|
909 my $tine;
|
|
910 while (my $line=<IN>){
|
|
911 if($line =~/length /){
|
|
912 $tine = $line;
|
|
913 $tine =~ s/\s+/\t/g;
|
|
914 my @fields = split(/\t/,$tine);
|
|
915 if ($fields[6] > 60){
|
|
916 print OUT $line;
|
|
917 $line = <IN>;
|
|
918
|
|
919 while (($line !~ /nucleotide/) && ($line !~ /^>/)){
|
|
920 chomp $line;
|
|
921 print OUT $line;
|
|
922 $line = <IN>;
|
|
923 }
|
|
924 print OUT "\n";
|
|
925 print OUT $line;
|
|
926 }
|
|
927 else{
|
|
928 print OUT $line;
|
|
929 }
|
|
930 }
|
|
931 else{
|
|
932 print OUT $line;
|
|
933 }
|
|
934 }
|
|
935 close IN;
|
|
936 close OUT;
|
|
937 }
|
|
938 #xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx
|
|
939
|
|
940
|
|
941 #xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx
|
|
942 sub multi_species_t4{
|
|
943 # print "multi_species_t4 : @_\n";
|
|
944 my $input = $_[0];
|
|
945 my $output = $_[1];
|
|
946 open (FILEA, "<$input");
|
|
947 open (FILEB, ">$output");
|
|
948
|
|
949 my $line = <FILEA>;
|
|
950
|
|
951 while ($line) {
|
|
952 # chomp $line;
|
|
953 if ($line =~ />/) {
|
|
954 chomp $line;
|
|
955 print FILEB $line, "\n";
|
|
956 }
|
|
957
|
|
958
|
|
959 if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){
|
|
960 chomp $line;
|
|
961 print FILEB $line, " " ;
|
|
962 $line = <FILEA>;
|
|
963 chomp $line;
|
|
964 print FILEB $line,"\n";
|
|
965 }
|
|
966
|
|
967 $line = <FILEA>;
|
|
968 }
|
|
969
|
|
970
|
|
971 close FILEA;
|
|
972 close FILEB;
|
|
973
|
|
974 }
|
|
975
|
|
976 #xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx
|
|
977
|
|
978
|
|
979 #xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx
|
|
980 sub multi_species_t5{
|
|
981
|
|
982 my $input = $_[0];
|
|
983 my $output = $_[1];
|
|
984
|
|
985 open(FILEB,"<$input");
|
|
986 open(FILEC,">$output");
|
|
987
|
|
988 my $curdef;
|
|
989
|
|
990 while (my $line = <FILEB> ) {
|
|
991
|
|
992 if ($line =~ /^>/){
|
|
993 chomp $line;
|
|
994 $curdef = $line;
|
|
995 next;
|
|
996 }
|
|
997
|
|
998 if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){
|
|
999 print FILEC $curdef," ",$line;
|
|
1000 }
|
|
1001
|
|
1002 }
|
|
1003
|
|
1004
|
|
1005 close FILEB;
|
|
1006 close FILEC;
|
|
1007
|
|
1008 }
|
|
1009 #xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx
|
|
1010
|
|
1011
|
|
1012 #xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx
|
|
1013 sub multi_species_t6{
|
|
1014 my $input = $_[0];
|
|
1015 my $output = $_[1];
|
|
1016 my $focalstrand=$_[3];
|
|
1017 # print "inpput = @_\n";
|
|
1018 open (FILE, "<$input");
|
|
1019 open (FILE_MICRO, ">$output");
|
|
1020 my $linecounter=0;
|
|
1021 while (my $line = <FILE>){
|
|
1022 $linecounter++;
|
|
1023 chomp $line;
|
|
1024 #print "line = $line\n";
|
|
1025 #MONO#
|
|
1026 $line =~ /$focalspec\s[a-zA-Z]+[0-9a-zA-Z]+\s[0-9]+\s[0-9]+\s([+\-])/;
|
|
1027 my $strand=$1;
|
|
1028 my $no_of_species = ($line =~ s/\s+[+\-]\s+/ /g);
|
|
1029 #print "line = $line\n";
|
|
1030 my $specfieldsend = 2 + ($no_of_species*4) - 1;
|
|
1031 my @fields = split(/\s+/, $line);
|
|
1032 my @speciesdata = @fields[0 ... $specfieldsend];
|
|
1033 $line =~ /([a-z]+nucleotide)\s([0-9]+)\s:\s([0-9]+)/;
|
|
1034 my ($tide, $start, $end) = ($1, $2, $3);
|
|
1035 #print "no_of_species=$no_of_species.. speciesdata = @speciesdata and ($tide, $start, $end)\n";
|
|
1036 if($line =~ /mononucleotide/){
|
|
1037 print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], mono($fields[$#fields]),),"\n";
|
|
1038 }
|
|
1039 #DI#
|
|
1040 elsif($line =~ /dinucleotide/){
|
|
1041 print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], di($fields[$#fields]),),"\n";
|
|
1042 }
|
|
1043 #TRI#
|
|
1044 elsif($line =~ /trinucleotide/ ){
|
|
1045 print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tri($fields[$#fields]),),"\n";
|
|
1046 }
|
|
1047 #TETRA#
|
|
1048 elsif($line =~ /tetranucleotide/){
|
|
1049 print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tetra($fields[$#fields]),),"\n";
|
|
1050 }
|
|
1051 #PENTA#
|
|
1052 elsif($line =~ /pentanucleotide/){
|
|
1053 #print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], penta($fields[$#fields]),),"\n";
|
|
1054 }
|
|
1055 else{
|
|
1056 # print "not: @fields\n";
|
|
1057 }
|
|
1058 }
|
|
1059 # print "linecounter=$linecounter\n";
|
|
1060 close FILE;
|
|
1061 close FILE_MICRO;
|
|
1062 }
|
|
1063
|
|
1064 sub mono {
|
|
1065 my $st = $_[0];
|
|
1066 my $tp = unpack "A1"x(length($st)/1),$st;
|
|
1067 my $var1 = substr($tp, 0, 1);
|
|
1068 return join ("\t", $var1);
|
|
1069 }
|
|
1070 sub di {
|
|
1071 my $st = $_[0];
|
|
1072 my $tp = unpack "A2"x(length($st)/2),$st;
|
|
1073 my $var1 = substr($tp, 0, 2);
|
|
1074 return join ("\t", $var1);
|
|
1075 }
|
|
1076 sub tri {
|
|
1077 my $st = $_[0];
|
|
1078 my $tp = unpack "A3"x(length($st)/3),$st;
|
|
1079 my $var1 = substr($tp, 0, 3);
|
|
1080 return join ("\t", $var1);
|
|
1081 }
|
|
1082 sub tetra {
|
|
1083 my $st = $_[0];
|
|
1084 my $tp = unpack "A4"x(length($st)/4),$st;
|
|
1085 my $var1 = substr($tp, 0, 4);
|
|
1086 return join ("\t", $var1);
|
|
1087 }
|
|
1088 sub penta {
|
|
1089 my $st = $_[0];
|
|
1090 my $tp = unpack "A5"x(length($st)/5),$st;
|
|
1091 my $var1 = substr($tp, 0, 5);
|
|
1092 return join ("\t", $var1);
|
|
1093 }
|
|
1094
|
|
1095 #xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx
|
|
1096
|
|
1097
|
|
1098 #xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx
|
|
1099 sub t9{
|
|
1100 my $input1 = $_[0];
|
|
1101 my $input2 = $_[1];
|
|
1102 my $output = $_[2];
|
|
1103
|
|
1104
|
|
1105 open(IN1,"<$input1") if -e $input1;
|
|
1106 open(IN2,"<$input2") or die "cannot open file $_[1] : $!";
|
|
1107 open(OUT,">$output") or die "cannot open file $_[2] : $!";
|
|
1108
|
|
1109
|
|
1110 my %seen = ();
|
|
1111 my $prevkey = 0;
|
|
1112
|
|
1113 if (-e $input1){
|
|
1114 while (my $line = <IN1>){
|
|
1115 chomp($line);
|
|
1116 my @fields = split(/\t/,$line);
|
|
1117 my $key1 = join ("_K10K1_",@fields[0,1,3,4,5]);
|
|
1118 # print "key in t9 = $key1\n";
|
|
1119 $seen{$key1}++ if ($prevkey ne $key1) ;
|
|
1120 $prevkey = $key1;
|
|
1121 }
|
|
1122 # print "done first hash\n";
|
|
1123 close IN1;
|
|
1124 }
|
|
1125
|
|
1126 while (my $line = <IN2>){
|
|
1127 # print $line, "**\n";
|
|
1128 if (-e $input1){
|
|
1129 chomp($line);
|
|
1130 my @fields = split(/\t/,$line);
|
|
1131 my $key2 = join ("_K10K1_",@fields[0,1,3,4,5]);
|
|
1132 if (exists $seen{$key2}){
|
|
1133 print OUT "$line\n" ;
|
|
1134 delete $seen{$key2};
|
|
1135 }
|
|
1136 }
|
|
1137 else {
|
|
1138 print OUT "$line\n" ;
|
|
1139 # print "$line\n" ;
|
|
1140 }
|
|
1141 }
|
|
1142
|
|
1143 close IN2;
|
|
1144 close OUT;
|
|
1145 }
|
|
1146 #xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx
|
|
1147
|
|
1148
|
|
1149 #xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx
|
|
1150
|
|
1151
|
|
1152 sub multiSpecies_compound_microsat_hunter3{
|
|
1153
|
|
1154 my $input1 = $_[0]; ###### the *_sput_op4_ii file
|
|
1155 my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2"
|
|
1156 my $output1 = $_[2]; ###### plain microsatellite file
|
|
1157 my $output2 = $_[3]; ###### compound microsatellite file
|
|
1158 my $org = $_[4]; ###### 1 or 2
|
|
1159 $no_of_species = $_[5];
|
|
1160 #print "IN multiSpecies_compound_microsat_hunter3: @_\n";
|
|
1161 #my @tags = split(/\t/,$info);
|
|
1162 sub compoundify;
|
|
1163 open(IN,"<$input1") or die "Cannot open file $input1 $!";
|
|
1164 open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
|
|
1165 open(OUT,">$output1") or die "Cannot open file $output1 $!";
|
|
1166 open(OUT2,">$output2") or die "Cannot open file $output2 $!";
|
|
1167 $infocord = 2 + (4*$no_of_species) - 1;
|
|
1168 $startcord = 2 + (4*$no_of_species) + 2 - 1;
|
|
1169 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
|
|
1170 $endcord = 2 + (4*$no_of_species) + 4 - 1;
|
|
1171 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
|
|
1172 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
|
|
1173 my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
|
|
1174
|
|
1175 my @thresholds = ("0");
|
|
1176 push(@thresholds, split(/_/,$_[6]));
|
|
1177 sub thresholdCheck;
|
|
1178 my %micros = ();
|
|
1179 while (my $line = <IN>){
|
|
1180 # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n";
|
|
1181 next if $line =~ /\t\t/;
|
|
1182 if ($line =~ /^>[A-Za-z0-9_]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
|
|
1183 my $key = join("\t",$1, $2, $3, $4, $5);
|
|
1184 # print $key, "#-#-#-#-#-#-#-#\n";
|
|
1185 push (@{$micros{$key}},$line);
|
|
1186 }
|
|
1187 else{
|
|
1188 }
|
|
1189 }
|
|
1190 close IN;
|
|
1191 my @deletedlines = ();
|
|
1192
|
|
1193 my $linecount = 0;
|
|
1194
|
|
1195 while(my $sine = <SEQ>){
|
|
1196 my %microstart=();
|
|
1197 my %microend=();
|
|
1198
|
|
1199 my @sields = split(/\t/,$sine);
|
|
1200
|
|
1201 my $key = ();
|
|
1202
|
|
1203 if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
|
|
1204 $key = join("\t",$1, $2, $3, $4, $5);
|
|
1205 # print $key, "<-<-<-<-<-<-<-<\n";
|
|
1206 }
|
|
1207 else{
|
|
1208 }
|
|
1209
|
|
1210 if (exists $micros{$key}){
|
|
1211 $linecount++;
|
|
1212 my @microstring = @{$micros{$key}};
|
|
1213 my @tempmicrostring = @{$micros{$key}};
|
|
1214
|
|
1215 foreach my $line (@tempmicrostring){
|
|
1216 my @fields = split(/\t/,$line);
|
|
1217 my $start = $fields[$startcord];
|
|
1218 my $end = $fields[$endcord];
|
|
1219 push (@{$microstart{$start}},$line);
|
|
1220 push (@{$microend{$end}},$line);
|
|
1221 }
|
|
1222 my $firstflag = 'down';
|
|
1223 while( my $line =shift(@microstring)){
|
|
1224 # print "-----------\nline = $line ";
|
|
1225 chomp $line;
|
|
1226 my @fields = split(/\t/,$line);
|
|
1227 my $start = $fields[$startcord];
|
|
1228 my $end = $fields[$endcord];
|
|
1229 my $startmicro = $line;
|
|
1230 my $endmicro = $line;
|
|
1231
|
|
1232 # print "fields=@fields, start = $start end=$end, startcord=$startcord, endcord=$endcord\n";
|
|
1233
|
|
1234 delete ($microstart{$start});
|
|
1235 delete ($microend{$end});
|
|
1236 my $flag = 'down';
|
|
1237 my $startflag = 'down';
|
|
1238 my $endflag = 'down';
|
|
1239 my $prestart = $start - $distance;
|
|
1240 my $postend = $end + $distance;
|
|
1241 my @compoundlines = ();
|
|
1242 my %compoundhash = ();
|
|
1243 push (@compoundlines, $line);
|
|
1244 push (@{$compoundhash{$line}},$line);
|
|
1245 my $startrank = 1;
|
|
1246 my $endrank = 1;
|
|
1247
|
|
1248 while( ($startflag eq "down") || ($endflag eq "down") ){
|
|
1249 if ((($prestart < 0) && $firstflag eq "up") || (($postend > length($sields[$sequencepos])) && $firstflag eq "up") ) {
|
|
1250 # print "coming to the end of sequence,prestart = $prestart & post end = $postend and sequence length =", length($sields[$sequencepos])," so exiting\n";
|
|
1251 last;
|
|
1252 }
|
|
1253
|
|
1254 $firstflag = "up";
|
|
1255 if ($startflag eq "down"){
|
|
1256 for my $i ($prestart ... $start){
|
|
1257
|
|
1258 if(exists $microend{$i}){
|
|
1259 chomp $microend{$i}[0];
|
|
1260 if(exists $compoundhash{$microend{$i}[0]}) {next;}
|
|
1261 # print "sending from microend $startmicro, $microend{$i}[0] |||\n";
|
|
1262 if (identityMatch_thresholdCheck($startmicro, $microend{$i}[0], $startrank) eq "proceed"){
|
|
1263 push(@compoundlines, $microend{$i}[0]);
|
|
1264 # print "accepted\n";
|
|
1265 my @tields = split(/\t/,$microend{$i}[0]);
|
|
1266 $startmicro = $microend{$i}[0];
|
|
1267 chomp $startmicro;
|
|
1268 $start = $tields[$startcord];
|
|
1269 $flag = 'down';
|
|
1270 $startrank++;
|
|
1271 # print "startcompund = $microend{$i}[0]\n";
|
|
1272 delete $microend{$i};
|
|
1273 delete $microstart{$start};
|
|
1274 $startflag = 'down';
|
|
1275 $prestart = $start - $distance;
|
|
1276 last;
|
|
1277 }
|
|
1278 else{
|
|
1279 $flag = 'up';
|
|
1280 $startflag = 'up';
|
|
1281 }
|
|
1282 }
|
|
1283 else{
|
|
1284 $flag = 'up';
|
|
1285 $startflag = 'up';
|
|
1286 }
|
|
1287 }
|
|
1288 }
|
|
1289
|
|
1290 $endrank = $startrank;
|
|
1291
|
|
1292 if ($endflag eq "down"){
|
|
1293 for my $i ($end ... $postend){
|
|
1294
|
|
1295 if(exists $microstart{$i} ){
|
|
1296 chomp $microstart{$i}[0];
|
|
1297 if(exists $compoundhash{$microstart{$i}[0]}) {next;}
|
|
1298 # print "sending from microstart $endmicro, $microstart{$i}[0] |||\n";
|
|
1299
|
|
1300 if(identityMatch_thresholdCheck($endmicro,$microstart{$i}[0], $endrank) eq "proceed"){
|
|
1301 push(@compoundlines, $microstart{$i}[0]);
|
|
1302 # print "accepted\n";
|
|
1303 my @tields = split(/\t/,$microstart{$i}[0]);
|
|
1304 $end = $tields[$endcord]-0;
|
|
1305 $endmicro = $microstart{$i}[0];
|
|
1306 $endrank++;
|
|
1307 chomp $endmicro;
|
|
1308 $flag = 'down';
|
|
1309 # print "endcompund = $microstart{$i}[0]\n";
|
|
1310 delete $microstart{$i};
|
|
1311 delete $microend{$end};
|
|
1312 shift @microstring;
|
|
1313 $postend = $end + $distance;
|
|
1314 $endflag = 'down';
|
|
1315 last;
|
|
1316 }
|
|
1317 else{
|
|
1318 $flag = 'up';
|
|
1319 $endflag = 'up';
|
|
1320 }
|
|
1321 }
|
|
1322 else{
|
|
1323 $flag = 'up';
|
|
1324 $endflag = 'up';
|
|
1325 }
|
|
1326 }
|
|
1327 }
|
|
1328 # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n";
|
|
1329 } #end while( $flag eq "down")
|
|
1330 # print "compoundlines = @compoundlines \n";
|
|
1331 if (scalar (@compoundlines) == 1){
|
|
1332 print OUT $line,"\n";
|
|
1333 }
|
|
1334 if (scalar (@compoundlines) > 1){
|
|
1335 my $compoundline = compoundify(\@compoundlines, $sields[$sequencepos]);
|
|
1336 # print $compoundline,"\n";
|
|
1337 print OUT2 $compoundline,"\n";
|
|
1338 }
|
|
1339 } #end foreach my $line (@microstring){
|
|
1340 } #if (exists $micros{$key}){
|
|
1341
|
|
1342
|
|
1343 }
|
|
1344
|
|
1345 close OUT;
|
|
1346 close OUT2;
|
|
1347 }
|
|
1348
|
|
1349
|
|
1350 #------------------------------------------------------------------------------------------------
|
|
1351 sub compoundify{
|
|
1352 my ($compoundlines, $sequence) = @_;
|
|
1353 # print "\nfound to compound : @$compoundlines and$sequence \n";
|
|
1354 my $noOfComps = @$compoundlines;
|
|
1355 # print "Number of elements in hash is $noOfComps\n";
|
|
1356 my @starts;
|
|
1357 my @ends;
|
|
1358 foreach my $line (@$compoundlines){
|
|
1359 # print "compoundify.. line = $line \n";
|
|
1360 chomp $line;
|
|
1361 my @fields = split(/\t/,$line);
|
|
1362 my $start = $fields[$startcord];
|
|
1363 my $end = $fields[$endcord];
|
|
1364 # print "start = $start, end = $end \n";
|
|
1365 push(@starts, $start);
|
|
1366 push(@ends,$end);
|
|
1367 }
|
|
1368 my @temp = @$compoundlines;
|
|
1369 my $startline=$temp[0];
|
|
1370 my @mields = split(/\t/,$startline);
|
|
1371 my $startcoord = $mields[$startcord];
|
|
1372 my $startgapsign=$mields[$endcord];
|
|
1373 my @startsorted = sort { $a <=> $b } @starts;
|
|
1374 my @endsorted = sort { $a <=> $b } @ends;
|
|
1375 my @intervals;
|
|
1376 for my $end (0 ... (scalar(@endsorted)-2)){
|
|
1377 my $interval = substr($sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1));
|
|
1378 push(@intervals,$interval);
|
|
1379 # print "interval = $interval =\n";
|
|
1380 # print "substr(sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1))\n";
|
|
1381 }
|
|
1382 push(@intervals,"");
|
|
1383 my $compoundmicrosat=();
|
|
1384 my $multiunit="";
|
|
1385 foreach my $line (@$compoundlines){
|
|
1386 my @fields = split(/\t/,$line);
|
|
1387 my $component="[".$fields[$microsatcord]."]".shift(@intervals);
|
|
1388 $compoundmicrosat=$compoundmicrosat.$component;
|
|
1389 $multiunit=$multiunit."[".$fields[$motifcord]."]";
|
|
1390 # print "multiunit = $multiunit\n";
|
|
1391 }
|
|
1392 my $compoundcopy = $compoundmicrosat;
|
|
1393 $compoundcopy =~ s/\[|\]//g;
|
|
1394 my $compoundlength = $mields[$startcord] + length($compoundcopy) - 1;
|
|
1395
|
|
1396
|
|
1397 my $compoundline = join("\t",(@mields[0 ... $infocord], "compound",@mields[$startcord ... $startcord+1],$compoundlength,$compoundmicrosat, $multiunit));
|
|
1398 return $compoundline;
|
|
1399 }
|
|
1400
|
|
1401 #------------------------------------------------------------------------------------------------
|
|
1402
|
|
1403 sub identityMatch_thresholdCheck{
|
|
1404 my $line1 = $_[0];
|
|
1405 my $line2 = $_[1];
|
|
1406 my $rank = $_[2];
|
|
1407 my @lields1 = split(/\t/,$line1);
|
|
1408 my @lields2 = split(/\t/,$line2);
|
|
1409 # print "recieved $line1 && $line2\n motif comparison: ", length($lields1[$motifcord])," : ",length($lields2[$motifcord]),"\n";
|
|
1410
|
|
1411 if (length($lields1[$motifcord]) == length($lields2[$motifcord])){
|
|
1412 my $probe = $lields1[$motifcord].$lields1[$motifcord];
|
|
1413 #print "$probe :: $lields2[$motifcord]\n";
|
|
1414 return "proceed" if $probe =~ /$lields2[$motifcord]/;
|
|
1415 #print "line recieved\n";
|
|
1416 if ($rank ==1){
|
|
1417 return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed";
|
|
1418 }
|
|
1419 else {
|
|
1420 return "proceed" if thresholdCheck($line2) eq "proceed";
|
|
1421 return "stop";
|
|
1422 }
|
|
1423 }
|
|
1424 else{
|
|
1425 if ($rank ==1){
|
|
1426 return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed";
|
|
1427 }
|
|
1428 else {
|
|
1429 return "proceed" if thresholdCheck($line2) eq "proceed";
|
|
1430 return "stop";
|
|
1431 }
|
|
1432 }
|
|
1433 return "stop";
|
|
1434 }
|
|
1435 #------------------------------------------------------------------------------------------------
|
|
1436
|
|
1437 sub thresholdCheck{
|
|
1438 my @checkthresholds=(0,@thresholds);
|
|
1439 #print "IN thresholdCheck: @_\n";
|
|
1440 my $line = $_[0];
|
|
1441 my @lields = split(/\t/,$line);
|
|
1442 return "proceed" if length($lields[$microsatcord]) >= $checkthresholds[length($lields[$motifcord])];
|
|
1443 return "stop";
|
|
1444 }
|
|
1445 #xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx
|
|
1446
|
|
1447
|
|
1448 #xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx
|
|
1449
|
|
1450 sub multiSpecies_filtering_interrupted_microsats{
|
|
1451 # print "IN multiSpecies_filtering_interrupted_microsats: @_\n";
|
|
1452 my $unfiltered = $_[0];
|
|
1453 my $filtered = $_[1];
|
|
1454 my $residue = $_[2];
|
|
1455 my $no_of_species = $_[5];
|
|
1456 open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!";
|
|
1457 open(FIL,">$filtered") or die "Cannot open file $filtered: $!";
|
|
1458 open(RES,">$residue") or die "Cannot open file $residue: $!";
|
|
1459
|
|
1460 $infocord = 2 + (4*$no_of_species) - 1;
|
|
1461 $startcord = 2 + (4*$no_of_species) + 2 - 1;
|
|
1462 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
|
|
1463 $endcord = 2 + (4*$no_of_species) + 4 - 1;
|
|
1464 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
|
|
1465 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
|
|
1466
|
|
1467
|
|
1468 my @sub_thresholds = (0);
|
|
1469
|
|
1470 push(@sub_thresholds, split(/_/,$_[3]));
|
|
1471 my @thresholds = (0);
|
|
1472
|
|
1473 push(@thresholds, split(/_/,$_[4]));
|
|
1474
|
|
1475 while (my $line = <UNF>) {
|
|
1476 next if $line !~ /[a-z]/;
|
|
1477 #print $line;
|
|
1478 chomp $line;
|
|
1479 my @fields = split(/\t/,$line);
|
|
1480 my $motif = $fields[$motifcord];
|
|
1481 my $realmotif = $motif;
|
|
1482 #print "motif = $motif\n";
|
|
1483 if ($motif =~ /^\[/){
|
|
1484 $motif =~ s/^\[//g;
|
|
1485 my @motifs = split(/\]/,$motif);
|
|
1486 $realmotif = $motifs[0];
|
|
1487 }
|
|
1488 # print "realmotif = $realmotif";
|
|
1489 my $motif_size = length($realmotif);
|
|
1490
|
|
1491 my $microsat = $fields[$microsatcord];
|
|
1492 # print "microsat = $microsat\n";
|
|
1493 $microsat =~ s/^\[|\]$//sg;
|
|
1494 my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat);
|
|
1495
|
|
1496 $microsat = join("",@microsats);
|
|
1497 if (length($microsat) < $thresholds[$motif_size]) {
|
|
1498 # print length($microsat)," < ",$thresholds[$motif_size],"\n";
|
|
1499 print RES $line,"\n"; next;
|
|
1500 }
|
|
1501 my @lengths = ();
|
|
1502 foreach my $mic (@microsats){
|
|
1503 push(@lengths, length($mic));
|
|
1504 }
|
|
1505 if (largest_microsat(@lengths) < $sub_thresholds[$motif_size]) {
|
|
1506 # print largest_microsat(@lengths)," < ",$sub_thresholds[$motif_size],"\n";
|
|
1507 print RES $line,"\n"; next;}
|
|
1508 else {print FIL $line,"\n"; next;
|
|
1509 }
|
|
1510 }
|
|
1511 close FIL;
|
|
1512 close RES;
|
|
1513
|
|
1514 }
|
|
1515
|
|
1516 sub largest_microsat{
|
|
1517 my $counter = 0;
|
|
1518 my($max) = shift(@_);
|
|
1519 foreach my $temp (@_) {
|
|
1520 #print "finding largest array: $maxcounter \n";
|
|
1521 if($temp > $max){
|
|
1522 $max = $temp;
|
|
1523 }
|
|
1524 }
|
|
1525 return($max);
|
|
1526 }
|
|
1527
|
|
1528 #xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx
|
|
1529
|
|
1530
|
|
1531 #xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx
|
|
1532 sub multiSpecies_compound_microsat_analyzer{
|
|
1533 ####### PARAMETER ########
|
|
1534 ##########################
|
|
1535
|
|
1536 my $input1 = $_[0]; ###### the *_sput_op4_ii file
|
|
1537 my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
|
|
1538 my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format
|
|
1539 my $output2 = $_[3]; ###### the pure compound microsatellites
|
|
1540 my $org = $_[4];
|
|
1541 my $no_of_species = $_[5];
|
|
1542 # print "IN multiSpecies_compound_microsat_analyzer: $input1\n $input2\n $output1\n $output2\n $org\n $no_of_species\n";
|
|
1543 $infocord = 2 + (4*$no_of_species) - 1;
|
|
1544 $typecord = 2 + (4*$no_of_species) + 1 - 1;
|
|
1545 $startcord = 2 + (4*$no_of_species) + 2 - 1;
|
|
1546 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
|
|
1547 $endcord = 2 + (4*$no_of_species) + 4 - 1;
|
|
1548 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
|
|
1549 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
|
|
1550
|
|
1551 open(IN,"<$input1") or die "Cannot open file $input1 $!";
|
|
1552 open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
|
|
1553
|
|
1554 open(OUT,">$output1") or die "Cannot open file $output1 $!";
|
|
1555 open(OUT2,">$output2") or die "Cannot open file $output2 $!";
|
|
1556
|
|
1557
|
|
1558 # print "opened files \n";
|
|
1559 my %micros = ();
|
|
1560 my $keycounter=0;
|
|
1561 my $linecounter=0;
|
|
1562 while (my $line = <IN>){
|
|
1563 $linecounter++;
|
|
1564 if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
|
|
1565 my $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12);
|
|
1566 push (@{$micros{$key}},$line);
|
|
1567 $keycounter++;
|
|
1568 }
|
|
1569 else{
|
|
1570 # print "no key\n";
|
|
1571 }
|
|
1572 }
|
|
1573 close IN;
|
|
1574 my @deletedlines = ();
|
|
1575 # print "done hash . linecounter=$linecounter, keycounter=$keycounter\n";
|
|
1576 #---------------------------------------------------------------------------------------------------
|
|
1577 # NOW READING THE SEQUENCE FILE
|
|
1578 my $keyfound=0;
|
|
1579 my $keyexists=0;
|
|
1580 my $inter=0;
|
|
1581 my $pure=0;
|
|
1582
|
|
1583 while(my $sine = <SEQ>){
|
|
1584 my %microstart=();
|
|
1585 my %microend=();
|
|
1586 my @sields = split(/\t/,$sine);
|
|
1587 my $key = 0;
|
|
1588 if ($sine =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
|
|
1589 $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12);
|
|
1590 # print $sine;
|
|
1591 # print $key;
|
|
1592 $keyfound++;
|
|
1593 }
|
|
1594 else{
|
|
1595
|
|
1596 }
|
|
1597 #<STDIN> if !defined $key;
|
|
1598
|
|
1599 if (exists $micros{$key}){
|
|
1600 $keyexists++;
|
|
1601 my @microstring = @{$micros{$key}};
|
|
1602
|
|
1603 my @filteredmicrostring;
|
|
1604
|
|
1605 foreach my $line (@microstring){
|
|
1606 chomp $line;
|
|
1607 my $copy_line = $line;
|
|
1608 my @fields = split(/\t/,$line);
|
|
1609 my $start = $fields[$startcord];
|
|
1610 my $end = $fields[$endcord];
|
|
1611 # FOR COMPOUND MICROSATELLITES
|
|
1612 if ($fields[$typecord] eq "compound"){
|
|
1613 $line = compound_microsat_analyser($line);
|
|
1614 if ($line eq "NULL") {
|
|
1615 print OUT2 "$copy_line\n";
|
|
1616 $pure++;
|
|
1617 next;
|
|
1618 }
|
|
1619 else{
|
|
1620 print OUT "$line\n";
|
|
1621 $inter++;
|
|
1622 next;
|
|
1623 }
|
|
1624 }
|
|
1625 }
|
|
1626
|
|
1627 } #if (exists $micros{$key}){
|
|
1628 }
|
|
1629 close OUT;
|
|
1630 close OUT2;
|
|
1631 # print "keyfound=$keyfound, keyexists=$keyexists, pure=$pure, inter=$inter\n";
|
|
1632 }
|
|
1633
|
|
1634 sub compound_microsat_analyser{
|
|
1635 my $line = $_[0];
|
|
1636 my @fields = split(/\t/,$line);
|
|
1637 my $motifline = $fields[$motifcord];
|
|
1638 my $microsat = $fields[$microsatcord];
|
|
1639 $motifline =~ s/^\[|\]$//g;
|
|
1640 $microsat =~ s/^\[|\]$//g;
|
|
1641 $microsat =~ s/-//g;
|
|
1642 my @interruptions = ();
|
|
1643 my @motields = split(/\]\[/,$motifline);
|
|
1644 my @microields = split(/\][a-zA-Z|-]*\[/,$microsat);
|
|
1645 my @inields = split(/[.*]/,$microsat);
|
|
1646 shift @inields;
|
|
1647 my @motifcount = scalar(@motields);
|
|
1648 my $prevmotif = $motields[0];
|
|
1649 my $prevmicro = $microields[0];
|
|
1650 my $prevphase = substr($microields[0],-(length($motields[0])),length($motields[0]));
|
|
1651 my $localflag = 'down';
|
|
1652 my @infoarray = ();
|
|
1653
|
|
1654 for my $l (1 ... (scalar(@motields)-1)){
|
|
1655 my $probe = $prevmotif.$prevmotif;
|
|
1656 if (length $prevmotif != length $motields[$l]) {$localflag = "up"; last;}
|
|
1657
|
|
1658 if ($probe =~ /$motields[$l]/i){
|
|
1659 my $curr_endphase = substr($microields[$l],-length($motields[$l]),length($motields[$l]));
|
|
1660 my $curr_startphase = substr($microields[$l],0,length($motields[$l]));
|
|
1661 if ($curr_startphase =~ /$prevphase/i) {
|
|
1662 $infoarray[$l-1] = "insertion";
|
|
1663 }
|
|
1664 else {
|
|
1665 $infoarray[$l-1] = "indel/substitution";
|
|
1666 }
|
|
1667
|
|
1668 $prevmotif = $motields[$l]; $prevmicro = $microields[$l]; $prevphase = $curr_endphase;
|
|
1669 next;
|
|
1670 }
|
|
1671 else {$localflag = "up"; last;}
|
|
1672 }
|
|
1673 if ($localflag eq 'up') {return "NULL";}
|
|
1674
|
|
1675 if (length($prevmotif) == 1) {$fields[$typecord] = "mononucleotide";}
|
|
1676 if (length($prevmotif) == 2) {$fields[$typecord] = "dinucleotide";}
|
|
1677 if (length($prevmotif) == 3) {$fields[$typecord] = "trinucleotide";}
|
|
1678 if (length($prevmotif) == 4) {$fields[$typecord] = "tetranucleotide";}
|
|
1679 if (length($prevmotif) == 5) {$fields[$typecord] = "pentanucleotide";}
|
|
1680
|
|
1681 @microields = split(/[\[|\]]/,$microsat);
|
|
1682 my @microsats = ();
|
|
1683 my @positions = ();
|
|
1684 my $lengthtracker = 0;
|
|
1685
|
|
1686 for my $i (0 ... (scalar(@microields ) - 1)){
|
|
1687 if ($i%2 == 0){
|
|
1688 push(@microsats,$microields[$i]);
|
|
1689 $lengthtracker = $lengthtracker + length($microields[$i]);
|
|
1690
|
|
1691 }
|
|
1692 else{
|
|
1693 push(@interruptions,$microields[$i]);
|
|
1694 push(@positions, $lengthtracker+1);
|
|
1695 $lengthtracker = $lengthtracker + length($microields[$i]);
|
|
1696 }
|
|
1697
|
|
1698 }
|
|
1699 my $returnline = join("\t",(join("\t",@fields),join(",",(@infoarray)),join(",",(@interruptions)),join(",",(@positions)),scalar(@interruptions)));
|
|
1700 return($returnline);
|
|
1701 }
|
|
1702
|
|
1703 #xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx
|
|
1704
|
|
1705
|
|
1706 #xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx
|
|
1707
|
|
1708 sub multiSpecies_compoundClarifyer{
|
|
1709 # print "IN multiSpecies_compoundClarifyer: @_\n";
|
|
1710 my $input1 = $_[0]; ###### the *_sput_compound
|
|
1711 my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
|
|
1712 my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format
|
|
1713 my $output2 = $_[3]; ###### compound file
|
|
1714 my $org = $_[4];
|
|
1715 my $no_of_species = $_[5];
|
|
1716 @thresholds = "0";
|
|
1717 push(@thresholds, split(/_/,$_[6]));
|
|
1718
|
|
1719
|
|
1720 $infocord = 2 + (4*$no_of_species) - 1;
|
|
1721 $typecord = 2 + (4*$no_of_species) + 1 - 1;
|
|
1722 $startcord = 2 + (4*$no_of_species) + 2 - 1;
|
|
1723 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
|
|
1724 $endcord = 2 + (4*$no_of_species) + 4 - 1;
|
|
1725 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
|
|
1726 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
|
|
1727 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
|
|
1728
|
|
1729 $interr_poscord = $motifcord + 3;
|
|
1730 $no_of_interruptionscord = $motifcord + 4;
|
|
1731 $interrcord = $motifcord + 2;
|
|
1732 $interrtypecord = $motifcord + 1;
|
|
1733
|
|
1734
|
|
1735 open(IN,"<$input1") or die "Cannot open file $input1 $!";
|
|
1736 open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
|
|
1737
|
|
1738 open(INT,">$output1") or die "Cannot open file $output2 $!";
|
|
1739 open(COMP,">$output2") or die "Cannot open file $output2 $!";
|
|
1740 #open(CH,">changed") or die "Cannot open file changed $!";
|
|
1741
|
|
1742 # print "opened files \n";
|
|
1743 my $linecounter = 0;
|
|
1744 my $microcounter = 0;
|
|
1745
|
|
1746 my %micros = ();
|
|
1747 while (my $line = <IN>){
|
|
1748 # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
|
|
1749 $linecounter++;
|
|
1750 if ($line =~ /($focalspec)\s+([0-9a-zA-Z_\-]+)\s+([0-9]+)\s+([0-9]+)/ ) {
|
|
1751 my $key = join("\t",$1, $2, $3, $4);
|
|
1752 # print $key, "#-#-#-#-#-#-#-#\n";
|
|
1753 # print "key = $key\n";
|
|
1754 push (@{$micros{$key}},$line);
|
|
1755 $microcounter++;
|
|
1756 }
|
|
1757 else {#print $line," key not made\n"; <STDIN>;
|
|
1758 }
|
|
1759 }
|
|
1760 # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
|
|
1761 close IN;
|
|
1762 my @deletedlines = ();
|
|
1763 # print "done hash \n";
|
|
1764 $linecounter = 0;
|
|
1765 #---------------------------------------------------------------------------------------------------
|
|
1766 # NOW READING THE SEQUENCE FILE
|
|
1767 my @microsat_types = qw(_ mononucleotide dinucleotide trinucleotide tetranucleotide);
|
|
1768 $printer = 0;
|
|
1769
|
|
1770 while(my $sine = <SEQ>){
|
|
1771 my %microstart=();
|
|
1772 my %microend=();
|
|
1773 my @sields = split(/\t/,$sine);
|
|
1774 my $key = ();
|
|
1775
|
|
1776 # print "sine = $sine. focalspec = $focalspec \n"; #<STDIN>;
|
|
1777
|
|
1778 if ($sine =~ /($focalspec)\s+([0-9a-zA-Z_\-]+)\s+([0-9]+)\s+([0-9]+)/ ) {
|
|
1779
|
|
1780 # if ($sine =~ /([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s/ ) {
|
|
1781 $key = join("\t",$1, $2, $3, $4);
|
|
1782 # print "key = $key\n";
|
|
1783 }
|
|
1784 else{
|
|
1785 # print "no key in $sine\nfor pattern ([a-z0-9A-Z]+) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) / \n";
|
|
1786 }
|
|
1787
|
|
1788 if (exists $micros{$key}){
|
|
1789 my @microstring = @{$micros{$key}};
|
|
1790 delete $micros{$key};
|
|
1791
|
|
1792 foreach my $line (@microstring){
|
|
1793 # print "#---------#---------#---------#---------#---------#---------#---------#---------\n" if $printer == 1;
|
|
1794 # print "microsat = $line" if $printer == 1;
|
|
1795 $linecounter++;
|
|
1796 my $copy_line = $line;
|
|
1797 my @mields = split(/\t/,$line);
|
|
1798 my @fields = @mields;
|
|
1799 my $start = $fields[$startcord];
|
|
1800 my $end = $fields[$endcord];
|
|
1801 my $microsat = $fields[$microsatcord];
|
|
1802 my $motifline = $fields[$motifcord];
|
|
1803 my $microsatcopy = $microsat;
|
|
1804 my $positioner = $microsat;
|
|
1805 $positioner =~ s/[a-zA-Z|-]/_/g;
|
|
1806 $microsatcopy =~ s/^\[|\]$//gs;
|
|
1807 chomp $microsatcopy;
|
|
1808 my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy);
|
|
1809 my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat);
|
|
1810 my $absolutstart = 1; my $absolutend = $absolutstart + ($end-$start);
|
|
1811 # print "absolut: start = $absolutstart, end = $absolutend\n" if $printer == 1;
|
|
1812 shift @inields;
|
|
1813 #print "inields =@inields<\n";
|
|
1814 $motifline =~ s/^\[|\]$//gs;
|
|
1815 chomp $motifline;
|
|
1816 #print "microsat = $microsat, its copy = $microsatcopy motifline = $motifline<\n";
|
|
1817 my @motields = split(/\]\[/,$motifline);
|
|
1818 my $seq = $microsatcopy;
|
|
1819 $seq =~ s/\[|\]//g;
|
|
1820 my $seqlen = length($seq);
|
|
1821 $seq = " ".$seq;
|
|
1822
|
|
1823 my $longestmotif_no = longest_array_element(@motields);
|
|
1824 my $shortestmotif_no = shortest_array_element(@motields);
|
|
1825 #print "shortest motif = $motields[$shortestmotif_no], longest motif = $motields[$longestmotif_no] \n";
|
|
1826
|
|
1827 my $search = $motields[$longestmotif_no].$motields[$longestmotif_no];
|
|
1828 if ((length($motields[$longestmotif_no]) == length($motields[$shortestmotif_no])) && ($search !~ /$motields[$shortestmotif_no]/) ){
|
|
1829 print COMP $line;
|
|
1830 next;
|
|
1831 }
|
|
1832
|
|
1833 my @shortestmotif_nos = ();
|
|
1834 for my $m (0 ... $#motields){
|
|
1835 push(@shortestmotif_nos, $m) if (length($motields[$m]) == length($motields[$shortestmotif_no]) );
|
|
1836 }
|
|
1837 ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------
|
|
1838 my $newleft =();
|
|
1839 my $leftstopper = 0; my $rightstopper = 0;
|
|
1840 foreach my $shortmotif_no (@shortestmotif_nos){
|
|
1841 next if $shortmotif_no == 0;
|
|
1842 my $last_left = $shortmotif_no; #$#motields;
|
|
1843 my $last_hitter = 0;
|
|
1844 for (my $i =($shortmotif_no-1); $i>=0; $i--){
|
|
1845 my $search = $motields[$shortmotif_no];
|
|
1846 if (length($motields[$shortmotif_no]) == 1){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;}
|
|
1847 if( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){
|
|
1848 $last_hitter = 1;
|
|
1849 $last_left = $i+1; last;
|
|
1850 }
|
|
1851 my $probe = $motields[$i];
|
|
1852 if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];}
|
|
1853
|
|
1854 if ($probe !~ /$search/){
|
|
1855 $last_hitter = 1;
|
|
1856 $last_left = $i+1;
|
|
1857 # print "hit the last match: before $microields[$i]..last left = $last_left.. exiting.\n";
|
|
1858 last;
|
|
1859 }
|
|
1860 $last_left--;$last_hitter = 1;
|
|
1861 # print "passed tests, last left = $last_left\n";
|
|
1862 }
|
|
1863 # print "comparing whether $last_left < $shortmotif_no, lasthit = $last_hitter\n";
|
|
1864 if (($last_left) < $shortmotif_no && $last_hitter == 1) {$leftstopper=0; last;}
|
|
1865 else {$leftstopper = 1;
|
|
1866 # print "leftstopper = 1\n";
|
|
1867 }
|
|
1868 }
|
|
1869
|
|
1870 ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------
|
|
1871 my $newright =();
|
|
1872 foreach my $shortmotif_no (@shortestmotif_nos){
|
|
1873 next if $shortmotif_no == $#motields;
|
|
1874 my $last_right = $shortmotif_no;# -1;
|
|
1875 for my $i ($shortmotif_no+1 ... $#motields){
|
|
1876 my $search = $motields[$shortmotif_no];
|
|
1877 if (length($motields[$shortmotif_no]) == 1 ){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;}
|
|
1878 if ( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){
|
|
1879 $last_right = $i-1; last;
|
|
1880 }
|
|
1881 my $probe = $motields[$i];
|
|
1882 if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];}
|
|
1883 if ( $probe !~ /$search/){
|
|
1884 $last_right = $i-1; last;
|
|
1885 }
|
|
1886 $last_right++;
|
|
1887 }
|
|
1888 if (($last_right) > $shortmotif_no) {$rightstopper=0; last;# print "rightstopper = 0\n";
|
|
1889 }
|
|
1890 else {$rightstopper = 1;
|
|
1891 }
|
|
1892 }
|
|
1893
|
|
1894
|
|
1895 if ($rightstopper == 1 && $leftstopper == 1){
|
|
1896 print COMP $line;
|
|
1897 # print "rightstopper == 1 && leftstopper == 1\n" if $printer == 1;
|
|
1898 next;
|
|
1899 }
|
|
1900
|
|
1901 # print "pased initial testing phase \n" if $printer == 1;
|
|
1902 my @outputs = ();
|
|
1903 my @orig_starts = ();
|
|
1904 my @orig_ends = ();
|
|
1905 for my $mic (0 ... $#microields){
|
|
1906 my $miclen = length($microields[$mic]);
|
|
1907 my $microleftlen = 0;
|
|
1908 #print "\nmic = $mic\n";
|
|
1909 if($mic > 0){
|
|
1910 for my $submin (0 ... $mic-1){
|
|
1911 my $interval = ();
|
|
1912 if (!exists $inields[$submin]) {$interval = "";}
|
|
1913 else {$interval = $inields[$submin];}
|
|
1914 #print "inield =$interval< and microield =$microields[$submin]<\n ";
|
|
1915 $microleftlen = $microleftlen + length($microields[$submin]) + length($interval);
|
|
1916 }
|
|
1917 }
|
|
1918 push(@orig_starts,($microleftlen+1));
|
|
1919 push(@orig_ends, ($microleftlen+1 + $miclen -1));
|
|
1920 }
|
|
1921
|
|
1922 ############# F I N A L L Y S T U D Y I N G S E Q U E N C E S #########@@@@#########@@@@#########@@@@#########@@@@#########@@@@
|
|
1923
|
|
1924
|
|
1925 for my $mic (0 ... $#microields){
|
|
1926 my $miclen = length($microields[$mic]);
|
|
1927 my $microleftlen = 0;
|
|
1928 if($mic > 0){
|
|
1929 for my $submin (0 ... $mic-1){
|
|
1930 # if(!exists $inields[$submin]) {$inields[$submin] = "";}
|
|
1931 my $interval = ();
|
|
1932 if (!exists $inields[$submin]) {$interval = "";}
|
|
1933 else {$interval = $inields[$submin];}
|
|
1934 #print "inield =$interval< and microield =$microields[$submin]<\n ";
|
|
1935 $microleftlen = $microleftlen + length($microields[$submin]) + length($interval);
|
|
1936 }
|
|
1937 }
|
|
1938 $fields[$startcord] = $microleftlen+1;
|
|
1939 $fields[$endcord] = $fields[$startcord] + $miclen -1;
|
|
1940 $fields[$typecord] = $microsat_types[length($motields[$mic])];
|
|
1941 $fields[$microsatcord] = $microields[$mic];
|
|
1942 $fields[$motifcord] = $motields[$mic];
|
|
1943 my $templine = join("\t", (@fields[0 .. $motifcord]) );
|
|
1944 my $orig_templine = join("\t", (@fields[0 .. $motifcord]) );
|
|
1945 my $newline;
|
|
1946 my $lefter = 1; my $righter = 1;
|
|
1947 if ( $fields[$startcord] < 2){$lefter = 0;}
|
|
1948 if ($fields[$endcord] == $seqlen){$righter = 0;}
|
|
1949
|
|
1950 while($lefter == 1){
|
|
1951 $newline = left_extender($templine, $seq,$org);
|
|
1952 # print "returned line from left extender= $newline \n" if $printer == 1;
|
|
1953 if ($newline eq $templine){$templine = $newline; last;}
|
|
1954 else {$templine = $newline;}
|
|
1955
|
|
1956 if (left_extention_permission_giver($templine) eq "no") {last;}
|
|
1957 }
|
|
1958 while($righter == 1){
|
|
1959 $newline = right_extender($templine, $seq,$org);
|
|
1960 # print "returned line from right extender= $newline \n" if $printer == 1;
|
|
1961 if ($newline eq $templine){$templine = $newline; last;}
|
|
1962 else {$templine = $newline;}
|
|
1963 if (right_extention_permission_giver($templine) eq "no") {last;}
|
|
1964 }
|
|
1965 my @tempfields = split(/\t/,$templine);
|
|
1966 $tempfields[$microsatcord] =~ s/\]|\[//g;
|
|
1967 $tempfields[$motifcord] =~ s/^\[|\]$//gs;
|
|
1968 my @tempmotields = split(/\]\[/,$tempfields[$motifcord]);
|
|
1969
|
|
1970 if (scalar(@tempmotields) == 1 && $templine eq $orig_templine) {
|
|
1971 # print "scalar ( tempmotields) = 1\n" if $printer == 1;
|
|
1972 next;
|
|
1973 }
|
|
1974 my $prevmotif = shift(@tempmotields);
|
|
1975 my $stopper = 0;
|
|
1976
|
|
1977 foreach my $tempmot (@tempmotields){
|
|
1978 if (length($tempmot) != length($prevmotif)) {$stopper = 1; last;}
|
|
1979 my $search = $prevmotif.$prevmotif;
|
|
1980 if ($search !~ /$tempmot/) {$stopper = 1; last;}
|
|
1981 $prevmotif = $tempmot;
|
|
1982 }
|
|
1983 if ( $stopper == 1) {
|
|
1984 # print "length tempmot != length prevmotif\n" if $printer == 1;
|
|
1985 next;
|
|
1986 }
|
|
1987 my $lastend = 0;
|
|
1988 #----------------------------------------------------------
|
|
1989 my $left_captured = (); my $right_captured = ();
|
|
1990 my $left_bp = (); my $right_bp = ();
|
|
1991 # print "new startcord = $tempfields[$startcord] , new endcord = $tempfields[$endcord].. orig strts = @orig_starts and orig ends = @orig_ends\n";
|
|
1992 for my $o (0 ... $#orig_starts){
|
|
1993 # print "we are talking abut tempstart:$tempfields[$startcord] >= origstart:$lastend && tempstart:$tempfields[$startcord] <= origend: $orig_ends[$o] \n" if $printer == 1;
|
|
1994 # print "we are talking abut tempend:$tempfields[$endcord] >= origstart:$lastend && tempstart:$tempfields[$endcord] >= origend: $orig_ends[$o] \n" if $printer == 1;
|
|
1995
|
|
1996 if (($tempfields[$startcord] > $lastend) && ($tempfields[$startcord] <= $orig_ends[$o])){ # && ($tempfields[$startcord] != $fields[$startcord])
|
|
1997 # print "motif captured on left is $microields[$o] from $microsat\n" if $printer == 1;
|
|
1998 $left_captured = $o;
|
|
1999 $left_bp = $orig_ends[$o] - $tempfields[$startcord] + 1;
|
|
2000 }
|
|
2001 elsif ($tempfields[$endcord] > $lastend && $tempfields[$endcord] <= $orig_ends[$o]){ #&& $tempfields[$endcord] != $fields[$endcord])
|
|
2002 # print "motif captured on right is $microields[$o] from $microsat\n" if $printer == 1;
|
|
2003 $right_captured = $o;
|
|
2004 $right_bp = $tempfields[$endcord] - $orig_starts[$o] + 1;
|
|
2005 }
|
|
2006 $lastend = $orig_ends[$o]
|
|
2007 }
|
|
2008 # print "leftcaptured = $left_captured, right = $right_captured\n" if $printer==1;
|
|
2009 my $leftmotif = (); my $left_trashed = ();
|
|
2010 if ($tempfields[$startcord] != $fields[$startcord]) {
|
|
2011 $leftmotif = $motields[$left_captured];
|
|
2012 # print "$left_captured in @microields: $motields[$left_captured]\n" if $printer == 1;
|
|
2013 if ( $left_captured !~ /[0-9]+/) {#print $line,"\n", $templine,"\n";
|
|
2014 }
|
|
2015 $left_trashed = length($microields[$left_captured]) - $left_bp;
|
|
2016 }
|
|
2017 my $rightmotif = (); my $right_trashed = ();
|
|
2018 if ($tempfields[$endcord] != $fields[$endcord]) {
|
|
2019 # print "$right_captured in @microields: $motields[$right_captured]\n" if $printer == 1;
|
|
2020 $rightmotif = $motields[$right_captured];
|
|
2021 $right_trashed = length($microields[$right_captured]) - $right_bp;
|
|
2022 }
|
|
2023
|
|
2024 ########## P A R A M S #####################@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@
|
|
2025 $stopper = 0;
|
|
2026 my $deletioner = 0;
|
|
2027 #if($tempfields[$startcord] != $fields[$startcord]){
|
|
2028 # print "enter left: tempfields,startcord : $tempfields[$startcord] != $absolutstart && left_captured: $left_captured != 0 \n" if $printer==1;
|
|
2029 if ($left_captured != 0){
|
|
2030 # print "at line 370, going: 0 ... $left_captured-1 \n" if $printer == 1;
|
|
2031 for my $e (0 ... $left_captured-1){
|
|
2032 if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e]) )){
|
|
2033 # print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1;
|
|
2034 $deletioner++; last;
|
|
2035 }
|
|
2036 if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){
|
|
2037 # print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1;
|
|
2038 $deletioner++; last;
|
|
2039 }
|
|
2040 if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){
|
|
2041 # print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1;
|
|
2042 $deletioner++; last;
|
|
2043 }
|
|
2044 }
|
|
2045 }
|
|
2046 #}
|
|
2047 # print "after left search, deletioner = $deletioner\n" if $printer == 1;
|
|
2048 if ($deletioner >= 1) {
|
|
2049 # print "deletioner = $deletioner\n" if $printer == 1;
|
|
2050 next;
|
|
2051 }
|
|
2052
|
|
2053 $deletioner = 0;
|
|
2054
|
|
2055 #if($tempfields[$endcord] != $fields[$endcord]){
|
|
2056 # print "if tempfields endcord: $tempfields[$endcord] != absolutend: $absolutend\n and $right_captured != $#microields\n" if $printer==1;
|
|
2057 if ($right_captured != $#microields){
|
|
2058 # print "at line 394, going: $right_captured+1 ... $#microields \n" if $printer == 1;
|
|
2059 for my $e ($right_captured+1 ... $#microields){
|
|
2060 if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e])) ){
|
|
2061 # print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1;
|
|
2062 $deletioner++; last;
|
|
2063 }
|
|
2064 if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){
|
|
2065 # print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1;
|
|
2066 $deletioner++; last;
|
|
2067 }
|
|
2068 if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){
|
|
2069 # print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1;
|
|
2070 $deletioner++; last;
|
|
2071 }
|
|
2072 }
|
|
2073 }
|
|
2074 #}
|
|
2075 # print "deletioner = $deletioner\n" if $printer == 1;
|
|
2076 if ($deletioner >= 1) {
|
|
2077 next;
|
|
2078 }
|
|
2079 my $leftMotifs_notCaptured = ();
|
|
2080 my $rightMotifs_notCaptured = ();
|
|
2081
|
|
2082 if ($tempfields[$startcord] != $fields[$startcord] ){
|
|
2083 #print "in left params: (length($leftmotif) == 1 && $tempfields[$startcord] != $fields[$startcord]) ... and .... $left_trashed > (1.5* length($leftmotif]) && ($tempfields[$startcord] != $fields[$startcord])\n";
|
|
2084 if (length($leftmotif) == 1 && $left_trashed > 3){
|
|
2085 # print "invaded left motif is long mononucleotide" if $printer == 1;
|
|
2086 next;
|
|
2087
|
|
2088 }
|
|
2089 elsif ((length($leftmotif) != 1 && $left_trashed > ( thrashallow($leftmotif)) && ($tempfields[$startcord] != $fields[$startcord]) ) ){
|
|
2090 # print "invaded left motif too long" if $printer == 1;
|
|
2091 next;
|
|
2092 }
|
|
2093 }
|
|
2094 if ($tempfields[$endcord] != $fields[$endcord] ){
|
|
2095 #print "in right params: after $tempfields[$endcord] != $fields[$endcord] ..... (length($rightmotif)==1 && $tempfields[$endcord] != $fields[$endcord]) ... and ... $right_trashed > (1.5* length($rightmotif))\n";
|
|
2096 if (length($rightmotif)==1 && $right_trashed){
|
|
2097 # print "invaded right motif is long mononucleotide" if $printer == 1;
|
|
2098 next;
|
|
2099
|
|
2100 }
|
|
2101 elsif (length($rightmotif) !=1 && ($right_trashed > ( thrashallow($rightmotif)) && $tempfields[$endcord] != $fields[$endcord])){
|
|
2102 # print "invaded right motif too long" if $printer == 1;
|
|
2103 next;
|
|
2104
|
|
2105 }
|
|
2106 }
|
|
2107 push @outputs, $templine;
|
|
2108 }
|
|
2109 if (scalar(@outputs) == 0){ print COMP $line; next;}
|
|
2110 # print "outputs are:", join("\n",@outputs),"\n";
|
|
2111 if (scalar(@outputs) == 1){
|
|
2112 my @oields = split(/\t/,$outputs[0]);
|
|
2113 my $start = $oields[$startcord]+$mields[$startcord]-1;
|
|
2114 my $end = $start+($oields[$endcord]-$oields[$startcord]);
|
|
2115 $oields[$startcord] = $start; $oields[$endcord] = $end;
|
|
2116 print INT join("\t",@oields), "\n";
|
|
2117 # print CH $line,;
|
|
2118 }
|
|
2119 if (scalar(@outputs) > 1){
|
|
2120 my $motif_min = 10;
|
|
2121 my $chosen_one = $outputs[0];
|
|
2122 foreach my $micro (@outputs){
|
|
2123 my @oields = split(/\t/,$micro);
|
|
2124 my $tempmotif = $oields[$motifcord];
|
|
2125 $tempmotif =~ s/^\[|\]$//gs;
|
|
2126 my @omots = split(/\]\[/, $tempmotif);
|
|
2127 # print "motif_min = $motif_min, current motif = $tempmotif\n";
|
|
2128 my $start = $oields[$startcord]+$mields[$startcord]-1;
|
|
2129 my $end = $start+($oields[$endcord]-$oields[$startcord]);
|
|
2130 $oields[$startcord] = $start; $oields[$endcord] = $end;
|
|
2131 if(length($omots[0]) < $motif_min) {
|
|
2132 $chosen_one = join("\t",@oields);
|
|
2133 $motif_min = length($omots[0]);
|
|
2134 }
|
|
2135 }
|
|
2136 print INT $chosen_one, "\n";
|
|
2137 # print "chosen one is ".$chosen_one, "\n";
|
|
2138 # print CH $line;
|
|
2139
|
|
2140
|
|
2141 }
|
|
2142
|
|
2143 }
|
|
2144
|
|
2145 } #if (exists $micros{$key}){
|
|
2146 else{
|
|
2147 }
|
|
2148 }
|
|
2149 close INT;
|
|
2150 close COMP;
|
|
2151 }
|
|
2152 sub left_extender{
|
|
2153 #print "left extender\n";
|
|
2154 my ($line, $seq, $org) = @_;
|
|
2155 # print "in left extender... line passed = $line and sequence is $seq\n";
|
|
2156 chomp $line;
|
|
2157 my @fields = split(/\t/,$line);
|
|
2158 my $rstart = $fields[$startcord];
|
|
2159 my $microsat = $fields[$microsatcord];
|
|
2160 $microsat =~ s/\[|\]//g;
|
|
2161 my $rend = $rstart + length($microsat)-1;
|
|
2162 $microsat =~ s/-//g;
|
|
2163 my $motif = $fields[$motifcord];
|
|
2164 my $firstmotif = ();
|
|
2165
|
|
2166 if ($motif =~ /^\[/){
|
|
2167 $motif =~ s/^\[//g;
|
|
2168 $motif =~ /([a-zA-Z]+)\].*/;
|
|
2169 $firstmotif = $1;
|
|
2170 }
|
|
2171 else {$firstmotif = $motif;}
|
|
2172
|
|
2173 #print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n";
|
|
2174 my $leftphase = substr($microsat, 0,length($firstmotif));
|
|
2175 my $phaser = $leftphase.$leftphase;
|
|
2176 my @phase = split(/\s*/,$leftphase);
|
|
2177 my @phases;
|
|
2178 my @copy_phases = @phases;
|
|
2179 my $crawler=0;
|
|
2180 for (0 ... (length($leftphase)-1)){
|
|
2181 push(@phases, substr($phaser, $crawler, length($leftphase)));
|
|
2182 $crawler++;
|
|
2183 }
|
|
2184
|
|
2185 my $start = $rstart;
|
|
2186 my $end = $rend;
|
|
2187
|
|
2188 my $leftseq = substr($seq, 0, $start);
|
|
2189 # print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n";
|
|
2190 my @extentions = ();
|
|
2191 my @trappeds = ();
|
|
2192 my @intervalposs = ();
|
|
2193 my @trappedposs = ();
|
|
2194 my @trappedphases = ();
|
|
2195 my @intervals = ();
|
|
2196 my $firstmotif_length = length($firstmotif);
|
|
2197 foreach my $phase (@phases){
|
|
2198 # print "left phase\t",substr($leftseq, -10),"\t$phase\n";
|
|
2199 # print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n";
|
|
2200 if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){
|
|
2201 # print "in left pattern\n";
|
|
2202 my $trapped = $1;
|
|
2203 my $trappedpos = length($leftseq)-length($trapped);
|
|
2204 my $interval = $3;
|
|
2205 my $intervalpos = index($trapped, $interval) + 1;
|
|
2206 # print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n";
|
|
2207
|
|
2208 my $extention = substr($trapped, 0, length($trapped)-length($interval));
|
|
2209 my $leftpeep = substr($seq, 0, ($start-length($trapped)));
|
|
2210 my @passed_overhangs;
|
|
2211
|
|
2212 for my $i (1 ... length($phase)-1){
|
|
2213 my $overhang = substr($phase, -length($phase)+$i);
|
|
2214 # print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n";
|
|
2215 #TEMPORARY... BETTER METHOD NEEDED
|
|
2216 $leftpeep =~ s/-//g;
|
|
2217 if ($leftpeep =~ /$overhang$/i){
|
|
2218 push(@passed_overhangs,$overhang);
|
|
2219 # print "l overhang\n";
|
|
2220 }
|
|
2221 }
|
|
2222
|
|
2223 if(scalar(@passed_overhangs)>0){
|
|
2224 my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)];
|
|
2225 $extention = $overhang.$extention;
|
|
2226 $trapped = $overhang.$trapped;
|
|
2227 #print "trapped extended to $trapped \n";
|
|
2228 $trappedpos = length($leftseq)-length($trapped);
|
|
2229 }
|
|
2230
|
|
2231 push(@extentions,$extention);
|
|
2232 # print "extentions = @extentions \n";
|
|
2233
|
|
2234 push(@trappeds,$trapped );
|
|
2235 push(@intervalposs,length($extention)+1);
|
|
2236 push(@trappedposs, $trappedpos);
|
|
2237 # print "trappeds = @trappeds\n";
|
|
2238 push(@trappedphases, substr($extention,0,length($phase)));
|
|
2239 push(@intervals, $interval);
|
|
2240 }
|
|
2241 }
|
|
2242 if (scalar(@trappeds == 0)) {return $line;}
|
|
2243
|
|
2244 my $nikaal = shortest_array_element(@intervals);
|
|
2245
|
|
2246 if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
|
|
2247 $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord];
|
|
2248 ##print "new fields 9 = $fields[9]\n";
|
|
2249 $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]);
|
|
2250
|
|
2251 if($fields[$microsatcord] !~ /^\[/i){
|
|
2252 $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
|
|
2253 }
|
|
2254
|
|
2255 $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord];
|
|
2256
|
|
2257 if (exists ($fields[$motifcord+1])){
|
|
2258 $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1];
|
|
2259 }
|
|
2260 else{$fields[$motifcord+1] = "indel/deletion";}
|
|
2261 ##print "new fields 14 = $fields[14]\n";
|
|
2262
|
|
2263 if (exists ($fields[$motifcord+2])){
|
|
2264 $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2];
|
|
2265 }
|
|
2266 else{$fields[$motifcord+2] = $intervals[$nikaal];}
|
|
2267 my @seventeen=();
|
|
2268 if (exists ($fields[$motifcord+3])){
|
|
2269 @seventeen = split(/,/,$fields[$motifcord+3]);
|
|
2270 # #print "scalarseventeen =@seventeen<-\n";
|
|
2271 for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);}
|
|
2272 $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen);
|
|
2273 $fields[$motifcord+4] = $fields[$motifcord+4]+1;
|
|
2274 }
|
|
2275
|
|
2276 else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1}
|
|
2277
|
|
2278 ##print "new fields 16 = $fields[16]\n";
|
|
2279 ##print "new fields 17 = $fields[17]\n";
|
|
2280
|
|
2281
|
|
2282 my $returnline = join("\t",@fields);
|
|
2283 my $pastline = $returnline;
|
|
2284 if ($fields[$microsatcord] =~ /\[/){
|
|
2285 $returnline = multiSpecies_compoundClarifyer_merge($returnline);
|
|
2286 }
|
|
2287 return $returnline;
|
|
2288 }
|
|
2289 sub right_extender{
|
|
2290 my ($line, $seq, $org) = @_;
|
|
2291 chomp $line;
|
|
2292 my @fields = split(/\t/,$line);
|
|
2293 my $rstart = $fields[$startcord];
|
|
2294 my $microsat = $fields[$microsatcord];
|
|
2295 $microsat =~ s/\[|\]//g;
|
|
2296 my $rend = $rstart + length($microsat)-1;
|
|
2297 $microsat =~ s/-//g;
|
|
2298 my $motif = $fields[$motifcord];
|
|
2299 my $temp_lastmotif = ();
|
|
2300
|
|
2301 if ($motif =~ /\]$/s){
|
|
2302 $motif =~ s/\]$//sg;
|
|
2303 $motif =~ /.*\[([a-zA-Z]+)/;
|
|
2304 $temp_lastmotif = $1;
|
|
2305 }
|
|
2306 else {$temp_lastmotif = $motif;}
|
|
2307 my $lastmotif = substr($microsat,-length($temp_lastmotif));
|
|
2308 ##print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n";
|
|
2309 my $rightphase = substr($microsat, -length($lastmotif));
|
|
2310 my $phaser = $rightphase.$rightphase;
|
|
2311 my @phase = split(/\s*/,$rightphase);
|
|
2312 my @phases;
|
|
2313 my @copy_phases = @phases;
|
|
2314 my $crawler=0;
|
|
2315 for (0 ... (length($rightphase)-1)){
|
|
2316 push(@phases, substr($phaser, $crawler, length($rightphase)));
|
|
2317 $crawler++;
|
|
2318 }
|
|
2319
|
|
2320 my $start = $rstart;
|
|
2321 my $end = $rend;
|
|
2322
|
|
2323 my $rightseq = substr($seq, $end+1);
|
|
2324 my @extentions = ();
|
|
2325 my @trappeds = ();
|
|
2326 my @intervalposs = ();
|
|
2327 my @trappedposs = ();
|
|
2328 my @trappedphases = ();
|
|
2329 my @intervals = ();
|
|
2330 my $lastmotif_length = length($lastmotif);
|
|
2331 foreach my $phase (@phases){
|
|
2332 if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){
|
|
2333 my $trapped = $1;
|
|
2334 my $trappedpos = $end+1;
|
|
2335 my $interval = $2;
|
|
2336 my $intervalpos = index($trapped, $interval) + 1;
|
|
2337
|
|
2338 my $extention = substr($trapped, length($interval));
|
|
2339 my $rightpeep = substr($seq, ($end+length($trapped))+1);
|
|
2340 my @passed_overhangs = "";
|
|
2341
|
|
2342 #TEMPORARY... BETTER METHOD NEEDED
|
|
2343 $rightpeep =~ s/-//g;
|
|
2344
|
|
2345 for my $i (1 ... length($phase)-1){
|
|
2346 my $overhang = substr($phase,0, $i);
|
|
2347 # #print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n";
|
|
2348 if ($rightpeep =~ /^$overhang/i){
|
|
2349 push(@passed_overhangs, $overhang);
|
|
2350 # #print "r overhang\n";
|
|
2351 }
|
|
2352 }
|
|
2353 if (scalar(@passed_overhangs) > 0){
|
|
2354 my $overhang = @passed_overhangs[longest_array_element(@passed_overhangs)];
|
|
2355 $extention = $extention.$overhang;
|
|
2356 $trapped = $trapped.$overhang;
|
|
2357 # #print "trapped extended to $trapped \n";
|
|
2358 }
|
|
2359
|
|
2360 push(@extentions,$extention);
|
|
2361 ##print "extentions = @extentions \n";
|
|
2362
|
|
2363 push(@trappeds,$trapped );
|
|
2364 push(@intervalposs,$intervalpos);
|
|
2365 push(@trappedposs, $trappedpos);
|
|
2366 # #print "trappeds = @trappeds\n";
|
|
2367 push(@trappedphases, substr($extention,0,length($phase)));
|
|
2368 push(@intervals, $interval);
|
|
2369 }
|
|
2370 }
|
|
2371 if (scalar(@trappeds == 0)) {return $line;}
|
|
2372
|
|
2373 # my $nikaal = longest_array_element(@trappeds);
|
|
2374 my $nikaal = shortest_array_element(@intervals);
|
|
2375
|
|
2376 # #print "longest element found = $nikaal \n";
|
|
2377
|
|
2378 if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
|
|
2379 $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]";
|
|
2380 ##print "new fields 9 = $fields[9]";
|
|
2381 $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]);
|
|
2382
|
|
2383 ##print "new fields 11 = $fields[11]\n";
|
|
2384
|
|
2385 if($fields[$microsatcord] !~ /^\[/i){
|
|
2386 $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
|
|
2387 }
|
|
2388
|
|
2389 $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]";
|
|
2390 ##print "new fields 12 = $fields[12]\n";
|
|
2391
|
|
2392 ##print "scalar of fields = ",scalar(@fields),"\n";
|
|
2393 if (exists ($fields[$motifcord+1])){
|
|
2394 # print " print fields = @fields.. scalar=", scalar(@fields),".. motifcord+1 = $motifcord + 1 \n " if !exists $fields[$motifcord+1];
|
|
2395 # <STDIN> if !exists $fields[$motifcord+1];
|
|
2396 $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion";
|
|
2397 }
|
|
2398 else{$fields[$motifcord+1] = "indel/deletion";}
|
|
2399 ##print "new fields 14 = $fields[14]\n";
|
|
2400
|
|
2401 if (exists ($fields[$motifcord+2])){
|
|
2402 $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal];
|
|
2403 }
|
|
2404 else{$fields[$motifcord+2] = $intervals[$nikaal];}
|
|
2405 ##print "new fields 15 = $fields[15]\n";
|
|
2406
|
|
2407 my @seventeen=();
|
|
2408 if (exists ($fields[$motifcord+3])){
|
|
2409 ##print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n";
|
|
2410 # print " print fields = @fields\n " if !exists $fields[$motifcord+3];
|
|
2411 <STDIN> if !exists $fields[$motifcord+3];
|
|
2412 my $currpos = length($microsat)+$intervalposs[$nikaal];
|
|
2413 $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos;
|
|
2414 $fields[$motifcord+4] = $fields[$motifcord+4]+1;
|
|
2415
|
|
2416 }
|
|
2417
|
|
2418 else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1}
|
|
2419
|
|
2420 ##print "new fields 16 = $fields[16]\n";
|
|
2421
|
|
2422 ##print "new fields 17 = $fields[17]\n";
|
|
2423 my $returnline = join("\t",@fields);
|
|
2424 my $pastline = $returnline;
|
|
2425 if ($fields[$microsatcord] =~ /\[/){
|
|
2426 $returnline = multiSpecies_compoundClarifyer_merge($returnline);
|
|
2427 }
|
|
2428 #print "finally right-extended line = ",$returnline,"\n";
|
|
2429 return $returnline;
|
|
2430 }
|
|
2431 sub longest_array_element{
|
|
2432 my $counter = 0;
|
|
2433 my($max) = shift(@_);
|
|
2434 my $maxcounter = 0;
|
|
2435 foreach my $temp (@_) {
|
|
2436 $counter++;
|
|
2437 #print "finding largest array: $maxcounter \n" if $prinkter == 1;
|
|
2438 if(length($temp) > length($max)){
|
|
2439 $max = $temp;
|
|
2440 $maxcounter = $counter;
|
|
2441 }
|
|
2442 }
|
|
2443 return($maxcounter);
|
|
2444 }
|
|
2445 sub shortest_array_element{
|
|
2446 my $counter = 0;
|
|
2447 my($min) = shift(@_);
|
|
2448 my $mincounter = 0;
|
|
2449 foreach my $temp (@_) {
|
|
2450 $counter++;
|
|
2451 #print "finding largest array: $mincounter \n" if $prinkter == 1;
|
|
2452 if(length($temp) < length($min)){
|
|
2453 $min = $temp;
|
|
2454 $mincounter = $counter;
|
|
2455 }
|
|
2456 }
|
|
2457 return($mincounter);
|
|
2458 }
|
|
2459
|
|
2460
|
|
2461 sub left_extention_permission_giver{
|
|
2462 my @fields = split(/\t/,$_[0]);
|
|
2463 my $microsat = $fields[$microsatcord];
|
|
2464 $microsat =~ s/(^\[)|-//g;
|
|
2465 my $motif = $fields[$motifcord];
|
|
2466 my $firstmotif = ();
|
|
2467 my $firststretch = ();
|
|
2468 my @stretches=();
|
|
2469 if ($motif =~ /^\[/){
|
|
2470 $motif =~ s/^\[//g;
|
|
2471 $motif =~ /([a-zA-Z]+)\].*/;
|
|
2472 $firstmotif = $1;
|
|
2473 @stretches = split(/\]/,$microsat);
|
|
2474 $firststretch = $stretches[0];
|
|
2475 ##print "firststretch = $firststretch\n";
|
|
2476 }
|
|
2477 else {$firstmotif = $motif;$firststretch = $microsat;}
|
|
2478
|
|
2479 if (length($firststretch) < $thresholds[length($firstmotif)]){
|
|
2480 return "no";
|
|
2481 }
|
|
2482 else {return "yes";}
|
|
2483
|
|
2484 }
|
|
2485 sub right_extention_permission_giver{
|
|
2486 my @fields = split(/\t/,$_[0]);
|
|
2487 my $microsat = $fields[$microsatcord];
|
|
2488 $microsat =~ s/-|(\]$)//sg;
|
|
2489 my $motif = $fields[$motifcord];
|
|
2490 my $temp_lastmotif = ();
|
|
2491 my $laststretch = ();
|
|
2492 my @stretches=();
|
|
2493
|
|
2494
|
|
2495 if ($motif =~ /\]/){
|
|
2496 $motif =~ s/\]$//gs;
|
|
2497 $motif =~ /.*\[([a-zA-Z]+)$/;
|
|
2498 $temp_lastmotif = $1;
|
|
2499 @stretches = split(/\[/,$microsat);
|
|
2500 $laststretch = pop(@stretches);
|
|
2501 ##print "last stretch = $laststretch\n";
|
|
2502 }
|
|
2503 else {$temp_lastmotif = $motif; $laststretch = $microsat;}
|
|
2504
|
|
2505 if (length($laststretch) < $thresholds[length($temp_lastmotif)]){
|
|
2506 return "no";
|
|
2507 }
|
|
2508 else { return "yes";}
|
|
2509
|
|
2510
|
|
2511 }
|
|
2512 sub multiSpecies_compoundClarifyer_merge{
|
|
2513 my $line = $_[0];
|
|
2514 #print "sent for mering: $line \n";
|
|
2515 my @mields = split(/\t/,$line);
|
|
2516 my @fields = @mields;
|
|
2517 my $microsat = $fields[$microsatcord];
|
|
2518 my $motifline = $fields[$motifcord];
|
|
2519 my $microsatcopy = $microsat;
|
|
2520 $microsatcopy =~ s/^\[|\]$//sg;
|
|
2521 my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy);
|
|
2522 my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat);
|
|
2523 shift @inields;
|
|
2524 #print "inields =@inields<\n";
|
|
2525 $motifline =~ s/^\[|\]$//sg;
|
|
2526 my @motields = split(/\]\[/,$motifline);
|
|
2527 my @firstmotifs = ();
|
|
2528 my @lastmotifs = ();
|
|
2529 for my $i (0 ... $#microields){
|
|
2530 $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i]));
|
|
2531 $lastmotifs[$i] = substr($microields[$i],-length($motields[$i]));
|
|
2532 }
|
|
2533 #print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n";
|
|
2534 my @mergelist = ();
|
|
2535 my @inter_poses = split(/,/,$fields[$interr_poscord]);
|
|
2536 my $no_of_interruptions = $fields[$no_of_interruptionscord];
|
|
2537 my @interruptions = split(/,/,$fields[$interrcord]);
|
|
2538 my @interrtypes = split(/,/,$fields[$interrtypecord]);
|
|
2539 my $stopper = 0;
|
|
2540 for my $i (0 ... $#motields-1){
|
|
2541 #print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n";
|
|
2542 if (($lastmotifs[$i] eq $firstmotifs[$i+1]) && !exists $inields[$i]){
|
|
2543 $stopper = 1;
|
|
2544 push(@mergelist, ($i)."_".($i+1));
|
|
2545 }
|
|
2546 }
|
|
2547
|
|
2548 return $line if scalar(@mergelist) == 0;
|
|
2549
|
|
2550 foreach my $merging (@mergelist){
|
|
2551 my @sets = split(/_/, $merging);
|
|
2552 my @tempmicro = ();
|
|
2553 my @tempmot = ();
|
|
2554 for my $i (0 ... $sets[0]-1){
|
|
2555 push(@tempmicro, "[".$microields[$i]."]");
|
|
2556 push(@tempmicro, $inields[$i]);
|
|
2557 push(@tempmot, "[".$motields[$i]."]");
|
|
2558 #print "adding pre-motifs number $i\n";
|
|
2559 }
|
|
2560 my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]";
|
|
2561 push (@tempmicro, $pusher);
|
|
2562 push(@tempmot, "[".$motields[$sets[0]]."]");
|
|
2563 my $outcoming = -2;
|
|
2564 for my $i ($sets[1]+1 ... $#microields-1){
|
|
2565 push(@tempmicro, "[".$microields[$i]."]");
|
|
2566 push(@tempmicro, $inields[$i]);
|
|
2567 push(@tempmot, "[".$motields[$i]."]");
|
|
2568 #print "adding post-motifs number $i\n";
|
|
2569 $outcoming = $i;
|
|
2570 }
|
|
2571 if ($outcoming != -2){
|
|
2572 #print "outcoming = $outcoming \n";
|
|
2573 push(@tempmicro, "[".$microields[$outcoming+1 ]."]");
|
|
2574 push(@tempmot,"[". $motields[$outcoming+1]."]");
|
|
2575 }
|
|
2576 $fields[$microsatcord] = join("",@tempmicro);
|
|
2577 $fields[$motifcord] = join("",@tempmot);
|
|
2578
|
|
2579 splice(@interrtypes, $sets[0], 1);
|
|
2580 $fields[$interrtypecord] = join(",",@interrtypes);
|
|
2581 splice(@interruptions, $sets[0], 1);
|
|
2582 $fields[$interrcord] = join(",",@interruptions);
|
|
2583 splice(@inter_poses, $sets[0], 1);
|
|
2584 $fields[$interr_poscord] = join(",",@inter_poses);
|
|
2585 $no_of_interruptions = $no_of_interruptions - 1;
|
|
2586 }
|
|
2587
|
|
2588 if ($no_of_interruptions == 0){
|
|
2589 $fields[$microsatcord] =~ s/^\[|\]$//sg;
|
|
2590 $fields[$motifcord] =~ s/^\[|\]$//sg;
|
|
2591 $line = join("\t", @fields[0 ... $motifcord]);
|
|
2592 }
|
|
2593 else{
|
|
2594 $line = join("\t", @fields);
|
|
2595 }
|
|
2596 return $line;
|
|
2597 }
|
|
2598
|
|
2599 sub thrashallow{
|
|
2600 my $motif = $_[0];
|
|
2601 return 4 if length($motif) == 2;
|
|
2602 return 6 if length($motif) == 3;
|
|
2603 return 8 if length($motif) == 4;
|
|
2604
|
|
2605 }
|
|
2606
|
|
2607 #xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx
|
|
2608
|
|
2609
|
|
2610 #xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx
|
|
2611 sub multispecies_filtering_compound_microsats{
|
|
2612 my $unfiltered = $_[0];
|
|
2613 my $filtered = $_[1];
|
|
2614 my $residue = $_[2];
|
|
2615 my $no_of_species = $_[5];
|
|
2616 open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!";
|
|
2617 open(FIL,">$filtered") or die "Cannot open file $filtered: $!";
|
|
2618 open(RES,">$residue") or die "Cannot open file $residue: $!";
|
|
2619
|
|
2620 $infocord = 2 + (4*$no_of_species) - 1;
|
|
2621 $startcord = 2 + (4*$no_of_species) + 2 - 1;
|
|
2622 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
|
|
2623 $endcord = 2 + (4*$no_of_species) + 4 - 1;
|
|
2624 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
|
|
2625 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
|
|
2626
|
|
2627 my @sub_thresholds = ("0");
|
|
2628 push(@sub_thresholds, split(/_/,$_[3]));
|
|
2629 my @thresholds = ("0");
|
|
2630 push(@thresholds, split(/_/,$_[4]));
|
|
2631
|
|
2632 while (my $line = <UNF>) {
|
|
2633 if ($line !~ /compound/){
|
|
2634 print FIL $line,"\n"; next;
|
|
2635 }
|
|
2636 chomp $line;
|
|
2637 my @fields = split(/\t/,$line);
|
|
2638 my $motifline = $fields[$motifcord];
|
|
2639 $motifline =~ s/^\[|\]$//g;
|
|
2640 my @motifs = split(/\]\[/,$motifline);
|
|
2641 my $microsat = $fields[$microsatcord];
|
|
2642 $microsat =~ s/^\[|\]$|-//g;
|
|
2643 my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat);
|
|
2644
|
|
2645 my $stopper = 0;
|
|
2646 for my $i (0 ... $#motifs){
|
|
2647 my @common = ();
|
|
2648 my $probe = $motifs[$i].$motifs[$i];
|
|
2649 my $motif_size = length($motifs[$i]);
|
|
2650
|
|
2651 for my $j (0 ... $#motifs){
|
|
2652 next if length($motifs[$i]) != length($motifs[$j]);
|
|
2653 push(@common, length($microsats[$j])) if $probe =~ /$motifs[$j]/i;
|
|
2654 }
|
|
2655
|
|
2656 if (largest_microsat(@common) < $sub_thresholds[$motif_size]) {$stopper = 1; last;}
|
|
2657 else {next;}
|
|
2658 }
|
|
2659
|
|
2660 if ($stopper == 1){
|
|
2661 print RES $line,"\n";
|
|
2662 }
|
|
2663 else { print FIL $line,"\n"; }
|
|
2664 }
|
|
2665 close FIL;
|
|
2666 close RES;
|
|
2667 }
|
|
2668
|
|
2669 #xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx
|
|
2670
|
|
2671
|
|
2672 #xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx
|
|
2673
|
|
2674 sub chromosome_unrand_breaker{
|
|
2675 # print "IN chromosome_unrand_breaker: @_\n ";
|
|
2676 my $input1 = $_[0]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
|
|
2677 my $dir = $_[1]; ###### directory where subsets are put
|
|
2678 my $output2 = $_[2]; ###### list of subset files
|
|
2679 my $increment = $_[3];
|
|
2680 my $info = $_[4];
|
|
2681 my $chr = $_[5];
|
|
2682 open(SEQ,"<$input1") or die "Cannot open file $input1 $!";
|
|
2683
|
|
2684 open(OUT,">$output2") or die "Cannot open file $output2 $!";
|
|
2685
|
|
2686 #---------------------------------------------------------------------------------------------------
|
|
2687 # NOW READING THE SEQUENCE FILE
|
|
2688
|
|
2689 my $seed = 0;
|
|
2690 my $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment);
|
|
2691 print OUT $subset,"\n";
|
|
2692 open(SUB,">$subset");
|
|
2693
|
|
2694 while(my $sine = <SEQ>){
|
|
2695 $seed++;
|
|
2696 print SUB $sine;
|
|
2697
|
|
2698 if ($seed%$increment == 0 ){
|
|
2699 close SUB;
|
|
2700 $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment);
|
|
2701 open(SUB,">$subset");
|
|
2702 print SUB $sine;
|
|
2703 print OUT $subset,"\n";
|
|
2704 # print $subset,"\n";
|
|
2705 }
|
|
2706 }
|
|
2707 close OUT;
|
|
2708 close SUB;
|
|
2709 }
|
|
2710 #xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx
|
|
2711
|
|
2712
|
|
2713 #xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx
|
|
2714 sub multiSpecies_interruptedMicrosatHunter{
|
|
2715 # print "IN multiSpecies_interruptedMicrosatHunter: @_\n";
|
|
2716 my $input1 = $_[0]; ###### the *_sput_op4_ii file
|
|
2717 my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match"
|
|
2718 my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format
|
|
2719 my $output2 = $_[3]; ###### uninterrupted microsatellite file
|
|
2720 my $org = $_[4];
|
|
2721 my $no_of_species = $_[5];
|
|
2722
|
|
2723 my @thresholds = "0";
|
|
2724 push(@thresholds, split(/_/,$_[6]));
|
|
2725
|
|
2726 # print "thresholds = @thresholds \n";
|
|
2727 $infocord = 2 + (4*$no_of_species) - 1;
|
|
2728 $typecord = 2 + (4*$no_of_species) + 1 - 1;
|
|
2729 $startcord = 2 + (4*$no_of_species) + 2 - 1;
|
|
2730 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
|
|
2731 $endcord = 2 + (4*$no_of_species) + 4 - 1;
|
|
2732 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
|
|
2733 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
|
|
2734 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
|
|
2735
|
|
2736 $interr_poscord = $motifcord + 3;
|
|
2737 $no_of_interruptionscord = $motifcord + 4;
|
|
2738 $interrcord = $motifcord + 2;
|
|
2739 $interrtypecord = $motifcord + 1;
|
|
2740
|
|
2741
|
|
2742 $prinkter = 0;
|
|
2743 # print "prionkytet = $prinkter\n";
|
|
2744
|
|
2745 open(IN,"<$input1") or die "Cannot open file $input1 $!";
|
|
2746 open(SEQ,"<$input2") or die "Cannot open file $input2 $!";
|
|
2747
|
|
2748 open(INT,">$output1") or die "Cannot open file $output2 $!";
|
|
2749 open(UNINT,">$output2") or die "Cannot open file $output2 $!";
|
|
2750
|
|
2751 # print "opened files !!\n";
|
|
2752 my $linecounter = 0;
|
|
2753 my $microcounter = 0;
|
|
2754
|
|
2755 my %micros = ();
|
|
2756 while (my $line = <IN>){
|
|
2757 # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
|
|
2758 $linecounter++;
|
|
2759 if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s+([0-9a-zA-Z_]+)\s([0-9]+)\s+([0-9]+)\s/ ) {
|
|
2760 my $key = join("\t",$1, $2, $3, $4, $5);
|
|
2761 # print $key, "#-#-#-#-#-#-#-#\n" if $prinkter == 1;
|
|
2762 push (@{$micros{$key}},$line);
|
|
2763 $microcounter++;
|
|
2764 }
|
|
2765 else {#print $line if $prinkter == 1;
|
|
2766 }
|
|
2767 }
|
|
2768 # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
|
|
2769 close IN;
|
|
2770 my @deletedlines = ();
|
|
2771 # print "done hash \n";
|
|
2772 $linecounter = 0;
|
|
2773 #---------------------------------------------------------------------------------------------------
|
|
2774 # NOW READING THE SEQUENCE FILE
|
|
2775 while(my $sine = <SEQ>){
|
|
2776 #print $linecounter,"\n" if $linecounter % 1000 == 0;
|
|
2777 my %microstart=();
|
|
2778 my %microend=();
|
|
2779 my @sields = split(/\t/,$sine);
|
|
2780 my $key = ();
|
|
2781 if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
|
|
2782 $key = join("\t",$1, $2, $3, $4, $5);
|
|
2783 # print $key, "<-<-<-<-<-<-<-<\n";
|
|
2784 }
|
|
2785
|
|
2786 # $prinkter = 1 if $sine =~ /^>H\t499\t/;
|
|
2787
|
|
2788 if (exists $micros{$key}){
|
|
2789 my @microstring = @{$micros{$key}};
|
|
2790 delete $micros{$key};
|
|
2791 my @filteredmicrostring;
|
|
2792 # print "sequence = $sields[$sequencepos]" if $prinkter == 1;
|
|
2793 foreach my $line (@microstring){
|
|
2794 $linecounter++;
|
|
2795 my $copy_line = $line;
|
|
2796 my @fields = split(/\t/,$line);
|
|
2797 my $start = $fields[$startcord];
|
|
2798 my $end = $fields[$endcord];
|
|
2799
|
|
2800 # print $line if $prinkter == 1;
|
|
2801 #LOOKING FOR LEFTWARD EXTENTION OF MICROSATELLITE
|
|
2802 my $newline;
|
|
2803 while(1){
|
|
2804 # print "\n before left sequence = $sields[$sequencepos]\n" if $prinkter == 1;
|
|
2805 if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;}
|
|
2806
|
|
2807 $newline = multiSpecies_interruptedMicrosatHunter_left_extender($line, $sields[$sequencepos],$org);
|
|
2808 if ($newline eq $line){$line = $newline; last;}
|
|
2809 else {$line = $newline;}
|
|
2810
|
|
2811 if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;}
|
|
2812 # print "returned line from left extender= $line \n" if $prinkter == 1;
|
|
2813 }
|
|
2814 while(1){
|
|
2815 # print "sequence = $sields[$sequencepos]\n" if $prinkter == 1;
|
|
2816 if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;}
|
|
2817
|
|
2818 $newline = multiSpecies_interruptedMicrosatHunter_right_extender($line, $sields[$sequencepos],$org);
|
|
2819 if ($newline eq $line){$line = $newline; last;}
|
|
2820 else {$line = $newline;}
|
|
2821
|
|
2822 if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;}
|
|
2823 # print "returned line from right extender= $line \n" if $prinkter == 1;
|
|
2824 }
|
|
2825 # print "\n>>>>>>>>>>>>>>>>\n In the end, the line is: \n$line\n<<<<<<<<<<<<<<<<\n" if $prinkter == 1;
|
|
2826
|
|
2827 my @tempfields = split(/\t/,$line);
|
|
2828 if ($tempfields[$microsatcord] =~ /\[/){
|
|
2829 print INT $line,"\n";
|
|
2830 }
|
|
2831 else{
|
|
2832 print UNINT $line,"\n";
|
|
2833 }
|
|
2834
|
|
2835 if ($line =~ /NULL/){ next; }
|
|
2836 push(@filteredmicrostring, $line);
|
|
2837 push (@{$microstart{$start}},$line);
|
|
2838 push (@{$microend{$end}},$line);
|
|
2839 }
|
|
2840
|
|
2841 my $firstflag = 'down';
|
|
2842
|
|
2843 } #if (exists $micros{$key}){
|
|
2844 }
|
|
2845 close INT;
|
|
2846 close UNINT;
|
|
2847 # print "final number of lines = $linecounter\n";
|
|
2848 }
|
|
2849
|
|
2850 sub multiSpecies_interruptedMicrosatHunter_left_extender{
|
|
2851 my ($line, $seq, $org) = @_;
|
|
2852 # print "left extender, like passed = $line\n" if $prinkter == 1;
|
|
2853 # print "in left extender... line passed = $line and sequence is $seq\n" if $prinkter == 1;
|
|
2854 chomp $line;
|
|
2855 my @fields = split(/\t/,$line);
|
|
2856 my $rstart = $fields[$startcord];
|
|
2857 my $microsat = $fields[$microsatcord];
|
|
2858 $microsat =~ s/\[|\]//g;
|
|
2859 my $rend = $rstart + length($microsat)-1;
|
|
2860 $microsat =~ s/-//g;
|
|
2861 my $motif = $fields[$motifcord];
|
|
2862 my $firstmotif = ();
|
|
2863
|
|
2864 if ($motif =~ /^\[/){
|
|
2865 $motif =~ s/^\[//g;
|
|
2866 $motif =~ /([a-zA-Z]+)\].*/;
|
|
2867 $firstmotif = $1;
|
|
2868 }
|
|
2869 else {$firstmotif = $motif;}
|
|
2870
|
|
2871 # print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n" if $prinkter == 1;
|
|
2872 my $leftphase = substr($microsat, 0,length($firstmotif));
|
|
2873 my $phaser = $leftphase.$leftphase;
|
|
2874 my @phase = split(/\s*/,$leftphase);
|
|
2875 my @phases;
|
|
2876 my @copy_phases = @phases;
|
|
2877 my $crawler=0;
|
|
2878 for (0 ... (length($leftphase)-1)){
|
|
2879 push(@phases, substr($phaser, $crawler, length($leftphase)));
|
|
2880 $crawler++;
|
|
2881 }
|
|
2882
|
|
2883 my $start = $rstart;
|
|
2884 my $end = $rend;
|
|
2885
|
|
2886 my $leftseq = substr($seq, 0, $start);
|
|
2887 # print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n" if $prinkter == 1;
|
|
2888 my @extentions = ();
|
|
2889 my @trappeds = ();
|
|
2890 my @intervalposs = ();
|
|
2891 my @trappedposs = ();
|
|
2892 my @trappedphases = ();
|
|
2893 my @intervals = ();
|
|
2894 my $firstmotif_length = length($firstmotif);
|
|
2895 foreach my $phase (@phases){
|
|
2896 # print "left phase\t",substr($leftseq, -10),"\t$phase\n" if $prinkter == 1;
|
|
2897 # print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n" if $prinkter == 1;
|
|
2898 if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){
|
|
2899 # print "in left pattern\n" if $prinkter == 1;
|
|
2900 my $trapped = $1;
|
|
2901 my $trappedpos = length($leftseq)-length($trapped);
|
|
2902 my $interval = $3;
|
|
2903 my $intervalpos = index($trapped, $interval) + 1;
|
|
2904 # print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n" if $prinkter == 1;
|
|
2905
|
|
2906 my $extention = substr($trapped, 0, length($trapped)-length($interval));
|
|
2907 my $leftpeep = substr($seq, 0, ($start-length($trapped)));
|
|
2908 my @passed_overhangs;
|
|
2909
|
|
2910 for my $i (1 ... length($phase)-1){
|
|
2911 my $overhang = substr($phase, -length($phase)+$i);
|
|
2912 # print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n" if $prinkter == 1;
|
|
2913 #TEMPORARY... BETTER METHOD NEEDED
|
|
2914 $leftpeep =~ s/-//g;
|
|
2915 if ($leftpeep =~ /$overhang$/i){
|
|
2916 push(@passed_overhangs,$overhang);
|
|
2917 # print "l overhang\n" if $prinkter == 1;
|
|
2918 }
|
|
2919 }
|
|
2920
|
|
2921 if(scalar(@passed_overhangs)>0){
|
|
2922 my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)];
|
|
2923 $extention = $overhang.$extention;
|
|
2924 $trapped = $overhang.$trapped;
|
|
2925 # print "trapped extended to $trapped \n" if $prinkter == 1;
|
|
2926 $trappedpos = length($leftseq)-length($trapped);
|
|
2927 }
|
|
2928
|
|
2929 push(@extentions,$extention);
|
|
2930 # print "extentions = @extentions \n" if $prinkter == 1;
|
|
2931
|
|
2932 push(@trappeds,$trapped );
|
|
2933 push(@intervalposs,length($extention)+1);
|
|
2934 push(@trappedposs, $trappedpos);
|
|
2935 # print "trappeds = @trappeds\n" if $prinkter == 1;
|
|
2936 push(@trappedphases, substr($extention,0,length($phase)));
|
|
2937 push(@intervals, $interval);
|
|
2938 }
|
|
2939 }
|
|
2940 if (scalar(@trappeds == 0)) {return $line;}
|
|
2941
|
|
2942 ############################ my $nikaal = longest_array_element(@trappeds);
|
|
2943 my $nikaal = shortest_array_element(@intervals);
|
|
2944
|
|
2945 # print "longest element found = $nikaal \n" if $prinkter == 1;
|
|
2946
|
|
2947 if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
|
|
2948 $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord];
|
|
2949 #print "new fields 9 = $fields[9]\n" if $prinkter == 1;
|
|
2950 $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]);
|
|
2951
|
|
2952 #print "new fields 9 = $fields[9]\n" if $prinkter == 1;
|
|
2953
|
|
2954 if($fields[$microsatcord] !~ /^\[/i){
|
|
2955 $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
|
|
2956 }
|
|
2957
|
|
2958 $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord];
|
|
2959 #print "new fields 14 = $fields[12]\n" if $prinkter == 1;
|
|
2960
|
|
2961 #print "scalar of fields = ",scalar(@fields),"\n" if $prinkter == 1;
|
|
2962
|
|
2963
|
|
2964 if (scalar(@fields) > $motifcord+1){
|
|
2965 $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1];
|
|
2966 }
|
|
2967 else{$fields[$motifcord+1] = "indel/deletion";}
|
|
2968 #print "new fields 14 = $fields[14]\n" if $prinkter == 1;
|
|
2969
|
|
2970 if (scalar(@fields)>$motifcord+2){
|
|
2971 $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2];
|
|
2972 }
|
|
2973 else{$fields[$motifcord+2] = $intervals[$nikaal];}
|
|
2974 #print "new fields 15 = $fields[15]\n" if $prinkter == 1;
|
|
2975
|
|
2976 my @seventeen=();
|
|
2977
|
|
2978 if (scalar(@fields)>$motifcord+3){
|
|
2979 @seventeen = split(/,/,$fields[$motifcord+3]);
|
|
2980 # print "scalarseventeen =@seventeen<-\n" if $prinkter == 1;
|
|
2981 for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);}
|
|
2982 $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen);
|
|
2983 $fields[$motifcord+4] = $fields[$motifcord+4]+1;
|
|
2984 }
|
|
2985
|
|
2986 else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1}
|
|
2987
|
|
2988 #print "new fields 16 = $fields[16]\n" if $prinkter == 1;
|
|
2989 #print "new fields 17 = $fields[17]\n" if $prinkter == 1;
|
|
2990
|
|
2991 # return join("\t",@fields);
|
|
2992 my $returnline = join("\t",@fields);
|
|
2993 my $pastline = $returnline;
|
|
2994 if ($fields[$microsatcord] =~ /\[/){
|
|
2995 $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline);
|
|
2996 }
|
|
2997 # print "finally left-extended line = ",$returnline,"\n" if $prinkter == 1;
|
|
2998 return $returnline;
|
|
2999 }
|
|
3000
|
|
3001 sub multiSpecies_interruptedMicrosatHunter_right_extender{
|
|
3002 # print "right extender\n" if $prinkter == 1;
|
|
3003 my ($line, $seq, $org) = @_;
|
|
3004 # print "in right extender... line passed = $line\n" if $prinkter == 1;
|
|
3005 # print "line = $line, sequence = ",$seq, "\n" if $prinkter == 1;
|
|
3006 chomp $line;
|
|
3007 my @fields = split(/\t/,$line);
|
|
3008 my $rstart = $fields[$startcord];
|
|
3009 my $microsat = $fields[$microsatcord];
|
|
3010 $microsat =~ s/\[|\]//g;
|
|
3011 my $rend = $rstart + length($microsat)-1;
|
|
3012 $microsat =~ s/-//g;
|
|
3013 my $motif = $fields[$motifcord];
|
|
3014 my $temp_lastmotif = ();
|
|
3015
|
|
3016 if ($motif =~ /\]$/){
|
|
3017 $motif =~ s/\]$//g;
|
|
3018 $motif =~ /.*\[([a-zA-Z]+)/;
|
|
3019 $temp_lastmotif = $1;
|
|
3020 }
|
|
3021 else {$temp_lastmotif = $motif;}
|
|
3022 my $lastmotif = substr($microsat,-length($temp_lastmotif));
|
|
3023 # print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n" if $prinkter == 1;
|
|
3024 my $rightphase = substr($microsat, -length($lastmotif));
|
|
3025 my $phaser = $rightphase.$rightphase;
|
|
3026 my @phase = split(/\s*/,$rightphase);
|
|
3027 my @phases;
|
|
3028 my @copy_phases = @phases;
|
|
3029 my $crawler=0;
|
|
3030 for (0 ... (length($rightphase)-1)){
|
|
3031 push(@phases, substr($phaser, $crawler, length($rightphase)));
|
|
3032 $crawler++;
|
|
3033 }
|
|
3034
|
|
3035 my $start = $rstart;
|
|
3036 my $end = $rend;
|
|
3037
|
|
3038 my $rightseq = substr($seq, $end+1);
|
|
3039 # print "length of sequence = " ,length($seq), "the coordinate to start from = ", $end+1, "\n" if $prinkter == 1;
|
|
3040 # print "right phases are @phases , end = $end right sequence = ",substr($rightseq,0,10),"\n" if $prinkter == 1;
|
|
3041 my @extentions = ();
|
|
3042 my @trappeds = ();
|
|
3043 my @intervalposs = ();
|
|
3044 my @trappedposs = ();
|
|
3045 my @trappedphases = ();
|
|
3046 my @intervals = ();
|
|
3047 my $lastmotif_length = length($lastmotif);
|
|
3048 foreach my $phase (@phases){
|
|
3049 # print "right phase\t$phase\t",substr($rightseq,0,10),"\n" if $prinkter == 1;
|
|
3050 # print "search patter = (([a-zA-Z|-]{0,$lastmotif_length})($phase)+) \n" if $prinkter == 1;
|
|
3051 if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){
|
|
3052 # print "in right pattern\n" if $prinkter == 1;
|
|
3053 my $trapped = $1;
|
|
3054 my $trappedpos = $end+1;
|
|
3055 my $interval = $2;
|
|
3056 my $intervalpos = index($trapped, $interval) + 1;
|
|
3057 # print "trapped = $trapped, interval = $interval\n" if $prinkter == 1;
|
|
3058
|
|
3059 my $extention = substr($trapped, length($interval));
|
|
3060 my $rightpeep = substr($seq, ($end+length($trapped))+1);
|
|
3061 my @passed_overhangs = "";
|
|
3062
|
|
3063 #TEMPORARY... BETTER METHOD NEEDED
|
|
3064 $rightpeep =~ s/-//g;
|
|
3065
|
|
3066 for my $i (1 ... length($phase)-1){
|
|
3067 my $overhang = substr($phase,0, $i);
|
|
3068 # print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n" if $prinkter == 1;
|
|
3069 if ($rightpeep =~ /^$overhang/i){
|
|
3070 push(@passed_overhangs, $overhang);
|
|
3071 # print "r overhang\n" if $prinkter == 1;
|
|
3072 }
|
|
3073 }
|
|
3074 if (scalar(@passed_overhangs) > 0){
|
|
3075 my $overhang = @passed_overhangs[longest_array_element(@passed_overhangs)];
|
|
3076 $extention = $extention.$overhang;
|
|
3077 $trapped = $trapped.$overhang;
|
|
3078 # print "trapped extended to $trapped \n" if $prinkter == 1;
|
|
3079 }
|
|
3080
|
|
3081 push(@extentions,$extention);
|
|
3082 #print "extentions = @extentions \n" if $prinkter == 1;
|
|
3083
|
|
3084 push(@trappeds,$trapped );
|
|
3085 push(@intervalposs,$intervalpos);
|
|
3086 push(@trappedposs, $trappedpos);
|
|
3087 # print "trappeds = @trappeds\n" if $prinkter == 1;
|
|
3088 push(@trappedphases, substr($extention,0,length($phase)));
|
|
3089 push(@intervals, $interval);
|
|
3090 }
|
|
3091 }
|
|
3092 if (scalar(@trappeds == 0)) {return $line;}
|
|
3093
|
|
3094 ################################### my $nikaal = longest_array_element(@trappeds);
|
|
3095 my $nikaal = shortest_array_element(@intervals);
|
|
3096
|
|
3097 # print "longest element found = $nikaal \n" if $prinkter == 1;
|
|
3098
|
|
3099 if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
|
|
3100 $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]";
|
|
3101 $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]);
|
|
3102
|
|
3103
|
|
3104 if($fields[$microsatcord] !~ /^\[/i){
|
|
3105 $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
|
|
3106 }
|
|
3107
|
|
3108 $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]";
|
|
3109
|
|
3110
|
|
3111 if (scalar(@fields) > $motifcord+1){
|
|
3112 $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion";
|
|
3113 }
|
|
3114 else{$fields[$motifcord+1] = "indel/deletion";}
|
|
3115
|
|
3116 if (scalar(@fields)>$motifcord+2){
|
|
3117 $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal];
|
|
3118 }
|
|
3119 else{$fields[$motifcord+2] = $intervals[$nikaal];}
|
|
3120
|
|
3121 my @seventeen=();
|
|
3122 if (scalar(@fields)>$motifcord+3){
|
|
3123 #print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n" if $prinkter == 1;
|
|
3124 my $currpos = length($microsat)+$intervalposs[$nikaal];
|
|
3125 $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos;
|
|
3126 $fields[$motifcord+4] = $fields[$motifcord+4]+1;
|
|
3127
|
|
3128 }
|
|
3129
|
|
3130 else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1}
|
|
3131
|
|
3132 # print "finally right-extended line = ",join("\t",@fields),"\n" if $prinkter == 1;
|
|
3133 # return join("\t",@fields);
|
|
3134
|
|
3135 my $returnline = join("\t",@fields);
|
|
3136 my $pastline = $returnline;
|
|
3137 if ($fields[$microsatcord] =~ /\[/){
|
|
3138 $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline);
|
|
3139 }
|
|
3140 # print "finally right-extended line = ",$returnline,"\n" if $prinkter == 1;
|
|
3141 return $returnline;
|
|
3142
|
|
3143 }
|
|
3144
|
|
3145 sub multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver{
|
|
3146 my @fields = split(/\t/,$_[0]);
|
|
3147 my $microsat = $fields[$microsatcord];
|
|
3148 $microsat =~ s/(^\[)|-//sg;
|
|
3149 my $motif = $fields[$motifcord];
|
|
3150 chomp $motif;
|
|
3151 # print $motif, "\n" if $motif !~ /^\[/;
|
|
3152 my $firstmotif = ();
|
|
3153 my $firststretch = ();
|
|
3154 my @stretches=();
|
|
3155
|
|
3156 # print "motif = $motif, microsat = $microsat\n" if $prinkter == 1;
|
|
3157 if ($motif =~ /^\[/){
|
|
3158 $motif =~ s/^\[//sg;
|
|
3159 $motif =~ /([a-zA-Z]+)\].*/;
|
|
3160 $firstmotif = $1;
|
|
3161 @stretches = split(/\]/,$microsat);
|
|
3162 $firststretch = $stretches[0];
|
|
3163 #print "firststretch = $firststretch\n" if $prinkter == 1;
|
|
3164 }
|
|
3165 else {$firstmotif = $motif;$firststretch = $microsat;}
|
|
3166 # print "if length:firststretch - length($firststretch) < threshes length :firstmotif ($firstmotif) - $thresholds[length($firstmotif)]\n" if $prinkter == 1;
|
|
3167 if (length($firststretch) < $thresholds[length($firstmotif)]){
|
|
3168 return "no";
|
|
3169 }
|
|
3170 else {return "yes";}
|
|
3171
|
|
3172 }
|
|
3173 sub multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver{
|
|
3174 my @fields = split(/\t/,$_[0]);
|
|
3175 my $microsat = $fields[$microsatcord];
|
|
3176 $microsat =~ s/-|(\]$)//sg;
|
|
3177 my $motif = $fields[$motifcord];
|
|
3178 chomp $motif;
|
|
3179 my $temp_lastmotif = ();
|
|
3180 my $laststretch = ();
|
|
3181 my @stretches=();
|
|
3182
|
|
3183
|
|
3184 if ($motif =~ /\]/){
|
|
3185 $motif =~ s/\]$//sg;
|
|
3186 $motif =~ /.*\[([a-zA-Z]+)$/;
|
|
3187 $temp_lastmotif = $1;
|
|
3188 @stretches = split(/\[/,$microsat);
|
|
3189 $laststretch = pop(@stretches);
|
|
3190 #print "last stretch = $laststretch\n" if $prinkter == 1;
|
|
3191 }
|
|
3192 else {$temp_lastmotif = $motif; $laststretch = $microsat;}
|
|
3193
|
|
3194 if (length($laststretch) < $thresholds[length($temp_lastmotif)]){
|
|
3195 return "no";
|
|
3196 }
|
|
3197 else { return "yes";}
|
|
3198
|
|
3199
|
|
3200 }
|
|
3201 sub checking_substitutions{
|
|
3202
|
|
3203 my ($line, $seq, $startprobes, $endprobes) = @_;
|
|
3204 #print "sequence = $seq \n" if $prinkter == 1;
|
|
3205 #print "COMMAND = \n $line, \n $seq, \n $startprobes \n, $endprobes\n";
|
|
3206 # <STDIN>;
|
|
3207 my @seqarray = split(/\s*/,$seq);
|
|
3208 my @startsubst_probes = split(/\|/,$startprobes);
|
|
3209 my @endsubst_probes = split(/\|/,$endprobes);
|
|
3210 chomp $line;
|
|
3211 my @fields = split(/\t/,$line);
|
|
3212 my $start = $fields[11] - $fields[10];
|
|
3213 my $end = $fields[13] - $fields[10];
|
|
3214 my $motif = $fields[9]; #IN FUTURE, USE THIS AS A PROBE, LIKE MOTIF = $FIELDS[9].$FIELDS[9]
|
|
3215 $motif =~ s/\[|\]//g;
|
|
3216 my $microsat = $fields[14];
|
|
3217 $microsat =~ s/\[|\]//g;
|
|
3218 #------------------------------------------------------------------------
|
|
3219 # GETTING START AND END PHASES
|
|
3220 my $startphase = substr($microsat,0, length($motif));
|
|
3221 my $endphase = substr($microsat,-length($motif), length($motif));
|
|
3222 #print "start and end phases are - $startphase and $endphase\n";
|
|
3223 my $startflag = 'down';
|
|
3224 my $endflag = 'down';
|
|
3225 my $substitution_distance = length($motif);
|
|
3226 my $prestart = $start - $substitution_distance;
|
|
3227 my $postend = $end + $substitution_distance;
|
|
3228 my @endadds = ();
|
|
3229 my @startadds = ();
|
|
3230 if (($prestart < 0) || ($postend > scalar(@seqarray))) {
|
|
3231 last;
|
|
3232 }
|
|
3233 #------------------------------------------------------------------------#------------------------------------------------------------------------
|
|
3234 # CHECKING FOR SUBSTITUTION PROBES NOW
|
|
3235
|
|
3236 if ($fields[8] ne "mononucleotide"){
|
|
3237 while ($startflag eq "down"){
|
|
3238 my $search = join("",@seqarray[$prestart...($start-1)]);
|
|
3239 #print "search is from $prestart...($start-1) = $search\n";
|
|
3240 foreach my $probe (@startsubst_probes){
|
|
3241 #print "\t\tprobe = $probe\n";
|
|
3242 if ($search =~ /^$probe/){
|
|
3243 #print "\tfound addition to the left - $search \n";
|
|
3244 my $copyprobe = $probe;
|
|
3245 my $type;
|
|
3246 my $subspos = 0;
|
|
3247 my $interruption = "";
|
|
3248 if ($search eq $startphase) { $type = "NONE";}
|
|
3249 else{
|
|
3250 $copyprobe =~ s/\[a-zA-Z\]/^/g;
|
|
3251 $subspos = index($copyprobe,"^") + 1;
|
|
3252 $type = "substitution";
|
|
3253 $interruption = substr($search, $subspos,1);
|
|
3254 }
|
|
3255 my $addinfo = join("\t",$prestart, $start, $search, $type, $interruption, $subspos);
|
|
3256 #print "adding information: $addinfo \n";
|
|
3257 push(@startadds, $addinfo);
|
|
3258 $prestart = $prestart - $substitution_distance;
|
|
3259 $start = $start-$substitution_distance;
|
|
3260 $startflag = 'down';
|
|
3261
|
|
3262 last;
|
|
3263 }
|
|
3264 else{
|
|
3265 $startflag = 'up';
|
|
3266 }
|
|
3267 }
|
|
3268 }
|
|
3269 #<STDIN>;
|
|
3270 while ($endflag eq "down"){
|
|
3271 my $search = join("",@seqarray[($end+1)...$postend]);
|
|
3272 #print "search is from ($end+1)...$postend] = $search\n";
|
|
3273
|
|
3274 foreach my $probe (@endsubst_probes){
|
|
3275 #print "\t\tprobe = $probe\n";
|
|
3276 if ($search =~ /$probe$/){
|
|
3277 my $copyprobe = $probe;
|
|
3278 my $type;
|
|
3279 my $subspos = 0;
|
|
3280 my $interruption = "";
|
|
3281 if ($search eq $endphase) { $type = "NONE";}
|
|
3282 else{
|
|
3283 $copyprobe =~ s/\[a-zA-Z\]/^/g;
|
|
3284 $subspos = index($copyprobe,"^") + 1;
|
|
3285 $type = "substitution";
|
|
3286 $interruption = substr($search, $subspos,1);
|
|
3287 }
|
|
3288 my $addinfo = join("\t",$end, $postend, $search, $type, $interruption, $subspos);
|
|
3289 #print "adding information: $addinfo \n";
|
|
3290 push(@endadds, $addinfo);
|
|
3291 $postend = $postend + $substitution_distance;
|
|
3292 $end = $end+$substitution_distance;
|
|
3293 push(@endadds, $search);
|
|
3294 $endflag = 'down';
|
|
3295 last;
|
|
3296 }
|
|
3297 else{
|
|
3298 $endflag = 'up';
|
|
3299 }
|
|
3300 }
|
|
3301 }
|
|
3302 #print "startadds = @startadds, endadds = @endadds \n";
|
|
3303
|
|
3304 }
|
|
3305 }
|
|
3306 sub microsat_packer{
|
|
3307 my $microsat = $_[0];
|
|
3308 my $addition = $_[1];
|
|
3309
|
|
3310
|
|
3311
|
|
3312 }
|
|
3313 sub multiSpecies_interruptedMicrosatHunter_merge{
|
|
3314 $prinkter = 0;
|
|
3315 # print "~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~\n";
|
|
3316 my $line = $_[0];
|
|
3317 # print "sent for mering: $line \n" if $prinkter ==1;
|
|
3318 my @mields = split(/\t/,$line);
|
|
3319 my @fields = @mields;
|
|
3320 my $microsat = allCaps($fields[$microsatcord]);
|
|
3321 my $motifline = allCaps($fields[$motifcord]);
|
|
3322 my $microsatcopy = $microsat;
|
|
3323 # print "microsat = $microsat|\n" if $prinkter ==1;
|
|
3324 $microsatcopy =~ s/^\[|\]$//sg;
|
|
3325 chomp $microsatcopy;
|
|
3326 my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy);
|
|
3327 my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat);
|
|
3328 shift @inields;
|
|
3329 # print "inields =",join("|",@inields)," microields = ",join("|",@microields)," and count of microields = ", $#microields,"\n" if $prinkter ==1;
|
|
3330 $motifline =~ s/^\[|\]$//sg;
|
|
3331 my @motields = split(/\]\[/,$motifline);
|
|
3332 my @firstmotifs = ();
|
|
3333 my @lastmotifs = ();
|
|
3334 for my $i (0 ... $#microields){
|
|
3335 $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i]));
|
|
3336 $lastmotifs[$i] = substr($microields[$i],-length($motields[$i]));
|
|
3337 }
|
|
3338 # print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n" if $prinkter ==1;
|
|
3339 my @mergelist = ();
|
|
3340 my @inter_poses = split(/,/,$fields[$interr_poscord]);
|
|
3341 my $no_of_interruptions = $fields[$no_of_interruptionscord];
|
|
3342 my @interruptions = split(/,/,$fields[$interrcord]);
|
|
3343 my @interrtypes = split(/,/,$fields[$interrtypecord]);
|
|
3344 my $stopper = 0;
|
|
3345 for my $i (0 ... $#motields-1){
|
|
3346 # print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n:$lastmotifs[$i] eq $firstmotifs[$i+1]?\n" if $prinkter ==1;
|
|
3347 if ((allCaps($lastmotifs[$i]) eq allCaps($firstmotifs[$i+1])) && (!exists $inields[$i] || $inields[$i] !~ /[a-zA-Z]/)){
|
|
3348 $stopper = 1;
|
|
3349 push(@mergelist, ($i)."_".($i+1)); #<STDIN> if $prinkter ==1;
|
|
3350 }
|
|
3351 }
|
|
3352
|
|
3353 # print "mergelist = @mergelist\n" if $prinkter ==1;
|
|
3354 return $line if scalar(@mergelist) == 0;
|
|
3355 # print "merging @mergelist\n" if $prinkter ==1;
|
|
3356 # <STDIN> if $prinkter ==1;
|
|
3357
|
|
3358 foreach my $merging (@mergelist){
|
|
3359 my @sets = split(/_/, $merging);
|
|
3360 # print "sets = @sets\n" if $prinkter ==1;
|
|
3361 my @tempmicro = ();
|
|
3362 my @tempmot = ();
|
|
3363 # print "for loop going from 0 ... ", $sets[0]-1, "\n" if $prinkter ==1;
|
|
3364 for my $i (0 ... $sets[0]-1){
|
|
3365 # print " adding pre- i = $i adding: microields= $microields[$i]. motields = $motields[$i], inields = |$inields[$i]|\n" if $prinkter ==1;
|
|
3366 push(@tempmicro, "[".$microields[$i]."]");
|
|
3367 push(@tempmicro, $inields[$i]);
|
|
3368 push(@tempmot, "[".$motields[$i]."]");
|
|
3369 # print "adding pre-motifs number $i\n" if $prinkter ==1;
|
|
3370 # print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1;
|
|
3371 }
|
|
3372 # print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1;
|
|
3373 # print "now pushing ", "[",$microields[$sets[0]]," and ",$microields[$sets[1]],"]\n" if $prinkter ==1;
|
|
3374 my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]";
|
|
3375 # print "middle is, from @motields - @sets, number 0 which is is\n";
|
|
3376 # print ": $motields[$sets[0]]\n";
|
|
3377 push (@tempmicro, $pusher);
|
|
3378 push(@tempmot, "[".$motields[$sets[0]]."]");
|
|
3379 push (@tempmicro, $inields[$sets[1]]) if $sets[1] != $#microields && exists $sets[1] && exists $inields[$sets[1]];
|
|
3380 my $outcoming = -2;
|
|
3381 # print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1;
|
|
3382 # print "for loop going from ",$sets[1]+1, " ... ", $#microields, "\n" if $prinkter ==1;
|
|
3383 for my $i ($sets[1]+1 ... $#microields){
|
|
3384 # print " adding post- i = $i adding: microields= $microields[$i]. motields = $motields[$i]\n" if $prinkter ==1;
|
|
3385 push(@tempmicro, "[".$microields[$i]."]") if exists $microields[$i];
|
|
3386 push(@tempmicro, $inields[$i]) unless $i == $#microields || !exists $inields[$i];
|
|
3387 push(@tempmot, "[".$motields[$i]."]");
|
|
3388 # print "adding post-motifs number $i\n" if $prinkter ==1;
|
|
3389 $outcoming = $i;
|
|
3390 }
|
|
3391 # print "____________________________________________________________________________\n";
|
|
3392 $prinkter = 0;
|
|
3393 $fields[$microsatcord] = join("",@tempmicro);
|
|
3394 $fields[$motifcord] = join("",@tempmot);
|
|
3395 # print "tempmot = @tempmot, tempmicro = @tempmicro . microsat = $fields[$microsatcord] and motif = $fields[$motifcord] \n" if $prinkter ==1;
|
|
3396
|
|
3397 splice(@interrtypes, $sets[0], 1);
|
|
3398 $fields[$interrtypecord] = join(",",@interrtypes);
|
|
3399 splice(@interruptions, $sets[0], 1);
|
|
3400 $fields[$interrcord] = join(",",@interruptions);
|
|
3401 splice(@inter_poses, $sets[0], 1);
|
|
3402 $fields[$interr_poscord] = join(",",@inter_poses);
|
|
3403 $no_of_interruptions = $no_of_interruptions - 1;
|
|
3404 }
|
|
3405
|
|
3406 if ($no_of_interruptions == 0 && $line !~ /compound/){
|
|
3407 $fields[$microsatcord] =~ s/^\[|\]$//sg;
|
|
3408 $fields[$motifcord] =~ s/^\[|\]$//sg;
|
|
3409 $line = join("\t", @fields[0 ... $motifcord]);
|
|
3410 }
|
|
3411 else{
|
|
3412 $line = join("\t", @fields);
|
|
3413 }
|
|
3414 # print "post merging, the line is $line\n" if $prinkter ==1;
|
|
3415 #<STDIN> if $stopper ==1;
|
|
3416 return $line;
|
|
3417 }
|
|
3418 sub interval_asseser{
|
|
3419 my $pre_phase = $_[0]; my $post_phase = $_[1]; my $inter = $_[3];
|
|
3420 }
|
|
3421 #---------------------------------------------------------------------------------------------------
|
|
3422 sub allCaps{
|
|
3423 my $motif = $_[0];
|
|
3424 $motif =~ s/a/A/g;
|
|
3425 $motif =~ s/c/C/g;
|
|
3426 $motif =~ s/t/T/g;
|
|
3427 $motif =~ s/g/G/g;
|
|
3428 return $motif;
|
|
3429 }
|
|
3430
|
|
3431
|
|
3432 #xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx chromosome_unrand_breamultiSpecies_interruptedMicrosatHunterker xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx
|
|
3433
|
|
3434
|
|
3435 #xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx
|
|
3436 sub merge_interruptedMicrosats{
|
|
3437 # print "IN merge_interruptedMicrosats: @_\n";
|
|
3438 my $input0 = $_[0]; ######looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2"
|
|
3439 my $input1 = $_[1]; ###### the *_sput_op4_ii file
|
|
3440 my $input2 = $_[2]; ###### the *_sput_op4_ii file
|
|
3441 $no_of_species = $_[3];
|
|
3442
|
|
3443 my $output1 = $_[1]."_separate"; #$_[3]; ###### plain microsatellite file forward
|
|
3444 my $output2 = $_[2]."_separate"; ##$_[4]; ###### plain microsatellite file reverse
|
|
3445 my $output3 = $_[1]."_merged"; ##$_[5]; ###### plain microsatellite file forward
|
|
3446 #my $output4 = $_[2]."_merged"; ##$_[6]; ###### plain microsatellite file reverse
|
|
3447 #my $info = $_[4];
|
|
3448 #my @tags = split(/\t/,$info);
|
|
3449
|
|
3450 open(SEQ,"<$input0") or die "Cannot open file $input0 $!";
|
|
3451 open(INF,"<$input1") or die "Cannot open file $input1 $!";
|
|
3452 open(INR,"<$input2") or die "Cannot open file $input2 $!";
|
|
3453 open(OUTF,">$output1") or die "Cannot open file $output1 $!";
|
|
3454 open(OUTR,">$output2") or die "Cannot open file $output2 $!";
|
|
3455 open(MER,">$output3") or die "Cannot open file $output3 $!";
|
|
3456 #open(MERR,">$output4") or die "Cannot open file $output4 $!";
|
|
3457
|
|
3458
|
|
3459
|
|
3460 $printer = 0;
|
|
3461
|
|
3462 # print "files opened \n";
|
|
3463 $infocord = 2 + (4*$no_of_species) - 1;
|
|
3464 $startcord = 2 + (4*$no_of_species) + 2 - 1;
|
|
3465 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
|
|
3466 $endcord = 2 + (4*$no_of_species) + 4 - 1;
|
|
3467 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
|
|
3468 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
|
|
3469 $typecord = $infocord + 1;
|
|
3470 my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
|
|
3471
|
|
3472 $interrtypecord = $motifcord + 1;
|
|
3473 $interrcord = $motifcord + 2;
|
|
3474 $interr_poscord = $motifcord + 3;
|
|
3475 $no_of_interruptionscord = $motifcord + 4;
|
|
3476 $mergestarts = $no_of_interruptionscord+ 1;
|
|
3477 $mergeends = $no_of_interruptionscord+ 2;
|
|
3478 $mergemicros = $no_of_interruptionscord+ 3;
|
|
3479
|
|
3480 # NOW ADDING FORWARD MICROSATELLITES TO HASH
|
|
3481 my %fmicros = ();
|
|
3482 my $microcounter=0;
|
|
3483 my $linecounter = 0;
|
|
3484 while (my $line = <INF>){
|
|
3485 # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
|
|
3486 $linecounter++;
|
|
3487 if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
|
|
3488 my $key = join("\t",$1, $2, $4, $5);
|
|
3489 # print $key, "#-#-#-#-#-#-#-#\n";
|
|
3490 push (@{$fmicros{$key}},$line);
|
|
3491 $microcounter++;
|
|
3492 }
|
|
3493 else {
|
|
3494 #print $line;
|
|
3495 }
|
|
3496 }
|
|
3497 # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
|
|
3498 close INF;
|
|
3499 my @deletedlines = ();
|
|
3500 # print "done forward hash \n";
|
|
3501 $linecounter = 0;
|
|
3502 #---------------------------------------------------------------------------------------------------
|
|
3503 # NOW ADDING REVERSE MICROSATELLITES TO HASH
|
|
3504 my %rmicros = ();
|
|
3505 $microcounter=0;
|
|
3506 while (my $line = <INR>){
|
|
3507 # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n";
|
|
3508 $linecounter++;
|
|
3509 if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
|
|
3510 my $key = join("\t",$1, $2, $4, $5);
|
|
3511 # print $key, "#-#-#-#-#-#-#-#\n";
|
|
3512 push (@{$rmicros{$key}},$line);
|
|
3513 $microcounter++;
|
|
3514 }
|
|
3515 else {
|
|
3516 #print "cant make key\n";
|
|
3517 }
|
|
3518 }
|
|
3519 # print "number of reverse microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
|
|
3520 close INR;
|
|
3521 # print "done reverse hash \n";
|
|
3522 $linecounter = 0;
|
|
3523
|
|
3524 #------------------------------------------------------------------------------------------------
|
|
3525
|
|
3526 while(my $sine = <SEQ>){
|
|
3527 #<STDIN> if $sine =~ /16349128/;
|
|
3528 next if $sine !~ /[a-zA-Z0-9]/;
|
|
3529 # print "-" x 150, "\n" if $printer == 1;
|
|
3530 my @sields = split(/\t/,$sine);
|
|
3531 my @merged = ();
|
|
3532
|
|
3533 my $key = ();
|
|
3534
|
|
3535 if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
|
|
3536 $key = join("\t",$1, $2, $4, $5);
|
|
3537 # print $key, "<-<-<-<-<-<-<-<\n";
|
|
3538 }
|
|
3539 # print "key = $key\n";
|
|
3540
|
|
3541 my @sets1;
|
|
3542 my @sets2;
|
|
3543 chomp $sields[$sequencepos];
|
|
3544 my $rev_sequence = reverse($sields[$sequencepos]);
|
|
3545 $rev_sequence =~ s/ //g;
|
|
3546 $rev_sequence = " ".$rev_sequence;
|
|
3547 next if (!exists $fmicros{$key} && !exists $rmicros{$key});
|
|
3548
|
|
3549 if (exists $fmicros{$key}){
|
|
3550 # print "line no : $linecount\n";
|
|
3551 my @raw_microstring = @{$fmicros{$key}};
|
|
3552 my %starts = (); my %ends = ();
|
|
3553 # print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;}
|
|
3554 my @microstring=();
|
|
3555 for my $u (0 ... $#raw_microstring){
|
|
3556 my @tields = split(/\t/,$raw_microstring[$u]);
|
|
3557 next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]};
|
|
3558 push(@microstring, $raw_microstring[$u]);
|
|
3559 $starts{$tields[$startcord]} = $tields[$startcord];
|
|
3560 $ends{$tields[$endcord]} = $tields[$endcord];
|
|
3561 }
|
|
3562
|
|
3563 # print "founf microstring in forward\n: @microstring\n";
|
|
3564 chomp @microstring;
|
|
3565 my $clusterresult = (find_clusters(@microstring, $sields[$sequencepos]));
|
|
3566 @sets1 = split("\=", $clusterresult);
|
|
3567 my @temp = split(/_X0X_/,$sets1[0]) ; $microscanned+= scalar(@temp);
|
|
3568 # print "sets = ", join("<all\nmerged>", @sets1), "\n<<-sets1\n"; <STDIN>;
|
|
3569 } #if (exists $micros{$key}){
|
|
3570
|
|
3571 if (exists $rmicros{$key}){
|
|
3572 # print "line no : $linecount\n";
|
|
3573 my @raw_microstring = @{$rmicros{$key}};
|
|
3574 my %starts = (); my %ends = ();
|
|
3575 # print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;}
|
|
3576 my @microstring=();
|
|
3577 for my $u (0 ... $#raw_microstring){
|
|
3578 my @tields = split(/\t/,$raw_microstring[$u]);
|
|
3579 next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]};
|
|
3580 push(@microstring, $raw_microstring[$u]);
|
|
3581 $starts{$tields[$startcord]} = $tields[$startcord];
|
|
3582 $ends{$tields[$endcord]} = $tields[$endcord];
|
|
3583 }
|
|
3584 # print "founf microstring in reverse\n: @microstring\n"; <STDIN>;
|
|
3585 chomp @microstring;
|
|
3586 # print "sending reversed sequence\n";
|
|
3587 my $clusterresult = (find_clusters(@microstring, $rev_sequence ) );
|
|
3588 @sets2 = split("\=", $clusterresult);
|
|
3589 my @temp = split(/_X0X_/,$sets2[0]) ; $microscanned+= scalar(@temp);
|
|
3590 } #if (exists $micros{$key}){
|
|
3591
|
|
3592 my @popout1 = ();
|
|
3593 my @popout2 = ();
|
|
3594 my @forwardset = ();
|
|
3595 if (exists $sets2[1] ){
|
|
3596 if(exists $sets1[0]) {
|
|
3597 push (@popout1, $sets1[0],$sets2[1]);
|
|
3598 my @forwardset = split("=", popOuter(@popout1, $rev_sequence ));#
|
|
3599 print OUTF join("\n",split("_X0X_", $forwardset[0])), "\n";
|
|
3600 my @localmerged = split("_X0X_", $forwardset[1]);
|
|
3601 my $sequence = $sields[$sequencepos];
|
|
3602 $sequence =~ s/ //g;
|
|
3603 # print "\nforwardset = @forwardset\n";
|
|
3604 for my $j (0 ... $#localmerged){
|
|
3605 $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence));
|
|
3606 }
|
|
3607
|
|
3608 push (@merged, @localmerged);
|
|
3609
|
|
3610 }
|
|
3611 else{
|
|
3612 my @localmerged = split("_X0X_", $sets2[1]);
|
|
3613 my $sequence = $sields[$sequencepos];
|
|
3614 $sequence =~ s/ //g;
|
|
3615 for my $j (0 ... $#localmerged){
|
|
3616 # print "\nlocalmerged = @localmerged\n";
|
|
3617 $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence));
|
|
3618 }
|
|
3619
|
|
3620 push (@merged, @localmerged);
|
|
3621 }
|
|
3622 }
|
|
3623 elsif (exists $sets1[0]){
|
|
3624 print OUTF join("\n",split("_X0X_", $sets1[0])), "\n";
|
|
3625 }
|
|
3626
|
|
3627 my @reverseset= ();
|
|
3628 if (exists $sets1[1]){
|
|
3629 if (exists $sets2[0]){
|
|
3630 push (@popout2, $sets2[0],$sets1[1]);
|
|
3631 # print "popout2 = @popout2\n";
|
|
3632 my @reverseset = split("=", popOuter(@popout2, $sields[$sequencepos]));
|
|
3633 #print "reverseset = $reverseset[1] < --- reverseset1\n";
|
|
3634 print OUTR join("\n",split("_X0X_", $reverseset[0])), "\n";
|
|
3635 push(@merged, (split("_X0X_", $reverseset[1])));
|
|
3636 }
|
|
3637 else{
|
|
3638 push(@merged, (split("_X0X_", $sets1[1])));
|
|
3639 }
|
|
3640 }
|
|
3641 elsif (exists $sets2[0]){
|
|
3642 print OUTR join("\n",split("_X0X_", $sets2[0])), "\n";
|
|
3643
|
|
3644 }
|
|
3645
|
|
3646 if (scalar @merged > 0){
|
|
3647 my @filtered_merged = split("__",(filterDuplicates_merged(@merged)));
|
|
3648 print MER join("\n", @filtered_merged),"\n";
|
|
3649 }
|
|
3650 # <STDIN> if $sine =~ /16349128/;
|
|
3651
|
|
3652 }
|
|
3653 close(SEQ);
|
|
3654 close(INF);
|
|
3655 close(INR);
|
|
3656 close(OUTF);
|
|
3657 close(OUTR);
|
|
3658 close(MER);
|
|
3659
|
|
3660 }
|
|
3661 sub find_clusters{
|
|
3662 my @input = @_;
|
|
3663 my $sequence = pop(@input);
|
|
3664 $sequence =~ s/ //g;
|
|
3665 my @microstring0 = @input;
|
|
3666 # print "IN: find_clusters:\n";
|
|
3667 my %microstart=();
|
|
3668 my %microend=();
|
|
3669 my @nonmerged = ();
|
|
3670 my @mergedSet = ();
|
|
3671 # print "set of microsats = @microstring \n";
|
|
3672 my @microstring = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @microstring0;
|
|
3673 # print "microstring = ", join("\n",@microstring0) ," \n---->\n", join("\n", @microstring),"\n ,,+." if $printer == 1;
|
|
3674 #<STDIN> if $printer == 1;
|
|
3675 my @tempmicrostring = @microstring;
|
|
3676 foreach my $line (@tempmicrostring){
|
|
3677 my @fields = split(/\t/,$line);
|
|
3678 my $start = $fields[$startcord];
|
|
3679 my $end = $fields[$endcord];
|
|
3680 next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/;
|
|
3681 # print " starts >>> start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n";
|
|
3682 push (@{$microstart{$start}},$line);
|
|
3683 push (@{$microend{$end}},$line);
|
|
3684 }
|
|
3685 my $firstflag = 'down';
|
|
3686 while( my $line =shift(@microstring)){
|
|
3687 # print "-----------\nline = $line \n" if $printer == 1;
|
|
3688 chomp $line;
|
|
3689 my @fields = split(/\t/,$line);
|
|
3690 my $start = $fields[$startcord];
|
|
3691 my $end = $fields[$endcord];
|
|
3692 next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/ || $distance !~ /[0-9]+/ ;
|
|
3693 my $startmicro = $line;
|
|
3694 my $endmicro = $line;
|
|
3695 # print "start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n";
|
|
3696
|
|
3697 delete ($microstart{$start});
|
|
3698 delete ($microend{$end});
|
|
3699 my $flag = 'down';
|
|
3700 my $startflag = 'down';
|
|
3701 my $endflag = 'down';
|
|
3702 my $prestart = $start - $distance;
|
|
3703 my $postend = $end + $distance;
|
|
3704 my @compoundlines = ();
|
|
3705 my %compoundhash = ();
|
|
3706 push (@compoundlines, $line);
|
|
3707 push (@{$compoundhash{$line}},$line);
|
|
3708 my $startrank = 1;
|
|
3709 my $endrank = 1;
|
|
3710
|
|
3711 while( ($startflag eq "down") || ($endflag eq "down") ){
|
|
3712 # print "prestart=$prestart, post end =$postend.. seqlen =", length($sequence)," firstflag = $firstflag \n" if $printer == 1;
|
|
3713 if ( (($prestart < 0) && $firstflag eq "up") || (($postend > length($sequence) && $firstflag eq "up")) ){
|
|
3714 # print "coming to the end of sequence,post end = $postend and sequence length =", length($sequence)," so exiting\n" if $printer == 1;
|
|
3715 last;
|
|
3716 }
|
|
3717
|
|
3718 $firstflag = "up";
|
|
3719 if ($startflag eq "down"){
|
|
3720 for my $i ($prestart ... $end){
|
|
3721 if(exists $microend{$i}){
|
|
3722 chomp $microend{$i}[0];
|
|
3723 if(exists $compoundhash{$microend{$i}[0]}) {next;}
|
|
3724 chomp $microend{$i}[0];
|
|
3725 push(@compoundlines, $microend{$i}[0]);
|
|
3726 my @tields = split(/\t/,$microend{$i}[0]);
|
|
3727 $startmicro = $microend{$i}[0];
|
|
3728 chomp $startmicro;
|
|
3729 $flag = 'down';
|
|
3730 $startrank++;
|
|
3731 # print "deleting $microend{$i}[0] and $microstart{$tields[$startcord]}[0]\n" if $printer == 1;
|
|
3732 delete $microend{$i};
|
|
3733 delete $microstart{$tields[$startcord]};
|
|
3734 $end = $tields[$endcord];
|
|
3735 $startflag = 'down';
|
|
3736 $prestart = $tields[$startcord] - $distance;
|
|
3737 last;
|
|
3738 }
|
|
3739 else{
|
|
3740 $flag = 'up';
|
|
3741 $startflag = 'up';
|
|
3742 }
|
|
3743 }
|
|
3744 }
|
|
3745
|
|
3746 if ($endflag eq "down"){
|
|
3747
|
|
3748 for my $i ($start ... $postend){
|
|
3749 # print "$start ----> $i -----> $postend\n" if $printer == 1;
|
|
3750 if(exists $microstart{$i} ){
|
|
3751 chomp $microstart{$i}[0];
|
|
3752 if(exists $compoundhash{$microstart{$i}[0]}) {next;}
|
|
3753 chomp $microstart{$i}[0];
|
|
3754 push(@compoundlines, $microstart{$i}[0]);
|
|
3755 my @tields = split(/\t/,$microstart{$i}[0]);
|
|
3756 $endmicro = $microstart{$i}[0];
|
|
3757 $endrank++;
|
|
3758 chomp $endmicro;
|
|
3759 $flag = 'down';
|
|
3760 # print "deleting $microend{$tields[$endcord]}[0]\n" if $printer == 1;
|
|
3761
|
|
3762 delete $microstart{$i} if exists $microstart{$i} ;
|
|
3763 delete $microend{$tields[$endcord]} if exists $microend{$tields[$endcord]};
|
|
3764 # print "done\n" if $printer == 1;
|
|
3765
|
|
3766 shift @microstring;
|
|
3767 $end = $tields[$endcord];
|
|
3768 $postend = $tields[$endcord] + $distance;
|
|
3769 $endflag = 'down';
|
|
3770 last;
|
|
3771 }
|
|
3772 else{
|
|
3773 $flag = 'up';
|
|
3774 $endflag = 'up';
|
|
3775 }
|
|
3776 # print "out of the if\n" if $printer == 1;
|
|
3777 }
|
|
3778 # print "out of the for\n" if $printer == 1;
|
|
3779
|
|
3780 }
|
|
3781 # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n";
|
|
3782 } #end while( $flag eq "down")
|
|
3783 # print "compoundlines = @compoundlines \n" if $printer == 1;
|
|
3784
|
|
3785 if (scalar (@compoundlines) == 1){
|
|
3786 push(@nonmerged, $line);
|
|
3787
|
|
3788 }
|
|
3789 if (scalar (@compoundlines) > 1){
|
|
3790 # print "FROM CLUSTERER\n" if $printer == 1;
|
|
3791 push(@mergedSet,merge_microsats(@compoundlines, $sequence) );
|
|
3792 }
|
|
3793 } #end foreach my $line (@microstring){
|
|
3794 # print join("\n",@mergedSet),"<-----mergedSet\n" if $printer == 1;
|
|
3795 #<STDIN> if scalar(@mergedSet) > 0;
|
|
3796 # print "EXIT: find_clusters\n";
|
|
3797 return (join("_X0X_",@nonmerged). "=".join("_X0X_",@mergedSet));
|
|
3798 }
|
|
3799
|
|
3800 sub custom {
|
|
3801 $a->[$startcord+1] <=> $b->[$startcord+1];
|
|
3802 }
|
|
3803
|
|
3804 sub popOuter {
|
|
3805 # print "\nIN: popOuter @_\n"; <STDIN>;
|
|
3806 my @all = split ("_X0X_",$_[0]);
|
|
3807 # <STDIN> if !defined $_[0];
|
|
3808 my @merged = split ("_X0X_",$_[1]);
|
|
3809 my $sequence = $_[2];
|
|
3810 my $seqlen = length($sequence);
|
|
3811 my %microstart=();
|
|
3812 my %microend=();
|
|
3813 my @mergedSet = ();
|
|
3814 my @nonmerged = ();
|
|
3815
|
|
3816 foreach my $line (@all){
|
|
3817 my @fields = split(/\t/,$line);
|
|
3818 my $start = $seqlen - $fields[$startcord]+ 1;
|
|
3819 my $end = $seqlen - $fields[$endcord] + 1;
|
|
3820 push (@{$microstart{$start}},$line);
|
|
3821 push (@{$microend{$end}},$line);
|
|
3822 }
|
|
3823 my $firstflag = 'down';
|
|
3824 my %forPopouting = ();
|
|
3825
|
|
3826 while( my $line =shift(@merged)){
|
|
3827 # print "\n MErgedline: $line .. startcord = $startcord ... endcord = $endcord\n" ;
|
|
3828 chomp $line;
|
|
3829 my @fields = split(/\t/,$line);
|
|
3830 my $start = $fields[$startcord];
|
|
3831 my $end = $fields[$endcord];
|
|
3832 my $startmicro = $line;
|
|
3833 my $endmicro = $line;
|
|
3834
|
|
3835
|
|
3836 delete ($microstart{$start});
|
|
3837 delete ($microend{$end});
|
|
3838 my $flag = 'down';
|
|
3839 my $startflag = 'down';
|
|
3840 my $endflag = 'down';
|
|
3841 my $prestart = $start - $distance;
|
|
3842 my $postend = $end + $distance;
|
|
3843 my @compoundlines = ();
|
|
3844 my %compoundhash = ();
|
|
3845 push (@compoundlines, $line);
|
|
3846 my $startrank = 1;
|
|
3847 my $endrank = 1;
|
|
3848
|
|
3849 # print "\nstart = $start, end = $end\n";
|
|
3850 # <STDIN>;
|
|
3851 for my $i ($start ... $end){
|
|
3852 if(exists $microend{$i}){
|
|
3853 # print "\nmicrosat exists: $microend{$i}[0] microsat exists\n";
|
|
3854 chomp $microend{$i}[0];
|
|
3855 my @fields = split(/\t/,$microend{$i}[0]);
|
|
3856 delete $microstart{$seqlen - $fields[$startcord] + 1};
|
|
3857 my $invertseq = $sequence;
|
|
3858 $invertseq =~ s/ //g;
|
|
3859 push(@compoundlines, invert_microsat($microend{$i}[0] , $invertseq ));
|
|
3860 delete $microend{$i};
|
|
3861
|
|
3862 }
|
|
3863
|
|
3864 if(exists $microstart{$i} ){
|
|
3865 # print "\nmicrosat exists: $microstart{$i}[0] microsat exists\n";
|
|
3866
|
|
3867 chomp $microstart{$i}[0];
|
|
3868 my @fields = split(/\t/,$microstart{$i}[0]);
|
|
3869 delete $microend{$seqlen - $fields[$endcord] + 1};
|
|
3870 my $invertseq = $sequence;
|
|
3871 $invertseq =~ s/ //g;
|
|
3872 push(@compoundlines, invert_microsat($microstart{$i}[0], $invertseq) );
|
|
3873 delete $microstart{$i};
|
|
3874 }
|
|
3875 }
|
|
3876
|
|
3877 if (scalar (@compoundlines) == 1){
|
|
3878 push(@mergedSet,join("\t",@compoundlines) );
|
|
3879 }
|
|
3880 else {
|
|
3881 # print "FROM POPOUTER\n" if $printer == 1;
|
|
3882 push(@mergedSet, merge_microsats(@compoundlines, $sequence) );
|
|
3883 }
|
|
3884 }
|
|
3885
|
|
3886 foreach my $key (sort keys %microstart) {
|
|
3887 push(@nonmerged,$microstart{$key}[0]);
|
|
3888 }
|
|
3889
|
|
3890 return (join("_X0X_",@nonmerged). "=".join("_X0X_",@mergedSet) );
|
|
3891 }
|
|
3892
|
|
3893
|
|
3894
|
|
3895 sub invert_justCoordinates{
|
|
3896 my $microsat = $_[0];
|
|
3897 # print "IN invert_justCoordinates ... @_\n" ; <STDIN>;
|
|
3898 chomp $microsat;
|
|
3899 my $seqLength = $_[1];
|
|
3900 my @fields = split(/\t/,$microsat);
|
|
3901 my $start = $seqLength - $fields[$endcord] + 1;
|
|
3902 my $end = $seqLength - $fields[$startcord] + 1;
|
|
3903 $fields[$startcord] = $start;
|
|
3904 $fields[$endcord] = $end;
|
|
3905 $fields[$microsatcord] = reverse_micro($fields[$microsatcord]);
|
|
3906 # print "RETURNIG: ", join("\t",@fields), "\n" if $printer == 1;
|
|
3907 return join("\t",@fields);
|
|
3908 }
|
|
3909
|
|
3910 sub largest_number{
|
|
3911 my $counter = 0;
|
|
3912 my($max) = shift(@_);
|
|
3913 foreach my $temp (@_) {
|
|
3914 #print "finding largest array: $maxcounter \n";
|
|
3915 if($temp > $max){
|
|
3916 $max = $temp;
|
|
3917 }
|
|
3918 }
|
|
3919 return($max);
|
|
3920 }
|
|
3921 sub smallest_number{
|
|
3922 my $counter = 0;
|
|
3923 my($min) = shift(@_);
|
|
3924 foreach my $temp (@_) {
|
|
3925 #print "finding largest array: $maxcounter \n";
|
|
3926 if($temp < $min){
|
|
3927 $min = $temp;
|
|
3928 }
|
|
3929 }
|
|
3930 return($min);
|
|
3931 }
|
|
3932
|
|
3933
|
|
3934 sub filterDuplicates_merged{
|
|
3935 my @merged = @_;
|
|
3936 my %revmerged = ();
|
|
3937 my @fmerged = ();
|
|
3938 foreach my $micro (@merged) {
|
|
3939 my @fields = split(/\t/,$micro);
|
|
3940 if ($fields[3] =~ /chr[A-Z0-9a-z]+r/){
|
|
3941 my $key = join("_K0K_",$fields[1], $fields[$startcord], $fields[$endcord]);
|
|
3942 # print "adding ... \n$key\n$micro\n";
|
|
3943 push(@{$revmerged{$key}}, $micro);
|
|
3944 }
|
|
3945 else{
|
|
3946 # print "pushing.. $micro\n";
|
|
3947 push(@fmerged, $micro);
|
|
3948 }
|
|
3949 }
|
|
3950 # print "\n";
|
|
3951 foreach my $micro (@fmerged) {
|
|
3952 my @fields = split(/\t/,$micro);
|
|
3953 my $key = join("_K0K_",$fields[1], $fields[$startcord], $fields[$endcord]);
|
|
3954 # print "searching for key $key\n";
|
|
3955 if (exists $revmerged{$key}){
|
|
3956 # print "deleting $revmerged{$key}[0]\n";
|
|
3957 delete $revmerged{$key};
|
|
3958 }
|
|
3959 }
|
|
3960 foreach my $key (sort keys %revmerged) {
|
|
3961 push(@fmerged,$revmerged{$key}[0]);
|
|
3962 }
|
|
3963 # print "returning ", join("\n", @fmerged),"\n" ;
|
|
3964 return join("__", @fmerged);
|
|
3965 }
|
|
3966
|
|
3967 sub invert_microsat{
|
|
3968 my $micro = $_[0];
|
|
3969 chomp $micro;
|
|
3970 if ($micro =~ /chr[A-Z0-9a-z]+r/) { $micro =~ s/chr([0-9a-b]+)r/chr$1/g ;}
|
|
3971 else { $micro =~ s/chr([0-9a-b]+)/chr$1r/g ; }
|
|
3972 my $sequence = $_[1];
|
|
3973 $sequence =~ s/ //g;
|
|
3974 my $seqlen = length($sequence);
|
|
3975 my @fields = split(/\t/,$micro);
|
|
3976 my $start = $seqlen - $fields[$endcord] +1;
|
|
3977 my $end = $seqlen - $fields[$startcord] +1;
|
|
3978 $fields[$startcord] = $start;
|
|
3979 $fields[$endcord] = $end;
|
|
3980 $fields[$motifcord] = reverse_micro($fields[$motifcord]);
|
|
3981 $fields[$microsatcord] = reverse_micro($fields[$microsatcord]);
|
|
3982 if ($fields[$typecord] ne "compound" && exists $fields[$no_of_interruptionscord] ){
|
|
3983 my @intertypes = split(/,/,$fields[$interrtypecord]);
|
|
3984 my @inters = split(/,/,$fields[$interrcord]);
|
|
3985 my @interposes = split(/,/,$fields[$interr_poscord]);
|
|
3986 $fields[$interrtypecord] = join(",",reverse(@intertypes));
|
|
3987 $fields[$no_of_interruptionscord] = scalar(@interposes);
|
|
3988 for my $i (0 ... $fields[$no_of_interruptionscord]-1){
|
|
3989 if (exists $inters[$i] && $inters[$i] =~ /[a-zA-Z]/){
|
|
3990 $inters[$i] = reverse($inters[$i]);
|
|
3991 $interposes[$i] = $interposes[$i] + length($inters[$i]) - 1;
|
|
3992 }
|
|
3993 else{
|
|
3994 $inters[$i] = "";
|
|
3995 $interposes[$i] = $interposes[$i] - 1;
|
|
3996 }
|
|
3997 $interposes[$i] = ($end - $start + 1) - $interposes[$i] + 1;
|
|
3998 }
|
|
3999
|
|
4000 $fields[$interrcord] = join(",",reverse(@inters));
|
|
4001 $fields[$interr_poscord] = join(",",reverse(@interposes));
|
|
4002 }
|
|
4003
|
|
4004 my $finalmicrosat = join("\t", @fields);
|
|
4005 return $finalmicrosat;
|
|
4006
|
|
4007 }
|
|
4008 sub reverse_micro{
|
|
4009 my $micro = reverse($_[0]);
|
|
4010 my @strand = split(/\s*/,$micro);
|
|
4011 for my $i (0 ... $#strand){
|
|
4012 if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;}
|
|
4013 if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;}
|
|
4014 }
|
|
4015 return join("",@strand);
|
|
4016 }
|
|
4017
|
|
4018 #xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx
|
|
4019
|
|
4020
|
|
4021 #xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx
|
|
4022
|
|
4023 sub forward_reverse_sputoutput_comparer {
|
|
4024 # print "IN forward_reverse_sputoutput_comparer: @_\n";
|
|
4025 my $input0 = $_[0]; ###### the *nogap_unrand_match file
|
|
4026 my $input1 = $_[1]; ###### the real file, *sput* data
|
|
4027 my $input2 = $_[2]; ###### the reverse file, *sput* data
|
|
4028 my $output1 = $_[3]; ###### microsats different in real file
|
|
4029 my $output2 = $_[4]; ###### microsats missing in real file
|
|
4030 my $output3 = $_[5]; ###### microsats common among real and reverse file
|
|
4031 my $no_of_species = $_[6];
|
|
4032
|
|
4033 $infocord = 2 + (4*$no_of_species) - 1;
|
|
4034 $typecord = 2 + (4*$no_of_species) + 1 - 1;
|
|
4035 $startcord = 2 + (4*$no_of_species) + 2 - 1;
|
|
4036 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
|
|
4037 $endcord = 2 + (4*$no_of_species) + 4 - 1;
|
|
4038 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
|
|
4039 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
|
|
4040 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
|
|
4041 $interrtypecord = $motifcord + 1;
|
|
4042 $interrcord = $motifcord + 2;
|
|
4043 $interr_poscord = $motifcord + 3;
|
|
4044 $no_of_interruptionscord = $motifcord + 4;
|
|
4045 $mergestarts = $no_of_interruptionscord+ 1;
|
|
4046 $mergeends = $no_of_interruptionscord+ 2;
|
|
4047 $mergemicros = $no_of_interruptionscord+ 3;
|
|
4048
|
|
4049
|
|
4050 open(SEQ,"<$input0") or die "Cannot open file $input0 $!";
|
|
4051 open(INF,"<$input1") or die "Cannot open file $input1 $!";
|
|
4052 open(INR,"<$input2") or die "Cannot open file $input2 $!";
|
|
4053
|
|
4054 open(DIFF,">$output1") or die "Cannot open file $output1 $!";
|
|
4055 #open(MISS,">$output2") or die "Cannot open file $output2 $!";
|
|
4056 open(SAME,">$output3") or die "Cannot open file $output3 $!";
|
|
4057
|
|
4058
|
|
4059 # print "opened files \n";
|
|
4060 my $linecounter = 0;
|
|
4061 my $fcounter = 0;
|
|
4062 my $rcounter = 0;
|
|
4063
|
|
4064 $printer = 0;
|
|
4065 #---------------------------------------------------------------------------------------------------
|
|
4066 # NOW ADDING FORWARD MICROSATELLITES TO HASH
|
|
4067 my %fmicros = ();
|
|
4068 my $microcounter=0;
|
|
4069 while (my $line = <INF>){
|
|
4070 $linecounter++;
|
|
4071 if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
|
|
4072 my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12);
|
|
4073 # print $key, "#-#-#-#-#-#-#-#\n";
|
|
4074 push (@{$fmicros{$key}},$line);
|
|
4075 $microcounter++;
|
|
4076 }
|
|
4077 else {
|
|
4078 #print $line;
|
|
4079 }
|
|
4080 }
|
|
4081 # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
|
|
4082 close INF;
|
|
4083 my @deletedlines = ();
|
|
4084 # print "done forward hash \n";
|
|
4085 $linecounter = 0;
|
|
4086 #---------------------------------------------------------------------------------------------------
|
|
4087 # NOW ADDING REVERSE MICROSATELLITES TO HASH
|
|
4088 my %rmicros = ();
|
|
4089 $microcounter=0;
|
|
4090 while (my $line = <INR>){
|
|
4091 $linecounter++;
|
|
4092 if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
|
|
4093 my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12);
|
|
4094 # print $key, "#-#-#-#-#-#-#-#\n";
|
|
4095 push (@{$rmicros{$key}},$line);
|
|
4096 $microcounter++;
|
|
4097 }
|
|
4098 else {}
|
|
4099 }
|
|
4100 # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n";
|
|
4101 close INR;
|
|
4102 # print "done reverse hash \n";
|
|
4103 $linecounter = 0;
|
|
4104 #---------------------------------------------------------------------------------------------------
|
|
4105 #---------------------------------------------------------------------------------------------------
|
|
4106 # NOW READING THE SEQUENCE FILE
|
|
4107 while(my $sine = <SEQ>){
|
|
4108 my %microstart=();
|
|
4109 my %microend=();
|
|
4110 my @sields = split(/\t/,$sine);
|
|
4111 my $key = ();
|
|
4112 if ($sine =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) {
|
|
4113 $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12);
|
|
4114 }
|
|
4115 else {
|
|
4116 next;
|
|
4117 }
|
|
4118 $printer = 0;
|
|
4119 my $sequence = $sields[$sequencepos];
|
|
4120 chomp $sequence;
|
|
4121 $sequence =~ s/ //g;
|
|
4122 my @localfs = ();
|
|
4123 my @localrs = ();
|
|
4124
|
|
4125 if (exists $fmicros{$key}){
|
|
4126 @localfs = @{$fmicros{$key}};
|
|
4127 delete $fmicros{$key};
|
|
4128 }
|
|
4129
|
|
4130 my %forwardstarts = ();
|
|
4131 my %forwardends = ();
|
|
4132
|
|
4133 foreach my $f (@localfs){
|
|
4134 my @fields = split(/\t/,$f);
|
|
4135 push (@{$forwardstarts{$fields[$startcord]}},$f);
|
|
4136 push (@{$forwardends{$fields[$endcord]}},$fields[$startcord]);
|
|
4137 }
|
|
4138
|
|
4139 if (exists $rmicros{$key}){
|
|
4140 @localrs = @{$rmicros{$key}};
|
|
4141 delete $rmicros{$key};
|
|
4142 }
|
|
4143 else{
|
|
4144 }
|
|
4145
|
|
4146 foreach my $r (@localrs){
|
|
4147 chomp $r;
|
|
4148 my @rields = split(/\t/,$r);
|
|
4149 # print "rields = @rields\n" if $printer == 1;
|
|
4150 my $reciprocalstart = length($sequence) - $rields[$endcord] + 1;
|
|
4151 my $reciprocalend = length($sequence) - $rields[$startcord] + 1;
|
|
4152 # print "reciprocal start = $reciprocalstart = ",length($sequence)," - $rields[$endcord] + 1\n" if $printer == 1;
|
|
4153 my $microsat = reverse_micro(all_caps($rields[$microsatcord]));
|
|
4154 my @localcollection=();
|
|
4155 for my $i ($reciprocalstart+1 ... $reciprocalend-1){
|
|
4156 if (exists $forwardstarts{$i}){
|
|
4157 push(@localcollection, $forwardstarts{$i}[0] );
|
|
4158 delete $forwardstarts{$i};
|
|
4159 }
|
|
4160 if (exists $forwardends{$i}){
|
|
4161 next if !exists $forwardstarts{$forwardends{$i}[0]};
|
|
4162 push(@localcollection, $forwardstarts{$forwardends{$i}[0]}[0] );
|
|
4163 }
|
|
4164 }
|
|
4165 if (exists $forwardstarts{$reciprocalstart} && exists $forwardends{$reciprocalend}) {push(@localcollection,$forwardstarts{$reciprocalstart}[0]);}
|
|
4166
|
|
4167 if (scalar(@localcollection) == 0){
|
|
4168 print SAME invert_microsat($r,($sequence) ), "\n";
|
|
4169 }
|
|
4170
|
|
4171 elsif (scalar(@localcollection) == 1){
|
|
4172 # print "f microsat = $localcollection[0]\n" if $printer == 1;
|
|
4173 my @lields = split(/\t/,$localcollection[0]);
|
|
4174 $lields[$microsatcord]=all_caps($lields[$microsatcord]);
|
|
4175 # print "comparing: $microsat and $lields[$microsatcord]\n" if $printer == 1;
|
|
4176 # print "coordinates are: $lields[$startcord]-$lields[$endcord] and $reciprocalstart-$reciprocalend\n" if $printer == 1;
|
|
4177 if ($microsat eq $lields[$microsatcord]){
|
|
4178 chomp $localcollection[0];
|
|
4179 print SAME $localcollection[0], "\n";
|
|
4180 }
|
|
4181 if ($microsat ne $lields[$microsatcord]){
|
|
4182 chomp $localcollection[0];
|
|
4183 my $newmicro = microsatChooser(join("\t",@lields), join("\t",@rields), $sequence);
|
|
4184 # print "newmicro = $newmicro\n" if $printer == 1;
|
|
4185 if ($newmicro =~ /[a-zA-Z]/){
|
|
4186 print SAME $newmicro,"\n";
|
|
4187 }
|
|
4188 else{
|
|
4189 print DIFF join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n";
|
|
4190 # print join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n" if $printer == 1;
|
|
4191 # print "@rields\n@lields\n" if $printer == 1;
|
|
4192 }
|
|
4193 }
|
|
4194 }
|
|
4195 else{
|
|
4196 # print "multiple found for $r --> ", join("\t",@localcollection),"\n" if $printer == 1;
|
|
4197 }
|
|
4198 }
|
|
4199 }
|
|
4200
|
|
4201 close(SEQ);
|
|
4202 close(INF);
|
|
4203 close(INR);
|
|
4204 close(DIFF);
|
|
4205 close(SAME);
|
|
4206
|
|
4207 }
|
|
4208 sub all_caps{
|
|
4209 my @strand = split(/\s*/,$_[0]);
|
|
4210 for my $i (0 ... $#strand){
|
|
4211 if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;}
|
|
4212 if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;}
|
|
4213 if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;}
|
|
4214 if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;}
|
|
4215 }
|
|
4216 return join("",@strand);
|
|
4217 }
|
|
4218 sub microsatChooser{
|
|
4219 my $forward = $_[0];
|
|
4220 my $reverse = $_[1];
|
|
4221 my $sequence = $_[2];
|
|
4222 my $seqLength = length($sequence);
|
|
4223 $sequence =~ s/ //g;
|
|
4224 my @fields = split(/\t/,$forward);
|
|
4225 my @rields = split(/\t/,$reverse);
|
|
4226 my $r_start = $seqLength - $rields[$endcord] + 1;
|
|
4227 my $r_end = $seqLength - $rields[$startcord] + 1;
|
|
4228
|
|
4229
|
|
4230 my $f_microsat = $fields[$microsatcord];
|
|
4231 my $r_microsat = $rields[$microsatcord];
|
|
4232
|
|
4233 if ($fields[$typecord] =~ /\./ && $rields[$typecord] =~ /\./) {
|
|
4234 return $forward if length($f_microsat) >= length($r_microsat);
|
|
4235 return invert_microsat($reverse, $sequence) if length($f_microsat) < length($r_microsat);
|
|
4236 }
|
|
4237 return $forward if all_caps($fields[$motifcord]) eq all_caps($rields[$motifcord]) && $fields[$startcord] == $rields[$startcord] && $fields[$endcord] == $rields[$endcord];
|
|
4238
|
|
4239 my $f_microsat_copy = $f_microsat;
|
|
4240 my $r_microsat_copy = $r_microsat;
|
|
4241 $f_microsat_copy =~ s/^\[|\]$//g;
|
|
4242 $r_microsat_copy =~ s/^\[|\]$//g;
|
|
4243
|
|
4244 my @f_microields = split(/\][a-zA-Z]*\[/,$f_microsat_copy);
|
|
4245 my @r_microields = split(/\][a-zA-Z]*\[/,$r_microsat_copy);
|
|
4246 my @f_intields = split(/\][a-zA-Z]*\[/,$f_microsat_copy);
|
|
4247 my @r_intields = split(/\][a-zA-Z]*\[/,$r_microsat_copy);
|
|
4248
|
|
4249 my $f_motif = $fields[$motifcord];
|
|
4250 my $r_motif = $rields[$motifcord];
|
|
4251 my $f_motif_copy = $f_motif;
|
|
4252 my $r_motif_copy = $r_motif;
|
|
4253 $f_motif_copy =~ s/^\[|\]$//g;
|
|
4254 $r_motif_copy =~ s/^\[|\]$//g;
|
|
4255
|
|
4256 my @f_motields = split(/\]\[/,$f_motif_copy);
|
|
4257 my @r_motields = split(/\]\[/,$r_motif_copy);
|
|
4258
|
|
4259 my $f_purestretch = join("",@f_microields);
|
|
4260 my $r_purestretch = join("",@r_microields);
|
|
4261
|
|
4262 if ($fields[$typecord]=~/nucleotide/ && $rields[$typecord]=~/nucleotide/){
|
|
4263 # print "now.. studying $forward\n$reverse\n" if $printer == 1;
|
|
4264 if ($fields[$typecord] eq $rields[$typecord]){
|
|
4265 # print "comparing motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1;
|
|
4266
|
|
4267 if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){
|
|
4268 my $subset_answer = isSubset($forward, $reverse, $seqLength);
|
|
4269 # print "subset answer = $subset_answer\n" if $printer == 1;
|
|
4270 return $forward if $subset_answer == 1;
|
|
4271 return invert_microsat($reverse, $sequence) if $subset_answer == 2;
|
|
4272 return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch);
|
|
4273 return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch);
|
|
4274 return $forward if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) >= length($r_purestretch);
|
|
4275 return invert_microsat($reverse, $sequence) if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) < length($r_purestretch);
|
|
4276 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence) if $subset_answer == 3 ;
|
|
4277 }
|
|
4278 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){
|
|
4279 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
|
|
4280 }
|
|
4281 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){
|
|
4282 return $forward;
|
|
4283 }
|
|
4284 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){
|
|
4285 return invert_microsat($reverse, $sequence);
|
|
4286 }
|
|
4287 }
|
|
4288 else{
|
|
4289 my $fmotlen = ();
|
|
4290 my $rmotlen = ();
|
|
4291 $fmotlen =1 if $fields[$typecord] eq "mononucleotide";
|
|
4292 $fmotlen =2 if $fields[$typecord] eq "dinucleotide";
|
|
4293 $fmotlen =3 if $fields[$typecord] eq "trinucleotide";
|
|
4294 $fmotlen =4 if $fields[$typecord] eq "tetranucleotide";
|
|
4295 $rmotlen =1 if $rields[$typecord] eq "mononucleotide";
|
|
4296 $rmotlen =2 if $rields[$typecord] eq "dinucleotide";
|
|
4297 $rmotlen =3 if $rields[$typecord] eq "trinucleotide";
|
|
4298 $rmotlen =4 if $rields[$typecord] eq "tetranucleotide";
|
|
4299
|
|
4300 if ($fmotlen < $rmotlen){
|
|
4301 if (abs($fields[$startcord] - $r_start) <= $fmotlen || abs($fields[$endcord] - $r_end) <= $fmotlen ){
|
|
4302 return $forward;
|
|
4303 }
|
|
4304 else{
|
|
4305 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
|
|
4306 }
|
|
4307 }
|
|
4308 if ($fmotlen > $rmotlen){
|
|
4309 if (abs($fields[$startcord] - $r_start) <= $rmotlen || abs($fields[$endcord] - $r_end) <= $rmotlen){
|
|
4310 return invert_microsat($reverse, $sequence);
|
|
4311 }
|
|
4312 else{
|
|
4313 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
|
|
4314 }
|
|
4315 }
|
|
4316 }
|
|
4317 }
|
|
4318 if ($fields[$typecord] eq "compound" && $rields[$typecord] eq "compound"){
|
|
4319 # print "comparing compound motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1;
|
|
4320 if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){
|
|
4321 my $subset_answer = isSubset($forward, $reverse, $seqLength);
|
|
4322 # print "subset answer = $subset_answer\n" if $printer == 1;
|
|
4323 return $forward if $subset_answer == 1;
|
|
4324 return invert_microsat($reverse, $sequence) if $subset_answer == 2;
|
|
4325 # print length($f_purestretch) ,">", length($r_purestretch)," \n" if $printer == 1;
|
|
4326 return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch);
|
|
4327 return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch);
|
|
4328 if ($subset_answer == 3){
|
|
4329 if ($fields[$startcord] < $r_start || $fields[$endcord] > $r_end){
|
|
4330 if (abs($fields[$startcord] - $r_start) < length($f_motields[0]) || abs($fields[$endcord] - $r_end) < length($f_motields[$#f_motields]) ){
|
|
4331 return $forward;
|
|
4332 }
|
|
4333 else{
|
|
4334 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
|
|
4335 }
|
|
4336 }
|
|
4337 if ($fields[$startcord] > $r_start || $fields[$endcord] < $r_end){
|
|
4338 if (abs($fields[$startcord] - $r_start) < length($r_motields[0]) || abs($fields[$endcord] - $r_end) < length($r_motields[$#r_motields]) ){
|
|
4339 return invert_microsat($reverse, $sequence);
|
|
4340 }
|
|
4341 else{
|
|
4342 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
|
|
4343 }
|
|
4344 }
|
|
4345 }
|
|
4346 }
|
|
4347 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){
|
|
4348 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
|
|
4349 }
|
|
4350 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){
|
|
4351 return $forward;
|
|
4352 }
|
|
4353 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){
|
|
4354 return invert_microsat($reverse, $sequence);
|
|
4355 }
|
|
4356
|
|
4357 }
|
|
4358
|
|
4359 if ($fields[$typecord] eq "compound" && $rields[$typecord] =~ /nucleotide/){
|
|
4360 # print "one compound, one nucleotide\n" if $printer == 1;
|
|
4361 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
|
|
4362 }
|
|
4363 if ($fields[$typecord] =~ /nucleotide/ && $rields[$typecord]eq "compound"){
|
|
4364 # print "one compound, one nucleotide\n" if $printer == 1;
|
|
4365 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence);
|
|
4366 }
|
|
4367 }
|
|
4368
|
|
4369 sub isSubset{
|
|
4370 my $forward = $_[0]; my @fields = split(/\t/,$forward);
|
|
4371 my $reverse = $_[1]; my @rields = split(/\t/,$reverse);
|
|
4372 my $seqLength = $_[2];
|
|
4373 my $r_start = $seqLength - $rields[$endcord] + 1;
|
|
4374 my $r_end = $seqLength - $rields[$startcord] + 1;
|
|
4375 # print "we have $fields[$startcord] -> $fields[$endcord] && $r_start -> $r_end\n" if $printer == 1;
|
|
4376 return "0" if $fields[$startcord] == $r_start && $fields[$endcord] == $r_end;
|
|
4377 return "1" if $fields[$startcord] <= $r_start && $fields[$endcord] >= $r_end;
|
|
4378 return "2" if $r_start <= $fields[$startcord] && $r_end >= $fields[$endcord];
|
|
4379 return "3";
|
|
4380 }
|
|
4381
|
|
4382 sub motifBYmotif_match{
|
|
4383 my $forward = $_[0];
|
|
4384 my $reverse = $_[1];
|
|
4385 $forward =~ s/^\[|\]$//g;
|
|
4386 $reverse =~ s/^\[|\]$//g;
|
|
4387 my @f_motields=split(/\]\[/, $forward);
|
|
4388 my @r_motields=split(/\]\[/, $reverse);
|
|
4389 my $finalresult = 0;
|
|
4390
|
|
4391 if (scalar(@f_motields) != scalar(@r_motields)){
|
|
4392 my $subresult = 0;
|
|
4393 my @mega = (); my @sub = ();
|
|
4394 @mega = @f_motields if scalar(@f_motields) > scalar(@r_motields);
|
|
4395 @sub = @f_motields if scalar(@f_motields) > scalar(@r_motields);
|
|
4396 @mega = @r_motields if scalar(@f_motields) < scalar(@r_motields);
|
|
4397 @sub = @r_motields if scalar(@f_motields) < scalar(@r_motields);
|
|
4398
|
|
4399 for my $i (0 ... $#sub){
|
|
4400 my $probe = $sub[$i].$sub[$i];
|
|
4401 # print "probing $probe and $mega[$i]\n" if $printer == 1;
|
|
4402 if ($probe =~ /$mega[$i]/) {$subresult = 1; }
|
|
4403 else {$subresult = 0; last; }
|
|
4404 }
|
|
4405
|
|
4406 return 0 if $subresult == 0;
|
|
4407 return 2 if $subresult == 1 && scalar(@f_motields) > scalar(@r_motields); # r is subset of f
|
|
4408 return 3 if $subresult == 1 && scalar(@f_motields) < scalar(@r_motields); # ^reverse
|
|
4409
|
|
4410 }
|
|
4411 else{
|
|
4412 for my $i (0 ... $#f_motields){
|
|
4413 my $probe = $f_motields[$i].$f_motields[$i];
|
|
4414 if ($probe =~ /$r_motields[$i]/) {$finalresult = 1 ;}
|
|
4415 else {$finalresult = 0 ;last;}
|
|
4416 }
|
|
4417 }
|
|
4418 # print "finalresult = $finalresult\n" if $printer == 1;
|
|
4419 return $finalresult;
|
|
4420 }
|
|
4421
|
|
4422 sub merge_microsats{
|
|
4423 my @input = @_;
|
|
4424 my $sequence = pop(@input);
|
|
4425 $sequence =~ s/ //g;
|
|
4426 my @seq_string = @input;
|
|
4427 # print "IN: merge_microsats\n";
|
|
4428 # print "recieved for merging: ", join("\n", @seq_string), "\nsequence = $sequence\n";
|
|
4429 my $start;
|
|
4430 my $end;
|
|
4431 my @micros = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @seq_string;
|
|
4432 # print "\nrearranged into @micros \n";
|
|
4433 my (@motifs, @microsats, @interruptiontypes, @interruptions, @interrposes, @no_of_interruptions, @types, @starts, @ends, @mergestart, @mergeend, @mergemicro) = ();
|
|
4434 my @fields = ();
|
|
4435 for my $i (0 ... $#micros){
|
|
4436 chomp $micros[$i];
|
|
4437 @fields = split(/\t/,$micros[$i]);
|
|
4438 push(@types, $fields[$typecord]);
|
|
4439 push(@motifs, $fields[$motifcord]);
|
|
4440
|
|
4441 if (exists $fields[$interrtypecord]){ push(@interruptiontypes, $fields[$interrtypecord]);}
|
|
4442 else { push(@interruptiontypes, "NA"); }
|
|
4443 if (exists $fields[$interrcord]) {push(@interruptions, $fields[$interrcord]);}
|
|
4444 else { push(@interruptions, "NA"); }
|
|
4445 if (exists $fields[$interr_poscord]) { push(@interrposes, $fields[$interr_poscord]);}
|
|
4446 else { push(@interrposes, "NA"); }
|
|
4447 if (exists $fields[$no_of_interruptionscord]) {push(@no_of_interruptions, $fields[$no_of_interruptionscord]);}
|
|
4448 else { push(@no_of_interruptions, "NA"); }
|
|
4449 if(exists $fields[$mergestarts]) { @mergestart = (@mergestart, split(/\./,$fields[$mergestarts]));}
|
|
4450 else { push(@mergestart, $fields[$startcord]); }
|
|
4451 if(exists $fields[$mergeends]) { @mergeend = (@mergeend, split(/\./,$fields[$mergeends]));}
|
|
4452 else { push(@mergeend, $fields[$endcord]); }
|
|
4453 if(exists $fields[$mergemicros]) { push(@mergemicro, $fields[$mergemicros]);}
|
|
4454 else { push(@mergemicro, $fields[$microsatcord]); }
|
|
4455
|
|
4456
|
|
4457 }
|
|
4458 $start = smallest_number(@mergestart);
|
|
4459 $end = largest_number(@mergeend);
|
|
4460 my $microsat_entry = "[".substr( $sequence, $start-1, ($end - $start + 1) )."]";
|
|
4461 my $microsat = join("\t", @fields[0 ... $infocord], join(".", @types), $start, $fields[$strandcord], $end, $microsat_entry , join(".", @motifs), join(".", @interruptiontypes),join(".", @interruptions),join(".", @interrposes),join(".", @no_of_interruptions), join(".", @mergestart), join(".", @mergeend) , join(".", @mergemicro));
|
|
4462 return $microsat;
|
|
4463 }
|
|
4464
|
|
4465 sub slided_microsat{
|
|
4466 my $forward = $_[0]; my @fields = split(/\t/,$forward);
|
|
4467 my $reverse = $_[1]; my @rields = split(/\t/,$reverse);
|
|
4468 my $seqLength = $_[2];
|
|
4469 my $r_start = $seqLength - $rields[$endcord] + 1;
|
|
4470 my $r_end = $seqLength - $rields[$startcord] + 1;
|
|
4471 my $motlen =();
|
|
4472 $motlen =1 if $fields[$typecord] eq "mononucleotide";
|
|
4473 $motlen =2 if $fields[$typecord] eq "dinucleotide";
|
|
4474 $motlen =3 if $fields[$typecord] eq "trinucleotide";
|
|
4475 $motlen =4 if $fields[$typecord] eq "tetranucleotide";
|
|
4476
|
|
4477 if (abs($fields[$startcord] - $r_start) < $motlen || abs($fields[$endcord] - $r_end) < $motlen ) {
|
|
4478 return 0;
|
|
4479 }
|
|
4480 else{
|
|
4481 return 1;
|
|
4482 }
|
|
4483
|
|
4484 }
|
|
4485
|
|
4486 #xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx
|
|
4487
|
|
4488
|
|
4489
|
|
4490 #xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx
|
|
4491 sub new_multispecies_t10{
|
|
4492 my $input1 = $_[0]; #gap_op_unrand_match
|
|
4493 my $input2 = $_[1]; #sput
|
|
4494 my $output = $_[2]; #output
|
|
4495 my $bin = $output."_bin";
|
|
4496 my $orgs = join("|",split(/\./,$_[3]));
|
|
4497 my @organisms = split(/\./,$_[3]);
|
|
4498 my $no_of_species = scalar(@organisms); #3
|
|
4499 my $t10info = $output."_info";
|
|
4500 $prinkter = 0;
|
|
4501
|
|
4502 open (MATCH, "<$input1");
|
|
4503 open (SPUT, "<$input2");
|
|
4504 open (OUT, ">$output");
|
|
4505 open (INFO, ">$t10info");
|
|
4506
|
|
4507
|
|
4508 sub microsat_bracketer;
|
|
4509 sub custom;
|
|
4510 my %seen = ();
|
|
4511 $infocord = 2 + (4*$no_of_species) - 1;
|
|
4512 $typecord = 2 + (4*$no_of_species) + 1 - 1;
|
|
4513 $startcord = 2 + (4*$no_of_species) + 2 - 1;
|
|
4514 $strandcord = 2 + (4*$no_of_species) + 3 - 1;
|
|
4515 $endcord = 2 + (4*$no_of_species) + 4 - 1;
|
|
4516 $microsatcord = 2 + (4*$no_of_species) + 5 - 1;
|
|
4517 $motifcord = 2 + (4*$no_of_species) + 6 - 1;
|
|
4518 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
|
|
4519 #---------------------------------------------------------------------------------------------------------------#
|
|
4520 # MAKING A HASH FROM SPUT, WITH HASH KEYS GENERATED BELOW AND SEQUENCES STORED AS VALUES #
|
|
4521 #---------------------------------------------------------------------------------------------------------------#
|
|
4522 my $linecounter = 0;
|
|
4523 my $microcounter = 0;
|
|
4524 while (my $line = <SPUT>){
|
|
4525 chomp $line;
|
|
4526 # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n";
|
|
4527 next if $line !~ /[0-9a-z]+/;
|
|
4528 $linecounter++;
|
|
4529 # my $key = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13);
|
|
4530 # print $key, "#-#-#-#-#-#-#-#\n";
|
|
4531 if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
|
|
4532 my $key = join("\t",$1, $2, $3, $4, $5);
|
|
4533 # print "key = $key\n" if $prinkter == 1;
|
|
4534 push (@{$seen{$key}},$line);
|
|
4535 $microcounter++;
|
|
4536 }
|
|
4537 else {
|
|
4538 #print "could not make ker in SPUT : \n$line \n";
|
|
4539 }
|
|
4540 }
|
|
4541 # print "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n";
|
|
4542 # print INFO "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n";
|
|
4543 close SPUT;
|
|
4544
|
|
4545 #----------------------------------------------------------------------------------------------------------------
|
|
4546
|
|
4547 #-------------------------------------------------------------------------------------------------------#
|
|
4548 # THE ENTIRE CODE BELOW IS DEVOTED TO GENERATING HASH KEYS FROM MATCH FOLLOWED BY #
|
|
4549 # USING THESE HASH KEYS TO CORRESPOND EACH SEQUENCE IN FIRST FILE TO ITS MICROSAT REPEATS IN #
|
|
4550 # SECOND FILE FOLLOWED BY #
|
|
4551 # FINDING THE EXACT LOCATION OF EACH MICROSAT REPEAT WITHIN EACH SEQUENCE USING THE 'index' FUNCTION #
|
|
4552 #-------------------------------------------------------------------------------------------------------#
|
|
4553 my $ref = 0;
|
|
4554 my $ref2 = 0;
|
|
4555 my $ref3 = 0;
|
|
4556 my $ref4 = 0;
|
|
4557 my $deletes= 0;
|
|
4558 my $duplicates = 0;
|
|
4559 my $neighbors = 0;
|
|
4560 my $tooshort = 0;
|
|
4561 my $prevmicrol=();
|
|
4562 my $startnotfound = 0;
|
|
4563 my $matchkeysformed = 0;
|
|
4564 my $keysused = 0;
|
|
4565
|
|
4566 while (my $line = <MATCH>) {
|
|
4567 # print colored ['magenta'], $line if $prinkter == 1;
|
|
4568 next if $line !~ /[a-zA-Z0-9]/;
|
|
4569 chomp $line;
|
|
4570 my @fields2 = split(/\t/,$line);
|
|
4571 my $key2 = ();
|
|
4572 # $key2 = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13);
|
|
4573 if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) {
|
|
4574 $matchkeysformed++;
|
|
4575 $key2 = join("\t",$1, $2, $3, $4, $5);
|
|
4576 # print "key = $key2 \n" if $prinkter == 1;
|
|
4577 }
|
|
4578 else{
|
|
4579 # print "could not make ker in SEQ : $line\n";
|
|
4580 next;
|
|
4581 }
|
|
4582 my $sequence = $fields2[$sequencepos];
|
|
4583 $sequence =~ s/\*/-/g;
|
|
4584 my $count = 0;
|
|
4585 if (exists $seen{$key2}){
|
|
4586 $keysused++;
|
|
4587 my @unsorted_raw = @{$seen{$key2}};
|
|
4588 delete $seen{$key2};
|
|
4589 my @sequencearr = split(/\s*/, $sequence);
|
|
4590
|
|
4591 # print "sequencearr = @sequencearr\n" if $prinkter == 1;
|
|
4592
|
|
4593 my $counter;
|
|
4594
|
|
4595 my %start_database = ();
|
|
4596 my %end_database = ();
|
|
4597 foreach my $uns (@unsorted_raw){
|
|
4598 my @uields = split(/\t/,$uns);
|
|
4599 $start_database{$uields[$startcord]} = $uns;
|
|
4600 $end_database{$uields[$endcord]} = $uns;
|
|
4601 }
|
|
4602
|
|
4603 my @unsorted = ();
|
|
4604 my %starts = (); my %ends = ();
|
|
4605 # print colored ['yellow'],"unsorted, unfiltered microats = \n" if $prinkter == 1; foreach (@unsorted_raw) {print colored ['blue'],$_,"\n" if $prinkter == 1;}
|
|
4606 for my $u (0 ... $#unsorted_raw){
|
|
4607 my @tields = split(/\t/,$unsorted_raw[$u]);
|
|
4608 next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]};
|
|
4609 push(@unsorted, $unsorted_raw[$u]);
|
|
4610 $starts{$tields[$startcord]} = $unsorted_raw[$u];
|
|
4611 # print "in starts : $tields[$startcord] -> $unsorted_raw[$u]\n" if $prinkter == 1;
|
|
4612 }
|
|
4613
|
|
4614 my $basecounter= 0;
|
|
4615 my $gapcounter = 0;
|
|
4616 my $poscounter = 0;
|
|
4617
|
|
4618 for my $s (@sequencearr){
|
|
4619
|
|
4620 $poscounter++;
|
|
4621 if ($s eq "-"){
|
|
4622 $gapcounter++; next;
|
|
4623 }
|
|
4624 else{
|
|
4625 $basecounter++;
|
|
4626 }
|
|
4627
|
|
4628
|
|
4629 #print "s = $s, poscounter = $poscounter, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1;
|
|
4630 #print "s = $s, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1;
|
|
4631 #print "s = $s, gapcpunter = $gapcounter\n" if $prinkter == 1;
|
|
4632
|
|
4633 if (exists $starts{$basecounter}){
|
|
4634 my $locus = $starts{$basecounter};
|
|
4635 # print "locus identified = $locus\n" if $prinkter == 1;
|
|
4636 my @fields3 = split(/\t/,$locus);
|
|
4637 my $start = $fields3[$startcord];
|
|
4638 my $end = $fields3[$endcord];
|
|
4639 my $motif = $fields3[$motifcord];
|
|
4640 my $microsat = $fields3[$microsatcord];
|
|
4641 my @leftbracketpos = ();
|
|
4642 my @rightbracketpos = ();
|
|
4643 my $bracket_picker = 'no';
|
|
4644 my $leftbrackets=();
|
|
4645 my $rightbrackets = ();
|
|
4646 my $micro_cpy = $microsat;
|
|
4647 # print "microsat = $microsat\n" if $prinkter == 1;
|
|
4648 while($microsat =~ m/\[/g) {push(@leftbracketpos, (pos($microsat))); $leftbrackets = join("__",@leftbracketpos);$bracket_picker='yes';}
|
|
4649 while($microsat =~ m/\]/g) {push(@rightbracketpos, (pos($microsat))); $rightbrackets = join("__",@rightbracketpos);}
|
|
4650 $microsat =~ s/[\[\]\-\*]//g;
|
|
4651 # print "microsat = $microsat\n" if $prinkter == 1;
|
|
4652 my $human_search = join '-*', split //, $microsat;
|
|
4653 my $temp = substr($sequence, $poscounter-1);
|
|
4654 # print "with poscounter = $poscounter\n" if $prinkter == 1;
|
|
4655 my $search_result = ();
|
|
4656 my $posnow = ();
|
|
4657
|
|
4658 # print "for $line, temp $temp or human_search $human_search not defined\n" if !defined $temp || !defined $human_search;
|
|
4659 # <STDIN> if !defined $temp || !defined $human_search;
|
|
4660
|
|
4661 while ($temp =~ /($human_search)/gi){
|
|
4662 $search_result = $1;
|
|
4663 # $posnow = pos($temp);
|
|
4664 last;
|
|
4665 }
|
|
4666
|
|
4667 my @gapspos = ();
|
|
4668 next if !defined $search_result;
|
|
4669
|
|
4670 while($search_result =~ m/-/g) {push(@gapspos, (pos($search_result))); }
|
|
4671 my $gaps = join("__",@gapspos);
|
|
4672
|
|
4673 my $final_microsat = $search_result;
|
|
4674 if ($bracket_picker eq "yes"){
|
|
4675 $final_microsat = microsat_bracketer($search_result, $gaps,$leftbrackets,$rightbrackets);
|
|
4676 }
|
|
4677
|
|
4678 my $outsentence = join("\t",join ("\t",@fields3[0 ... $infocord]),$fields3[$typecord],$fields3[$motifcord],$gapcounter,$poscounter,$fields3[$strandcord],$poscounter + length($search_result) -1 ,$final_microsat);
|
|
4679
|
|
4680 if ($bracket_picker eq "yes") {
|
|
4681 $outsentence = $outsentence."\t".join("\t",@fields3[($motifcord+1) ... $#fields3]);
|
|
4682 }
|
|
4683 print OUT $outsentence,"\n";
|
|
4684 }
|
|
4685 }
|
|
4686 }
|
|
4687 }
|
|
4688 my $unusedkeys = scalar(keys %seen);
|
|
4689 # print INFO "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n";
|
|
4690 # print INFO "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n";
|
|
4691 # print INFO "microsatellites too short = $tooshort\n";
|
|
4692 # print INFO "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n";
|
|
4693
|
|
4694 #print "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n";
|
|
4695 #print "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n";
|
|
4696 #print "microsatellites too short = $tooshort\n";
|
|
4697 #print "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n";
|
|
4698 #print "unused keys = \n",join("\n", (keys %seen)),"\n";
|
|
4699 close (MATCH);
|
|
4700 close (SPUT);
|
|
4701 close (OUT);
|
|
4702 close (INFO);
|
|
4703 }
|
|
4704
|
|
4705 sub microsat_bracketer{
|
|
4706 # print "in bracketer: @_\n";
|
|
4707 my ($microsat, $gapspos, $leftbracketpos, $rightbracketpos) = @_;
|
|
4708 my @gaps = split(/__/,$gapspos);
|
|
4709 my @lefts = split(/__/,$leftbracketpos);
|
|
4710 my @rights = split(/__/,$rightbracketpos);
|
|
4711 my @new=();
|
|
4712 my $pure = $microsat;
|
|
4713 $pure =~ s/-//g;
|
|
4714 my $off = 0;
|
|
4715 my $finallength = length($microsat) + scalar(@lefts)+scalar(@rights);
|
|
4716 push(@gaps, 0);
|
|
4717 push(@lefts,0);
|
|
4718 push(@rights,0);
|
|
4719
|
|
4720 for my $i (1 ... $finallength){
|
|
4721 # print "1 current i = >$i<>, right = >$rights[0]< gap = $gaps[0] left = >$lefts[0]< and $rights[0] == $i\n";
|
|
4722 if($rights[0] == $i){
|
|
4723 # print "pushed a ]\n";
|
|
4724 push(@new, "]");
|
|
4725 shift(@rights);
|
|
4726 push(@rights,0);
|
|
4727 for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;}
|
|
4728 next;
|
|
4729 }
|
|
4730 if($gaps[0] == $i){
|
|
4731 # print "pushed a -\n";
|
|
4732 push(@new, "-");
|
|
4733 shift(@gaps);
|
|
4734 push(@gaps, 0);
|
|
4735 for my $j (0 ... scalar(@rights)-1) {$rights[$j]++;}
|
|
4736 for my $j (0 ... scalar(@lefts)-1) {$lefts[$j]++;}
|
|
4737
|
|
4738 next;
|
|
4739 }
|
|
4740 if($lefts[0] == $i){
|
|
4741 # print "pushed a [\n";
|
|
4742 push(@new, "[");
|
|
4743 shift(@lefts);
|
|
4744 push(@lefts,0);
|
|
4745 for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;}
|
|
4746 next;
|
|
4747 }
|
|
4748 else{
|
|
4749 my $pushed = substr($pure,$off,1);
|
|
4750 $off++;
|
|
4751 push(@new,$pushed );
|
|
4752 # print "pushed an alphabet, now new = @new, pushed = $pushed\n";
|
|
4753 next;
|
|
4754 }
|
|
4755 }
|
|
4756 my $returnmicrosat = join("",@new);
|
|
4757 # print "final microsat = $returnmicrosat \n";
|
|
4758 return($returnmicrosat);
|
|
4759 }
|
|
4760
|
|
4761 #xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx
|
|
4762
|
|
4763
|
|
4764 #xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx
|
|
4765 sub multiSpecies_orthFinder4{
|
|
4766 #print "IN multiSpecies_orthFinder4: @_\n";
|
|
4767 my @handles = ();
|
|
4768 #1 SEPT 30TH 2008
|
|
4769 #2 THIS CODE (multiSpecies_orthFinder4.pl) IS BEING MADE SO THAT IN THE REMOVAL OF MICROSATELLITES THAT ARE CLOSER TO EACH OTHER
|
|
4770 #3 THAN 50 BP (HE 50BP RADIUS OF EXCLUSION), WE ARE LOOKING ACCROSS ALIGNMENT BLOCKS.. AND NOT JUST LOOKING WITHIN THE ALIGNMENT BLOCKS. THIS WILL
|
|
4771 #4 POTENTIALLY REMOVE EVEN MORE MICROSATELLITES THAN BEFORE, BUT THIS WILL RESCUE THOSE MICROSATELLITES THAT WERE LOST
|
|
4772 #5 DUE TO OUR PREVIOUS REQUIREMENT FROM VERSION 3, THAT MICROSATELLITES THAT ARE CLOSER TO THE BOUNDARY THAN 25 BP NEED TO BE REMOVED
|
|
4773 #6 SUCH A REQUIREMENT WAS A CRUDE WAY TO IMPOSE THE ABOVE 50 BP RADIUS OF EXCLUSION ACCROSS THE ALIGNMENT BLOCKS WITHOUT ACTUALLY
|
|
4774 #7 CHECKING COORDINATES OF THE EXCLUDED MICROSATELLITES.
|
|
4775 #8 IN ORDER TO TAKE CARE OF THE CASES WHERE MICROSATELLITES ARE PRELIOUSLY CLOSE TO ENDS OF THE ALIGNMENT BLOCKS, WE IMPOSE HERE
|
|
4776 #9 A NEW REQUIREMENT THAT FOR A MICROSATELLITE TO BE CONSIDERED, ALL THE SPECIES NEED TO HAVE AT LEAST 10 BP OF NON-MICROSATELLITE SEQUENCE
|
|
4777 #10 ON EITHER SIDE OF IT.. GAPLESS. THIS INFORMATION IS STORED IN THE VARIABLE: $FLANK_SUPPORT. THIS PART, INSTEAD OF BEING INCLUDED IN
|
|
4778 #11 THIS CODE, WILL BE INCLUDED IN A NEW CODE THAT WE WILL BE WRITING AS PART OF THE PIPELINE: multiSpecies_microsatSetSelector.pl
|
|
4779
|
|
4780 #1 trial run:
|
|
4781 #2 perl ../../../codes/multiSpecies_orthFinder4.pl /gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr22.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt H.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:C.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:O.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:R.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:M.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2 orth22 hg18:panTro2:ponAbe2:rheMac2:calJac1 50
|
|
4782
|
|
4783 $prinkter=0;
|
|
4784
|
|
4785 #############
|
|
4786 my $CLUSTER_DIST = $_[4];
|
|
4787 #############
|
|
4788
|
|
4789
|
|
4790 my $aligns = $_[0];
|
|
4791 my @micros = split(/:/, $_[1]);
|
|
4792 my $orth = $_[2];
|
|
4793 #my $not_orth = "notorth";
|
|
4794 @tags = split(/:/, $_[3]);
|
|
4795
|
|
4796 $no_of_species=scalar(@tags);
|
|
4797 my $junkfile = $orth."_junk";
|
|
4798 #open(JUNK,">$junkfile");
|
|
4799
|
|
4800 #my $info = $output1."_info";
|
|
4801 #print "inputs are : \n"; foreach(@micros){print $_,"\n";}
|
|
4802 #print "info = @_\n";
|
|
4803
|
|
4804
|
|
4805 open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!";
|
|
4806 open (ORTH, ">$orth");
|
|
4807 my $output=$orth."_out";
|
|
4808 open (OUTP, ">$output");
|
|
4809
|
|
4810
|
|
4811 #open (NORTH, ">$not_orth");
|
|
4812 #open (INF, ">$info");
|
|
4813 my $i = 0;
|
|
4814 foreach my $path (@micros){
|
|
4815 $handles[$i] = IO::Handle->new();
|
|
4816 open ($handles[$i], "<$path") or die "Can't open microsat file $path : $!";
|
|
4817 $i++;
|
|
4818 }
|
|
4819
|
|
4820 #print "Opened files\n";
|
|
4821
|
|
4822
|
|
4823 $infocord = 2 + (4*$no_of_species) - 1;
|
|
4824 $typecord = 2 + (4*$no_of_species) + 1 - 1;
|
|
4825 $motifcord = $typecord + 1;
|
|
4826 $gapcord = $motifcord+1;
|
|
4827 $startcord = $gapcord + 1;
|
|
4828 $strandcord = $startcord + 1;
|
|
4829 $endcord = $strandcord + 1;
|
|
4830 $microsatcord = $endcord + 1;
|
|
4831 $sequencepos = 2 + (4*$no_of_species) + 1 -1 ;
|
|
4832 #$sequencepos = 17;
|
|
4833 # GENERATING HASHES CONTAINING CHIMP AND HUMAN DATA FROM ABOVE FILES
|
|
4834 #----------------------------------------------------------------------------------------------------------------
|
|
4835 my @hasharr = ();
|
|
4836 foreach my $path (@micros){
|
|
4837 open(READ, "<$path") or die "Cannot open file $path :$!";
|
|
4838 my %single_hash = ();
|
|
4839 my $key = ();
|
|
4840 my $counter = 0;
|
|
4841 while (my $line = <READ>){
|
|
4842 $counter++;
|
|
4843 # print $line;
|
|
4844 chomp $line;
|
|
4845 my @fields1 = split(/\t/,$line);
|
|
4846 if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
|
|
4847 $key = join("\t",$1, $2, $4, $5);
|
|
4848
|
|
4849 # print "key = : $key\n" if $prinkter == 1;
|
|
4850
|
|
4851 # print $line if $prinkter == 1;
|
|
4852 push (@{$single_hash{$key}},$line);
|
|
4853 }
|
|
4854 else{
|
|
4855 # print "microsat line incompatible\n";
|
|
4856 }
|
|
4857 }
|
|
4858 push @hasharr, {%single_hash};
|
|
4859 # print "@{$single_hash{$key}} \n";
|
|
4860 # print "done $path: counter = $counter\n" if $prinkter == 1;
|
|
4861 close READ;
|
|
4862 }
|
|
4863 # print "Done hashes\n";
|
|
4864 #----------------------------------------------------------------------------------------------------------------
|
|
4865 my $question=();
|
|
4866 #----------------------------------------------------------------------------------------------------------------
|
|
4867 my @contigstarts = ();
|
|
4868 my @contigends = ();
|
|
4869
|
|
4870 my %contigclusters = ();
|
|
4871 my %contigclustersFirstStartOnly=();
|
|
4872 my %contigclustersLastEndOnly=();
|
|
4873 my %contigclustersLastEndLengthOnly=();
|
|
4874 my %contigclustersFirstStartLengthOnly=();
|
|
4875 my %contigpath=();
|
|
4876 my $dotcounter = 0;
|
|
4877 while (my $line = <BO>){
|
|
4878 # print "x" x 60, "\n" if $prinkter == 1;
|
|
4879 $dotcounter++;
|
|
4880 # print "." if $dotcounter % 100 ==0;
|
|
4881 # print "\n" if $dotcounter % 5000 ==0;
|
|
4882 next if $line !~ /^[0-9]+/;
|
|
4883 # print $line if $prinkter == 1;
|
|
4884 chomp $line;
|
|
4885 my @fields2 = split(/\t/,$line);
|
|
4886 my $key2 = ();
|
|
4887 if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) {
|
|
4888 $key2 = join("\t",$1, $2, $4, $5);
|
|
4889 }
|
|
4890 else {
|
|
4891 # print "seq line $line incompatible\n" if $prinkter == 1;
|
|
4892 next;}
|
|
4893
|
|
4894
|
|
4895
|
|
4896
|
|
4897
|
|
4898
|
|
4899 my @sequences = ();
|
|
4900 for (0 ... $#tags){
|
|
4901 my $seq = <BO>;
|
|
4902 # print $seq;
|
|
4903 chomp $seq;
|
|
4904 push(@sequences , " ".$seq);
|
|
4905 }
|
|
4906 my @origsequences = @sequences;
|
|
4907 my $seqcopy = $sequences[0];
|
|
4908 my @strings = ();
|
|
4909 $seqcopy =~ s/[a-zA-Z]|-/x/g;
|
|
4910 my @string = split(/\s*/,$seqcopy);
|
|
4911
|
|
4912 for my $s (0 ... $#tags){
|
|
4913 $sequences[$s] =~ s/-//g;
|
|
4914 $sequences[$s] =~ s/[a-zA-Z]/x/g;
|
|
4915 # print "length of sequence = ",length($sequences[$s]),"\n";
|
|
4916 my @tempstring = split(/\s*/,$sequences[$s]);
|
|
4917 push(@strings, [@tempstring])
|
|
4918
|
|
4919 }
|
|
4920
|
|
4921 my @species_list = ();
|
|
4922 my @micro_count = 0;
|
|
4923 my @starthash = ();
|
|
4924 my $stopper = 1;
|
|
4925 my @endhash = ();
|
|
4926
|
|
4927 my @currentcontigstarts=();
|
|
4928 my @currentcontigends=();
|
|
4929 my @currentcontigchrs=();
|
|
4930
|
|
4931 for my $i (0 ... $#tags){
|
|
4932 # print "searching for : if exists hasharr: $i : $tags[$i] : $key2 \n" if $prinkter == 1;
|
|
4933 my @temparr = ();
|
|
4934
|
|
4935 if (exists $hasharr[$i]{$key2}){
|
|
4936 @temparr = @{$hasharr[$i]{$key2}};
|
|
4937
|
|
4938 $line =~ /$tags[$i]\s([a-zA-Z0-9_]+)\s([0-9]+)\s([0-9]+)/;
|
|
4939 ## print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9_])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1;
|
|
4940 # print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1;
|
|
4941 my $startkey = $1."_SK0SK_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1;
|
|
4942 my $endkey = $1."_EK0EK_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1;
|
|
4943 $contigstarts[$i]{$startkey}= $key2;
|
|
4944 $contigends[$i]{$endkey}= $key2;
|
|
4945 # print "confirming existance: \n" if $prinkter == 1;
|
|
4946 # print "present \n" if exists $contigends[$i]{$endkey} && $prinkter == 1;
|
|
4947 # print "absent \n" if !exists $contigends[$i]{$endkey} && $prinkter == 1;
|
|
4948 $currentcontigchrs[$i]=$1;
|
|
4949 $currentcontigstarts[$i]=$2;
|
|
4950 $currentcontigends[$i]=$3;
|
|
4951
|
|
4952 } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"}
|
|
4953 else {
|
|
4954 push (@starthash, {0 => "0"});
|
|
4955 push (@endhash, {0 => "0"});
|
|
4956 $currentcontigchrs[$i] = 0;
|
|
4957 next;
|
|
4958 }
|
|
4959 $stopper = 0;
|
|
4960 # print "exists: @temparr\n" if $prinkter == 1;
|
|
4961 push(@micro_count, scalar(@temparr));
|
|
4962 push(@species_list, [@temparr]);
|
|
4963 my @tempstart = (); my @tempend = ();
|
|
4964 my %localends = ();
|
|
4965 my %localhash = ();
|
|
4966 # print "---------------------------\n";
|
|
4967
|
|
4968 foreach my $templine (@temparr){
|
|
4969 # print "templine = $templine\n" if $prinkter == 1;
|
|
4970 my @tields = split(/\t/,$templine);
|
|
4971 my $start = $tields[$startcord]; # - $tields[$gapcord];
|
|
4972 my $end = $tields[$endcord]; #- $tields[$gapcord];
|
|
4973 my $realstart = $tields[$startcord]- $tields[$gapcord];
|
|
4974 my $gapsinmicrosat = ($tields[$microsatcord] =~ s/-/-/g);
|
|
4975 $gapsinmicrosat = 0 if $gapsinmicrosat !~ /[0-9]+/;
|
|
4976 # print "infocord = $infocord typecord = $typecord motifcord = $motifcord gapcord = $gapcord startcord = $startcord strandcord = $strandcord endcord = $endcord microsatcord = $microsatcord sequencepos = $sequencepos\n";
|
|
4977 my $realend = $tields[$endcord]- $tields[$gapcord]- $gapsinmicrosat;
|
|
4978 # print "real start = $realstart, realend = $realend \n";
|
|
4979 for my $pos ($realstart ... $realend){ $strings[$i][$pos] = $strings[$i][$pos].",".$i.":".$start."-".$end;}
|
|
4980 push(@tempstart, $start);
|
|
4981 push(@tempend, $end);
|
|
4982 $localhash{$start."-".$end} = $templine;
|
|
4983 }
|
|
4984 push @starthash, {%localhash};
|
|
4985 my $foundclusters =findClusters(join("!",@{$strings[$i]}), $CLUSTER_DIST);
|
|
4986
|
|
4987 # print "foundclusters = $foundclusters\n";
|
|
4988
|
|
4989 my @clusters = split(/_/,$foundclusters);
|
|
4990
|
|
4991 my $clustno = 0;
|
|
4992
|
|
4993 foreach my $cluster (@clusters) {
|
|
4994 my @constituenst = split(/,/,$cluster);
|
|
4995 # print "clusters returned: @constituenst\n" if $prinkter == 1;
|
|
4996 }
|
|
4997
|
|
4998 @string = split("_S0S_",stringPainter(join("_C0C_",@string),$foundclusters));
|
|
4999
|
|
5000
|
|
5001 }
|
|
5002 next if $stopper == 1;
|
|
5003
|
|
5004 # print colored ['blue'],"FINAL:\n" if $prinkter == 1;
|
|
5005 my $finalclusters =findClusters(join("!",@string), 1);
|
|
5006 # print "finalclusters = $finalclusters\n";
|
|
5007 # print colored ['blue'],"----------------------\n" if $prinkter == 1;
|
|
5008 my @clusters = split(/,/,$finalclusters);
|
|
5009 # print "@string\n" if $prinkter == 1;
|
|
5010 # print "@clusters\n" if $prinkter == 1;
|
|
5011 # print "------------------------------------------------------------------\n" if $prinkter == 1;
|
|
5012
|
|
5013 my $clustno = 0;
|
|
5014
|
|
5015 # foreach my $cluster (@clusters) {
|
|
5016 # my @constituenst = split(/,/,$cluster);
|
|
5017 # print "clusters returned: @constituenst\n";
|
|
5018 # }
|
|
5019
|
|
5020 next if (scalar @clusters == 0);
|
|
5021
|
|
5022 my @contigcluster=();
|
|
5023 my $clusterno=0;
|
|
5024 my @contigClusterstarts=();
|
|
5025 my @contigClusterends = ();
|
|
5026
|
|
5027 foreach my $clust (@clusters){
|
|
5028 # print "cluster: $clust\n";
|
|
5029 $clusterno++;
|
|
5030 my @localclust = split(/\./, $clust);
|
|
5031 my @result = ();
|
|
5032 my @starts = ();
|
|
5033 my @ends = ();
|
|
5034
|
|
5035 for my $i (0 ... $#localclust){
|
|
5036 # print "localclust[$i]: $localclust[$i]\n";
|
|
5037 my @pattern = split(/:/, $localclust[$i]);
|
|
5038 my @cords = split(/-/, $pattern[1]);
|
|
5039 push (@starts, $cords[0]);
|
|
5040 push (@ends, $cords[1]);
|
|
5041 }
|
|
5042
|
|
5043 my $extremestart = smallest_number(@starts);
|
|
5044 my $extremeend = largest_number(@ends);
|
|
5045 push(@contigClusterstarts, $extremestart);
|
|
5046 push(@contigClusterends, $extremeend);
|
|
5047 # print "cluster starts from $extremestart and ends at $extremeend \n" if $prinkter == 1 ;
|
|
5048
|
|
5049 foreach my $clustparts (@localclust){
|
|
5050 my @pattern = split(/:/, $clustparts);
|
|
5051 # print "printing from pattern: $pattern[1]: $starthash[$pattern[0]]{$pattern[1]}\n";
|
|
5052 push (@result, $starthash[$pattern[0]]{$pattern[1]});
|
|
5053 }
|
|
5054 push(@contigcluster, join("\t", @result));
|
|
5055 # print join("\t", @result),"<-result \n" if $prinkter == 1 ;
|
|
5056 }
|
|
5057
|
|
5058
|
|
5059 my $firstclusterstart = smallest_number(@contigClusterstarts);
|
|
5060 my $lastclusterend = largest_number(@contigClusterends);
|
|
5061
|
|
5062
|
|
5063 $contigclustersFirstStartOnly{$key2}=$firstclusterstart;
|
|
5064 $contigclustersLastEndOnly{$key2} = $lastclusterend;
|
|
5065 $contigclusters{$key2}=[ @contigcluster ];
|
|
5066 # print "currentcontigchr are @currentcontigchrs , firstclusterstart = $firstclusterstart, lastclusterend = $lastclusterend\n " if $prinkter == 1;
|
|
5067 for my $i (0 ... $#tags){
|
|
5068 #1 check if there exists adjacent alignment block wrt coordinates of this species.
|
|
5069 next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block..
|
|
5070 #2 no need to worry about proximity of anything in adjacent block!
|
|
5071
|
|
5072 #1 BELOW, the following is really to calclate the distance between the end coordinate of the
|
|
5073 #2 cluster and the end of the gap-free sequence of each species. this is so that if an
|
|
5074 #3 adjacent alignment block is found lateron, the exact distance between the potentially
|
|
5075 #4 adjacent microsat clusters can be found here. the exact start coordinate will be used
|
|
5076 #5 immediately below.
|
|
5077 # print "full sequence = $origsequences[$i] and its length = ",length($origsequences[$i])," \n" if $prinkter == 1;
|
|
5078
|
|
5079 my $species_startsubstring = substr($origsequences[$i], 0, $firstclusterstart);
|
|
5080 my $species_endsubstring = ();
|
|
5081
|
|
5082 if (length ($origsequences[$i]) <= $lastclusterend+1){ $species_endsubstring = "";}
|
|
5083 else{ $species_endsubstring = substr($origsequences[$i], $lastclusterend+1);}
|
|
5084
|
|
5085 # print "\nnot defined species_endsubstring...\n" if !defined $species_endsubstring && $prinkter == 1;
|
|
5086 # print "for species: $tags[$i]: \n" if $prinkter == 1;
|
|
5087
|
|
5088 $species_startsubstring =~ s/-| //g;
|
|
5089 $species_endsubstring =~ s/-| //g;
|
|
5090 $contigclustersLastEndLengthOnly{$key2}[$i]=length($species_endsubstring);
|
|
5091 $contigclustersFirstStartLengthOnly{$key2}[$i]=length($species_startsubstring);
|
|
5092
|
|
5093
|
|
5094
|
|
5095 # print "species_startsubstring = $species_startsubstring, and its length =",length($species_startsubstring)," \n" if $prinkter == 1;
|
|
5096 # print "species_endsubstring = $species_endsubstring, and its length =",length($species_endsubstring)," \n" if $prinkter == 1;
|
|
5097 # print "attaching to contigclustersLastEndOnly: $key2: $i\n" if $prinkter == 1;
|
|
5098
|
|
5099 # print "just confirming: $contigclustersLastEndLengthOnly{$key2}[$i] \n" if $prinkter == 1;
|
|
5100
|
|
5101 }
|
|
5102
|
|
5103
|
|
5104 }
|
|
5105 # print "\ndone the job of filling... \n";
|
|
5106 #///////////////////////////////////////////////////////////////////////////////////////
|
|
5107 #///////////////////////////////////////////////////////////////////////////////////////
|
|
5108 #///////////////////////////////////////////////////////////////////////////////////////
|
|
5109 #///////////////////////////////////////////////////////////////////////////////////////
|
|
5110 $prinkter=0;
|
|
5111 open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!";
|
|
5112
|
|
5113 my %clusteringpaths=();
|
|
5114 my %clustersholder=();
|
|
5115 my %foundkeys=();
|
|
5116 my %clusteringpathsRev=();
|
|
5117
|
|
5118
|
|
5119 my $totalcount=();
|
|
5120 my $founkeys_enteredcount=();
|
|
5121 my $transfered=0;
|
|
5122 my $complete_transfered=0;
|
|
5123 my $plain_transfered=0;
|
|
5124 my $existing_removed=0;
|
|
5125
|
|
5126 while (my $line = <BO>){
|
|
5127 # print "x" x 60, "\n" if $prinkter == 1;
|
|
5128 next if $line !~ /^[0-9]+/;
|
|
5129 #print $line;
|
|
5130 chomp $line;
|
|
5131 my @fields2 = split(/\t/,$line);
|
|
5132 my $key2 = ();
|
|
5133 if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)/ ) {
|
|
5134 $key2 = join("\t",$1, $2, $4, $5);
|
|
5135 }
|
|
5136
|
|
5137 else {
|
|
5138 # print "seq line $line incompatible\n";
|
|
5139 next;
|
|
5140 }
|
|
5141 # print "KEY = : $key2\n" if $prinkter == 1;
|
|
5142
|
|
5143
|
|
5144 my @currentcontigstarts=();
|
|
5145 my @currentcontigends=();
|
|
5146 my @currentcontigchrs=();
|
|
5147 my @clusters = ();
|
|
5148 my @clusterscopy=();
|
|
5149 if (exists $contigclusters{$key2}){
|
|
5150 @clusters = @{$contigclusters{$key2}};
|
|
5151 @clusterscopy=@clusters;
|
|
5152 for my $i (0 ... $#tags){
|
|
5153 # print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1;
|
|
5154 if ($line =~ /$tags[$i]\s([a-zA-Z0-9_]+)\s([0-9]+)\s([0-9]+)/){
|
|
5155 # print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1;
|
|
5156 my $startkey = $1."_S0E_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1;
|
|
5157 my $endkey = $1."_S0E_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1;
|
|
5158 $currentcontigchrs[$i]=$1;
|
|
5159 $currentcontigstarts[$i]=$2;
|
|
5160 $currentcontigends[$i]=$3;
|
|
5161 }
|
|
5162 else {
|
|
5163 $currentcontigchrs[$i] = 0;
|
|
5164 # print "no microsat clusters for $key2\n" if $prinkter == 1; next;
|
|
5165 }
|
|
5166 }
|
|
5167 } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"}
|
|
5168
|
|
5169 my @sequences = ();
|
|
5170 for (0 ... $#tags){
|
|
5171 my $seq = <BO>;
|
|
5172 # print $seq;
|
|
5173 chomp $seq;
|
|
5174 push(@sequences , " ".$seq);
|
|
5175 }
|
|
5176
|
|
5177 next if scalar @currentcontigchrs == 0;
|
|
5178
|
|
5179 # print "contigchrs= @currentcontigchrs \n" if $prinkter == 1;
|
|
5180 my %visitedcontigs=();
|
|
5181
|
|
5182 for my $i (0 ... $#tags){
|
|
5183 #1 check if there exists adjacent alignment block wrt coordinates of this species.
|
|
5184 next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block..
|
|
5185 #2 no need to worry about proximity of anything in adjacent block!
|
|
5186 @clusters=@clusterscopy;
|
|
5187 #1 BELOW, the following is really to calclate the distance between the end coordinate of the
|
|
5188 #2 cluster and the end of the gap-free sequence of each species. this is so that if an
|
|
5189 #3 adjacent alignment block is found lateron, the exact distance between the potentially
|
|
5190 #4 adjacent microsat clusters can be found here. the exact start coordinate will be used
|
|
5191 #5 immediately below.
|
|
5192 my $firstclusterstart = $contigclustersFirstStartOnly{$key2};
|
|
5193 my $lastclusterend = $contigclustersLastEndOnly{$key2};
|
|
5194
|
|
5195 my $key3 = $currentcontigchrs[$i]."_S0E_".($currentcontigstarts[$i]);
|
|
5196 # print "check if exists $key3 in contigends for $i\n" if $prinkter == 1;
|
|
5197
|
|
5198 if (exists($contigends[$i]{$key3}) && !exists $visitedcontigs{$contigends[$i]{$key3}}){
|
|
5199 $visitedcontigs{$contigends[$i]{$key3}} = $contigends[$i]{$key3}; #1 this array keeps track of adjacent contigs that we have already visited, thus saving computational time and potential redundancies#
|
|
5200 # print "just checking the hash visitedcontigs: ",$visitedcontigs{$contigends[$i]{$key3}} ,"\n" if $prinkter == 1;
|
|
5201
|
|
5202 #1 extract coordinates of the last cluster of this found alignment block
|
|
5203 # print "key of the found alignment block = ", $contigends[$i]{$key3},"\n" if $prinkter == 1;
|
|
5204 # print "we are trying to mine: contigclustersAllLastEndLengthOnly_raw: $contigends[$i]{$key3}: $i \n" if $prinkter == 1;
|
|
5205 # print "EXISTS\n" if exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1;
|
|
5206 # print "does NOT EXIST\n" if !exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1;
|
|
5207 my @contigclustersAllFirstStartLengthOnly_raw=@{$contigclustersFirstStartLengthOnly{$key2}};
|
|
5208 my @contigclustersAllLastEndLengthOnly_raw=@{$contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}};
|
|
5209 my @contigclustersAllFirstStartLengthOnly=(); my @contigclustersAllLastEndLengthOnly=();
|
|
5210
|
|
5211 for my $val (0 ... $#contigclustersAllFirstStartLengthOnly_raw){
|
|
5212 # print "val = $val\n" if $prinkter == 1;
|
|
5213 if (defined $contigclustersAllFirstStartLengthOnly_raw[$val]){
|
|
5214 push(@contigclustersAllFirstStartLengthOnly, $contigclustersAllFirstStartLengthOnly_raw[$val]) if $contigclustersAllFirstStartLengthOnly_raw[$val] =~ /[0-9]+/;
|
|
5215 }
|
|
5216 }
|
|
5217 # print "-----\n" if $prinkter == 1;
|
|
5218 for my $val (0 ... $#contigclustersAllLastEndLengthOnly_raw){
|
|
5219 # print "val = $val\n" if $prinkter == 1;
|
|
5220 if (defined $contigclustersAllLastEndLengthOnly_raw[$val]){
|
|
5221 push(@contigclustersAllLastEndLengthOnly, $contigclustersAllLastEndLengthOnly_raw[$val]) if $contigclustersAllLastEndLengthOnly_raw[$val] =~ /[0-9]+/;
|
|
5222 }
|
|
5223 }
|
|
5224
|
|
5225
|
|
5226 # print "our two arrays are: starts = <@contigclustersAllFirstStartLengthOnly> ......... and ends = <@contigclustersAllLastEndLengthOnly>\n" if $prinkter == 1;
|
|
5227 # print "the last cluster's end in that one is: ",smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly)," = ", smallest_number(@contigclustersAllFirstStartLengthOnly)," + ",smallest_number(@contigclustersAllLastEndLengthOnly),"\n" if $prinkter == 1;
|
|
5228
|
|
5229 # if ($contigclustersFirstStartLengthOnly{$key2}[$i] + $contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}[$i] < 50){
|
|
5230 if (smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly) < $CLUSTER_DIST){
|
|
5231 my @regurgitate = @{$contigclusters{$contigends[$i]{$key3}}};
|
|
5232 $regurgitate[$#regurgitate]=~s/\n//g;
|
|
5233 $regurgitate[$#regurgitate] = $regurgitate[$#regurgitate]."\t".shift(@clusters);
|
|
5234 delete $contigclusters{$contigends[$i]{$key3}};
|
|
5235 $contigclusters{$contigends[$i]{$key3}}=[ @regurgitate ];
|
|
5236 delete $contigclusters{$key2};
|
|
5237 $contigclusters{$key2}= [ @clusters ] if scalar(@clusters) >0;
|
|
5238 $contigclusters{$key2}= [ "" ] if scalar(@clusters) ==0;
|
|
5239
|
|
5240 if (scalar(@clusters) < 1){
|
|
5241 # print "$key2-> $clusteringpaths{$key2} in the loners\n" if exists $foundkeys{$key2};
|
|
5242 $clusteringpaths{$key2}=$contigends[$i]{$key3};
|
|
5243 $clusteringpathsRev{$contigends[$i]{$key3}}=$key2;
|
|
5244 print OUTP "$contigends[$i]{$key3} -> $clusteringpathsRev{$contigends[$i]{$key3}}\n";
|
|
5245 # print " clusteringpaths $key2 -> $contigends[$i]{$key3}\n";
|
|
5246 $founkeys_enteredcount-- if exists $foundkeys{$key2};
|
|
5247 $existing_removed++ if exists $foundkeys{$key2};
|
|
5248 # print "$key2->",@{$contigclusters{$key2}},"->>$foundkeys{$key2}\n" if exists $foundkeys{$key2} && $prinkter == 1;
|
|
5249 delete $foundkeys{$key2} if exists $foundkeys{$key2};
|
|
5250 $complete_transfered++;
|
|
5251 }
|
|
5252 else{
|
|
5253 print OUTP "$key2-> 0 not so lonely\n" if !exists $clusteringpathsRev{$key2};
|
|
5254 $clusteringpaths{$key2}=$key2 if !exists $clusteringpaths{$key2};
|
|
5255 $clusteringpathsRev{$key2}=0 if !exists $clusteringpathsRev{$key2};
|
|
5256
|
|
5257 $founkeys_enteredcount++ if !exists $foundkeys{$key2};
|
|
5258 $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2};
|
|
5259 # print "adding foundkeys entry $foundkeys{$key2}\n";
|
|
5260 $transfered++;
|
|
5261 }
|
|
5262 #$contigclusters{$key2}=[ @contigcluster ];
|
|
5263 }
|
|
5264 }
|
|
5265 else{
|
|
5266 # print "adjacent block with species $tags[$i] does not exist\n" if $prinkter == 1;
|
|
5267 $plain_transfered++;
|
|
5268 print OUTP "$key2-> 0 , going straight\n" if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2};
|
|
5269 $clusteringpaths{$key2}=$key2 if exists $contigclusters{$key2} && !exists $clusteringpaths{$key2};
|
|
5270 $clusteringpathsRev{$key2}=0 if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2};
|
|
5271 $founkeys_enteredcount++ if !exists $foundkeys{$key2} && exists $contigclusters{$key2};
|
|
5272 $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2} && exists $contigclusters{$key2};
|
|
5273 # print "adding foundkeys entry $foundkeys{$key2}\n";
|
|
5274
|
|
5275 }
|
|
5276 $totalcount++;
|
|
5277
|
|
5278 }
|
|
5279
|
|
5280
|
|
5281 }
|
|
5282 close BO;
|
|
5283 #close (NORTH);
|
|
5284 #///////////////////////////////////////////////////////////////////////////////////////
|
|
5285 #///////////////////////////////////////////////////////////////////////////////////////
|
|
5286 #///////////////////////////////////////////////////////////////////////////////////////
|
|
5287 #///////////////////////////////////////////////////////////////////////////////////////
|
|
5288
|
|
5289 my $founkeys_count=();
|
|
5290 my $nopath_count=();
|
|
5291 my $pathed_count=0;
|
|
5292 foreach my $key2 (keys %foundkeys){
|
|
5293 #print "x" x 60, "\n";
|
|
5294 # print "x" if $dotcounter % 100 ==0;
|
|
5295 # print "\n" if $dotcounter % 5000 ==0;
|
|
5296 $founkeys_count++;
|
|
5297 my $key = $key2;
|
|
5298 # print "$key2 -> $clusteringpaths{$key2}\n" if $prinkter == 1;
|
|
5299 if ($clusteringpaths{$key} eq $key){
|
|
5300 # print "printing hit the alignment block immediately... no path needed\n" if $prinkter == 1;
|
|
5301 $nopath_count++;
|
|
5302 delete $foundkeys{$key2};
|
|
5303 print ORTH join ("\n",@{$contigclusters{$key2}}),"\n";
|
|
5304 }
|
|
5305 else{
|
|
5306 my @pool=();
|
|
5307 my $key3=();
|
|
5308 $pathed_count++;
|
|
5309 # print "going reverse... clusteringpathsRev, $key = $clusteringpathsRev{$key}\n" if exists $clusteringpathsRev{$key} && $prinkter == 1;
|
|
5310 # print "going reverse... clusteringpathsRev $key does not exist\n" if !exists $clusteringpathsRev{$key} && $prinkter == 1;
|
|
5311 if ($clusteringpathsRev{$key} eq "0") {
|
|
5312 next;
|
|
5313 }
|
|
5314 else{
|
|
5315 my $yek3 = $clusteringpathsRev{$key};
|
|
5316 my $yek = $key;
|
|
5317 # print "caught in the middle of a path, now goin down from $yek to $yek3, which is $clusteringpathsRev{$key} \n" if $prinkter == 1;
|
|
5318 while ($yek3 ne "0"){
|
|
5319 # print "$yek->$yek3," if $prinkter == 1;
|
|
5320 $yek = $yek3;
|
|
5321 $yek3 = $clusteringpathsRev{$yek};
|
|
5322 }
|
|
5323 # print "\nfinally reached the end of path: $yek3, and the next in line is $yek, and its up-route is $clusteringpaths{$yek}\n" if $prinkter == 1;
|
|
5324 $key3 = $clusteringpaths{$yek};
|
|
5325 $key = $yek;
|
|
5326 }
|
|
5327
|
|
5328 # print "now that we are at bottom of the path, lets start climbing up again\n" if $prinkter == 1;
|
|
5329
|
|
5330 while($key ne $key3){
|
|
5331 # print "KEEY $key->$key3\n" if $prinkter == 1;
|
|
5332 # print "our contigcluster = @{$contigclusters{$key}}\n----------\n" if $prinkter == 1;
|
|
5333
|
|
5334 if (scalar(@{$contigclusters{$key}}) > 0) {push @pool, @{$contigclusters{$key}};
|
|
5335 # print "now pool = @pool\n" if $prinkter == 1;
|
|
5336 }
|
|
5337 delete $foundkeys{$key3};
|
|
5338 $key=$key3;
|
|
5339 $key3=$clusteringpaths{$key};
|
|
5340 }
|
|
5341 # print "\nfinally, adding the first element of path: @{$contigclusters{$key}}\n AND printing the contents:\n" if $prinkter == 1;
|
|
5342 my @firstcontig= @{$contigclusters{$key}};
|
|
5343 delete $foundkeys{$key2} if exists $foundkeys{$key2} ;
|
|
5344 delete $foundkeys{$key} if exists $foundkeys{$key};
|
|
5345
|
|
5346 unshift @pool, pop @firstcontig;
|
|
5347 # print join("\t",@pool),"\n" if $prinkter == 1;
|
|
5348 print ORTH join ("\n",@firstcontig),"\n" if scalar(@firstcontig) > 0;
|
|
5349 print ORTH join ("\t",@pool),"\n";
|
|
5350 # join();
|
|
5351 }
|
|
5352
|
|
5353 }
|
|
5354 #close (NORTH);
|
|
5355 # print "founkeys_entered =$founkeys_enteredcount, plain_transfered=$plain_transfered,existing_removed=$existing_removed,founkeys_count =$founkeys_count, nopath_count =$nopath_count, transfered = $transfered, complete_transfered = $complete_transfered, totalcount = $totalcount, pathed=$pathed_count\n" if $prinkter == 1;
|
|
5356 close (BO);
|
|
5357 close (ORTH);
|
|
5358 close (OUTP);
|
|
5359 return 1;
|
|
5360
|
|
5361 }
|
|
5362 sub stringPainter{
|
|
5363 my @string = split(/_C0C_/,$_[0]);
|
|
5364 # print $_[0], " <- in stringPainter\n";
|
|
5365 # print $_[1], " <- in clusters\n";
|
|
5366
|
|
5367 my @clusters = split(/,/, $_[1]);
|
|
5368 for my $i (0 ... $#clusters){
|
|
5369 my $cluster = $clusters[$i];
|
|
5370 # print "cluster = $cluster\n";
|
|
5371 my @parts = split(/\./,$cluster);
|
|
5372 my @cord = split(/:|-/,shift(@parts));
|
|
5373 my $minstart = $cord[1];
|
|
5374 my $maxend = $cord[2];
|
|
5375 # print "minstart = $minstart , maxend = $maxend\n";
|
|
5376
|
|
5377 for my $j (0 ... $#parts){
|
|
5378 # print "oing thri $parts[$j]\n";
|
|
5379 my @cord = split(/:|-/,$parts[$j]);
|
|
5380 $minstart = $cord[1] if $cord[1] < $minstart;
|
|
5381 $maxend = $cord[2] if $cord[2] > $maxend;
|
|
5382 }
|
|
5383 # print "minstart = $minstart , maxend = $maxend\n";
|
|
5384 for my $pos ($minstart ... $maxend){ $string[$pos] = $string[$pos].",".$cluster;}
|
|
5385
|
|
5386
|
|
5387 }
|
|
5388 # print "@string <-done from function stringPainter\n";
|
|
5389 return join("_S0S_",@string);
|
|
5390 }
|
|
5391
|
|
5392 sub findClusters{
|
|
5393 my $continue = 0;
|
|
5394 my @mapped_clusters = ();
|
|
5395 my $clusterdist = $_[1];
|
|
5396 my $previous = 'x';
|
|
5397 my @localcluster = ();
|
|
5398 my $cluster_starts = ();
|
|
5399 my $cluster_ends = ();
|
|
5400 my $localcluster_start = ();
|
|
5401 my $localcluster_end = ();
|
|
5402 my @record_cluster = ();
|
|
5403 my @string = split(/\!/, $_[0]);
|
|
5404 my $zerolength=0;
|
|
5405
|
|
5406 for my $pos_pos (1 ... $#string){
|
|
5407 my $pos = $string[$pos_pos];
|
|
5408 # print $pos, "\n";
|
|
5409 if ($continue == 0 && $pos eq "x") {next;}
|
|
5410
|
|
5411 if ($continue == 1 && $pos eq "x" && $zerolength <= $clusterdist){
|
|
5412 if ($zerolength == 0) {$localcluster_end = $pos_pos-1};
|
|
5413 $zerolength++;
|
|
5414 $continue = 1;
|
|
5415 }
|
|
5416
|
|
5417 if ($continue == 1 && $pos eq "x" && $zerolength > $clusterdist) {
|
|
5418 $zerolength = 0;
|
|
5419 $continue = 0;
|
|
5420 my %seen;
|
|
5421 my @uniqed = grep !$seen{$_}++, @localcluster;
|
|
5422 # print "caught cluster : @uniqed \n";
|
|
5423 push(@mapped_clusters, [@uniqed]);
|
|
5424 # print "clustered:\n@uniqed\n";
|
|
5425 @localcluster = ();
|
|
5426 @record_cluster = ();
|
|
5427
|
|
5428 }
|
|
5429
|
|
5430 if ($pos ne "x"){
|
|
5431 $zerolength = 0;
|
|
5432 $continue = 1;
|
|
5433 $pos =~ s/x,//g;
|
|
5434 my @entries = split(/,/,$pos);
|
|
5435 $localcluster_end = 0;
|
|
5436 $localcluster_start = 0;
|
|
5437 push(@record_cluster,$pos);
|
|
5438
|
|
5439 if ($continue == 0){
|
|
5440 @localcluster = ();
|
|
5441 @localcluster = (@localcluster, @entries);
|
|
5442 $localcluster_start = $pos_pos;
|
|
5443 }
|
|
5444
|
|
5445 if ($continue == 1 ) {
|
|
5446 @localcluster = (@localcluster, @entries);
|
|
5447 }
|
|
5448 }
|
|
5449 }
|
|
5450
|
|
5451 if (scalar(@localcluster) > 0){
|
|
5452 my %seen;
|
|
5453 my @uniqed = grep !$seen{$_}++, @localcluster;
|
|
5454 # print "caught cluster : @uniqed \n";
|
|
5455 push(@mapped_clusters, [@uniqed]);
|
|
5456 # print "clustered:\n@uniqed\n";
|
|
5457 @localcluster = ();
|
|
5458 @record_cluster = ();
|
|
5459 }
|
|
5460
|
|
5461 my @returner = ();
|
|
5462
|
|
5463 foreach my $clust (@mapped_clusters){
|
|
5464 my @localclust = @$clust;
|
|
5465 my @result = ();
|
|
5466 foreach my $clustparts (@localclust){
|
|
5467 push(@result,$clustparts);
|
|
5468 }
|
|
5469 push(@returner , join(".",@result));
|
|
5470 }
|
|
5471 # print "returnig: ", join(",",@returner), "\n";
|
|
5472 return join(",",@returner);
|
|
5473 }
|
|
5474 #xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx
|
|
5475
|
|
5476 #xxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
|
5477
|
|
5478 sub MakeTrees{
|
|
5479 my $tree = $_[0];
|
|
5480 my @parts=($tree);
|
|
5481 # my @parts=();
|
|
5482
|
|
5483 while (1){
|
|
5484 $tree =~ s/^\(//g;
|
|
5485 $tree =~ s/\)$//g;
|
|
5486 my @arr = ();
|
|
5487
|
|
5488 if ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){
|
|
5489 @arr = $tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/;
|
|
5490 $tree = $2;
|
|
5491 push @parts, $tree;
|
|
5492 }
|
|
5493 elsif ($tree =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){
|
|
5494 @arr = $tree =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/;
|
|
5495 $tree = $1;
|
|
5496 push @parts, $tree;
|
|
5497 }
|
|
5498 elsif ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){
|
|
5499 last;
|
|
5500 }
|
|
5501 }
|
|
5502 return @parts;
|
|
5503 }
|
|
5504
|
|
5505 #xxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
|
5506
|
|
5507 sub qualityFilter{
|
|
5508 my $unmaskedorthfile = $_[0];
|
|
5509 my $seqfile = $_[1];
|
|
5510 my $maskedorthfile = $_[2];
|
|
5511 my $filteredout = $maskedorthfile."_residue";
|
|
5512 open (PMORTH, "<$unmaskedorthfile") or die "Cannot open unmaskedorthfile file: $unmaskedorthfile: $!";
|
|
5513
|
|
5514 my %keyhash = ();
|
|
5515
|
|
5516 while (my $line = <PMORTH>){
|
|
5517 my $key = join("\t", $1,$2,$3,$4) if $line =~ /($focalspec)\s+([a-zA-Z0-9\-_]+)\s+([0-9]+)\s+([0-9]+)/;
|
|
5518 push @{$keyhash{$key}}, $line;
|
|
5519 }
|
|
5520
|
|
5521 open (SEQ, "<$seqfile") or die "Cannot open seqfile file: $seqfile: $!";
|
|
5522 open (MORTH, ">$maskedorthfile") or die "Cannot open maskedorthfile file: $maskedorthfile: $!";
|
|
5523 open (RES, ">$filteredout") or die "Cannot open filteredout file: $filteredout: $!";
|
|
5524
|
|
5525
|
|
5526
|
|
5527 while (my $line = <SEQ>){
|
|
5528 chomp $line;
|
|
5529 if ($line =~ /($focalspec)\s+([a-zA-Z0-9\-_]+)\s+([0-9]+)\s+([0-9]+)/){
|
|
5530 my $key = join("\t", $1,$2,$3,$4);
|
|
5531 next if !exists $keyhash{$key};
|
|
5532 my @orths = @{$keyhash{$key}} if exists $keyhash{$key};
|
|
5533 delete $keyhash{$key};
|
|
5534
|
|
5535 my $sine = <SEQ>;
|
|
5536
|
|
5537 foreach my $orth (@orths){
|
|
5538 #print "-----------------------------------------------------------------\n";
|
|
5539 #print $orth;
|
|
5540 my $orthcopy = $orth;
|
|
5541 $orth =~ s/^>//;
|
|
5542 my @parts = split(/>/,$orth);
|
|
5543
|
|
5544 my @starts = ();
|
|
5545 my @ends = ();
|
|
5546
|
|
5547 foreach my $part (@parts){
|
|
5548 my $no_of_species = adjustCoordinates($part);
|
|
5549 my @pields = split(/\t/,$part);
|
|
5550
|
|
5551 # print "pields = @pields .. no_of_species = $no_of_species .. startcord = $pields[$startcord]\n";
|
|
5552
|
|
5553 push @starts, $pields[$startcord];
|
|
5554 push @ends, $pields[$endcord];
|
|
5555 }
|
|
5556
|
|
5557 #print "starts = @starts ... ends = @ends\n";
|
|
5558
|
|
5559 my $leftend = smallest_number(@starts)-10;
|
|
5560 my $rightend = largest_number(@ends)+10;
|
|
5561
|
|
5562 my $maskarea = substr($sine, $leftend, $rightend-$leftend+1);
|
|
5563 print RES $orth if $maskarea =~ /#/;
|
|
5564
|
|
5565
|
|
5566 next if $maskarea =~ /#/;
|
|
5567
|
|
5568 print MORTH $orthcopy;
|
|
5569 }
|
|
5570 }
|
|
5571 else{
|
|
5572 next;
|
|
5573 }
|
|
5574
|
|
5575
|
|
5576 }
|
|
5577
|
|
5578 # print "UNDONE: ", scalar(keys %keyhash),"\n";
|
|
5579 # print MORTH "UNDONE: ", scalar(keys %keyhash),"\n";
|
|
5580
|
|
5581 }
|
|
5582
|
|
5583 sub adjustCoordinates{
|
|
5584 my $line = $_[0];
|
|
5585 my $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)|(scaffold[0-9a-zA-Z\._\-]+)|(supercontig[0-9a-zA-Z\._\-]+)/x/ig;
|
|
5586 my @got = ($line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)/x/g);
|
|
5587 # print "line = $line\n";
|
|
5588 $infocord = 2 + (4*$no_of_species) - 1;
|
|
5589 $typecord = 2 + (4*$no_of_species) + 1 - 1;
|
|
5590 $motifcord = 2 + (4*$no_of_species) + 2 - 1;
|
|
5591 $gapcord = $motifcord+1;
|
|
5592 $startcord = $gapcord+1;
|
|
5593 $strandcord = $startcord+1;
|
|
5594 $endcord = $strandcord + 1;
|
|
5595 $microsatcord = $endcord + 1;
|
|
5596 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ;
|
|
5597 $interr_poscord = $microsatcord + 3;
|
|
5598 $no_of_interruptionscord = $microsatcord + 4;
|
|
5599 $interrcord = $microsatcord + 2;
|
|
5600 # print "$line\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\n" if $line !~ /calJac/i;
|
|
5601 return $no_of_species;
|
|
5602 }
|
|
5603
|
|
5604
|
|
5605
|
|
5606
|