Mercurial > repos > devteam > multispecies_orthologous_microsats
comparison multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl @ 0:275433d3a395 draft
Uploaded tool tarball.
author | devteam |
---|---|
date | Wed, 25 Sep 2013 11:26:57 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:275433d3a395 |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use warnings; | |
4 use Term::ANSIColor; | |
5 use File::Basename; | |
6 use IO::Handle; | |
7 use Cwd; | |
8 use File::Path; | |
9 use vars qw($distance @thresholds @tags $species_set @allspecies $printer $treeSpeciesNum $focalspec $mergestarts $mergeends $mergemicros $interrtypecord $microscanned $interrcord $interr_poscord $no_of_interruptionscord $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species $gapcord $prinkter); | |
10 use File::Path qw(make_path remove_tree); | |
11 use File::Temp qw/ tempfile tempdir /; | |
12 my $tdir = tempdir( CLEANUP => 1 ); | |
13 chdir $tdir; | |
14 my $dir = getcwd; | |
15 #print "dir = $dir\n"; | |
16 | |
17 #$ENV{'PATH'} .= ':' . dirname($0); | |
18 my $date = `date`; | |
19 | |
20 my ($mafile, $mafile_sputt, $orthfile, $threshold_array, $allspeciesin, $tree_definition_all, $separation) = @ARGV; | |
21 if (!$mafile or !$mafile_sputt or !$orthfile or !$threshold_array or !$separation or !$tree_definition_all or !$allspeciesin) { die "missing arguments\n"; } | |
22 | |
23 $tree_definition_all =~ s/\s+//g; | |
24 $threshold_array =~ s/\s+//g; | |
25 $allspeciesin =~ s/\s+//g; | |
26 #------------------------------------------------------------------------------- | |
27 # WHICH SPUTNIK USED? | |
28 my $sputnikpath = (); | |
29 $sputnikpath = "sputnik_lowthresh_MATCH_MIN_SCORE3" ; | |
30 #$sputnikpath = "/Users/ydk/work/rhesus_microsat/codes/./sputnik_Mac-PowerPC"; | |
31 #print "sputnik_Mac-PowerPC non-existant\n" if !-e $sputnikpath; | |
32 #exit if !-e $sputnikpath; | |
33 #$sputnikpath = "bx-sputnik" ; | |
34 #print "ARGV input = @ARGV\n"; | |
35 #print "ARGV input :\n mafile=$mafile\n orthfile=$orthfile\n threshold_array=$threshold_array\n species_set=$species_set\n tree_definition=$tree_definition\n separation=$separation\n"; | |
36 #------------------------------------------------------------------------------- | |
37 # RUNFILE | |
38 #------------------------------------------------------------------------------- | |
39 $distance = 1; #bp | |
40 $distance++; | |
41 my @tree_definitions=MakeTrees($tree_definition_all); | |
42 my $allspeciesset = $tree_definition_all; | |
43 $allspeciesset =~ s/[\(\) ]+//g; | |
44 @allspecies = split(/,/,$allspeciesset); | |
45 | |
46 my @outputfiles = (); | |
47 my $round = 0; | |
48 #my $tdir = tempdir( CLEANUP => 0 ); | |
49 #chdir $tdir; | |
50 | |
51 foreach my $tree_definition (@tree_definitions){ | |
52 my @commas = ($tree_definition =~ /,/g) ; | |
53 #print "commas = @commas\n"; <STDIN>; | |
54 next if scalar(@commas) <= 1; | |
55 #print "species_set = $species_set\n"; | |
56 $treeSpeciesNum = scalar(@commas) + 1; | |
57 $species_set = $tree_definition; | |
58 $species_set =~ s/[\)\( ;]+//g; | |
59 #print "species_set = $species_set\n"; <STDIN>; | |
60 | |
61 $round++; | |
62 #------------------------------------------------------------------------------- | |
63 # MICROSATELLITE THRESHOLD SETTINGS (LENGTH, BP) | |
64 $threshold_array=~ s/,/_/g; | |
65 my @thresharr = split("_",$threshold_array); | |
66 @thresholds=@thresharr; | |
67 #my $threshold_array = join("_",($mono_threshold, $di_threshold, $tri_threshold, $tetra_threshold)); | |
68 #print "current dit=$dir\n"; | |
69 #------------------------------------------------------------------------------- | |
70 # CREATE AXT FILES IN FORWARD AND REVERSE ORDERS IF NECESSARY | |
71 my @chrfiles=(); | |
72 | |
73 #my $mafile = "/Users/ydk/work/rhesus_microsat/results/galay/align.txt"; #$ARGV[0]; | |
74 my $chromt=int(rand(10000)); | |
75 my $p_chr=$chromt; | |
76 | |
77 | |
78 my @exactspeciesset_unarranged = split(/,/,$species_set); | |
79 $tree_definition=~s/[\)\(, ]/\t/g; | |
80 my @treespecies=split(/\t+/,$tree_definition); | |
81 my @exactspecies=(); | |
82 | |
83 foreach my $spec (@treespecies){ | |
84 foreach my $espec (@exactspeciesset_unarranged){ | |
85 push @exactspecies, $spec if $spec eq $espec; | |
86 } | |
87 } | |
88 #print "exactspecies=@exactspecies\n"; | |
89 $focalspec = $exactspecies[0]; | |
90 my $arranged_species_set=join(".",@exactspecies); | |
91 my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt"); | |
92 my $chr_name_sputt = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt_sputt"); | |
93 #print "sending to maftoAxt_multispecies: $mafile, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n"; | |
94 maftoAxt_multispecies($mafile, $tree_definition, $chr_name, $species_set); | |
95 maftoAxt_multispecies($mafile_sputt, $tree_definition, $chr_name_sputt, $species_set); | |
96 #print "done maf to axt conversion\n"; | |
97 my $reverse_chr_name = join(".",("chr".$p_chr."r"),$arranged_species_set, "net", "axt"); | |
98 artificial_axdata_inverter ($chr_name, $reverse_chr_name); | |
99 #print "reverse_chr_name=$reverse_chr_name\n"; | |
100 #------------------------------------------------------------------------------- | |
101 # FIND THE CORRESPONDING CHIMP CHROMOSOME FROM FILE ORTp_chrS.TXT | |
102 foreach my $direct ("reverse_direction","forward_direction"){ | |
103 $p_chr=$chromt; | |
104 #print "direction = $direct\n"; | |
105 $p_chr = $p_chr."r" if $direct eq "reverse_direction"; | |
106 $p_chr = $p_chr if $direct eq "forward_direction"; | |
107 my $config = $species_set; | |
108 $config=~s/,/./g; | |
109 my @orgs = split(/\./,$arranged_species_set); | |
110 #print "ORGS= @orgs\n"; | |
111 my @tag=@orgs; | |
112 | |
113 | |
114 my $tags = join(",", @tag); | |
115 my @tags=@tag; | |
116 chomp $p_chr; | |
117 $tags = join("_", split(/,/, $tags)); | |
118 my $pchr = "chr".$p_chr; | |
119 | |
120 my $ptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1])."-".$threshold_array; | |
121 my @sp_tags = (); | |
122 | |
123 # print "$ptag _ orthfile\n"; <STDIN>; | |
124 #print "orgs=@orgs, pchr=$pchr, hence, ptag = $ptag\n"; | |
125 foreach my $sp (@tag){ | |
126 push(@sp_tags, ($sp.".".$ptag)); | |
127 } | |
128 | |
129 my $preptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1]); | |
130 my @presp_tags = (); | |
131 | |
132 foreach my $sp (@tag){ | |
133 push(@presp_tags, ($sp.".".$preptag)); | |
134 } | |
135 | |
136 my $resultdir = ""; | |
137 my $orthdir = ""; | |
138 my $filtereddir = ""; | |
139 my $pipedir = ""; | |
140 | |
141 my @title_queries = (); | |
142 push(@title_queries, "^[0-9]+"); | |
143 my $sep="\\s"; | |
144 for my $or (0 ... $#orgs){ | |
145 my $title = join($sep, ($orgs[$or], "[A-Za-z_]+[0-9a-zA-Z]+", "[0-9]+", "[0-9]+", "[\\-\\+]")); | |
146 #$title =~ s/chr\\+\\s+\+/chr/g; | |
147 push(@title_queries, $title); | |
148 } | |
149 my $title_query = join($sep, @title_queries); | |
150 #print "title_queries=@title_queries\n"; | |
151 #print "query = >$title_query<\n"; | |
152 #print "orgs = @orgs\n"; | |
153 #------------------------------------------------------------------------------- | |
154 # GET AXTNET FILES, EDIT THEM AND SPLIT THEM INTO HUMAN AND CHIMP INPUT FILES | |
155 my $t1input = $pchr.".".$arranged_species_set.".net.axt"; | |
156 | |
157 my @t1outputs = (); | |
158 | |
159 foreach my $sp (@presp_tags){ | |
160 push(@t1outputs, $sp."_gap_op"); | |
161 } | |
162 | |
163 | |
164 | |
165 multi_species_t1($t1input,$tags,(join(",", @t1outputs)), $title_query); | |
166 #print "t1outputs=@t1outputs\n"; | |
167 #print "done t1\n"; <STDIN>; | |
168 #------------------------------------------------------------------------------- | |
169 #START T2.PL | |
170 | |
171 my $stag = (); my $tag1 = (); my $tag2 = (); my $schrs = (); | |
172 | |
173 for my $t (0 ... scalar(@tags)-1){ | |
174 multi_species_t2($t1outputs[$t], $tag[$t]); | |
175 } | |
176 #------------------------------------------------------------------------------- | |
177 #START T2.2.PL | |
178 | |
179 my @temp_tags = @tag; | |
180 | |
181 foreach my $sp (@presp_tags){ | |
182 my $t2input = $sp."_nogap_op_unrand"; | |
183 multi_species_t2_2($t2input, shift(@temp_tags)); | |
184 } | |
185 undef (@temp_tags); | |
186 | |
187 #------------------------------------------------------------------------------- | |
188 #START SPUTNIK | |
189 | |
190 my @jobIDs = (); | |
191 @temp_tags = @tag; | |
192 my @sput_filelist = (); | |
193 | |
194 foreach my $sp (@presp_tags){ | |
195 #print "sp = $sp\n"; | |
196 my $sputnikoutput = $pipedir.$sp."_sput_op0"; | |
197 my $sputnikinput = $pipedir.$sp."_nogap_op_unrand"; | |
198 push(@sput_filelist, $sputnikinput); | |
199 my $sputnikcommand = $sputnikpath." ".$sputnikinput." > ".$sputnikoutput; | |
200 # print "$sputnikcommand\n"; | |
201 my @sputnikcommand_system = $sputnikcommand; | |
202 system(@sputnikcommand_system); | |
203 } | |
204 | |
205 #------------------------------------------------------------------------------- | |
206 #START SPUTNIK OUTPUT CORRECTOR | |
207 | |
208 foreach my $sp (@presp_tags){ | |
209 my $corroutput = $pipedir.$sp."_sput_op1"; | |
210 my $corrinput = $pipedir.$sp."_sput_op0"; | |
211 sputnikoutput_corrector($corrinput,$corroutput); | |
212 | |
213 my $t4output = $pipedir.$sp."_sput_op2"; | |
214 multi_species_t4($corroutput,$t4output); | |
215 | |
216 my $t5output = $pipedir.$sp."_sput_op3"; | |
217 multi_species_t5($t4output,$t5output); | |
218 #print "done t5.pl for $sp\n"; | |
219 | |
220 my $t6output = $pipedir.$sp."_sput_op4"; | |
221 multi_species_t6($t5output,$t6output,scalar(@orgs)); | |
222 } | |
223 #------------------------------------------------------------------------------- | |
224 #START T9.PL FOR T10.PL AND FOR INTERRUPTED HUNTING | |
225 | |
226 foreach my $sp (@presp_tags){ | |
227 my $t9output = $pipedir.$sp."_gap_op_unrand_match"; | |
228 my $t9sequence = $pipedir.$sp."_gap_op_unrand2"; | |
229 my $t9micro = $pipedir.$sp."_sput_op4"; | |
230 t9($t9micro,$t9sequence,$t9output); | |
231 | |
232 my $t9output2 = $pipedir.$sp."_nogap_op_unrand2_match"; | |
233 my $t9sequence2 = $pipedir.$sp."_nogap_op_unrand2"; | |
234 t9($t9micro,$t9sequence2,$t9output2); | |
235 } | |
236 #print "done both t9.pl for all orgs\n"; | |
237 | |
238 #------------------------------------------------------------------------------- | |
239 # FIND COMPOUND MICROSATELLITES | |
240 | |
241 @jobIDs = (); | |
242 my $species_counter = 0; | |
243 | |
244 foreach my $sp (@presp_tags){ | |
245 my $simple_microsats=$pipedir.$sp."_sput_op4_simple"; | |
246 my $compound_microsats=$pipedir.$sp."_sput_op4_compound"; | |
247 my $input_micro = $pipedir.$sp."_sput_op4"; | |
248 my $input_seq = $pipedir.$sp."_nogap_op_unrand2_match"; | |
249 multiSpecies_compound_microsat_hunter3($input_micro,$input_seq,$simple_microsats,$compound_microsats,$orgs[$species_counter], scalar(@sp_tags), $threshold_array ); | |
250 $species_counter++; | |
251 } | |
252 | |
253 #------------------------------------------------------------------------------- | |
254 # READING AND FILTERING SIMPLE MICROSATELLITES | |
255 my $spcounter2=0; | |
256 foreach my $sp (@sp_tags){ | |
257 my $presp = $presp_tags[$spcounter2]; | |
258 $spcounter2++; | |
259 my $simple_microsats=$pipedir.$presp."_sput_op4_simple"; | |
260 my $simple_filterout = $pipedir.$sp."_sput_op4_simple_filtered"; | |
261 my $simple_residue = $pipedir.$sp."_sput_op4_simple_residue"; | |
262 multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags)); | |
263 } | |
264 | |
265 #------------------------------------------------------------------------------- | |
266 # ANALYZE COMPOUND MICROSATELLITES FOR BEING INTERRUPTED MICROSATS | |
267 | |
268 $species_counter = 0; | |
269 foreach my $sp (@sp_tags){ | |
270 my $presp = $presp_tags[$species_counter]; | |
271 my $compound_microsats = $pipedir.$presp."_sput_op4_compound"; | |
272 my $analyzed_simple_microsats=$pipedir.$presp."_sput_op4_compound_interrupted"; | |
273 my $analyzed_compound_microsats=$pipedir.$presp."_sput_op4_compound_pure"; | |
274 my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match"; | |
275 multiSpecies_compound_microsat_analyzer($compound_microsats,$seq_file,$analyzed_simple_microsats,$analyzed_compound_microsats,$orgs[$species_counter], scalar(@sp_tags)); | |
276 $species_counter++; | |
277 } | |
278 #------------------------------------------------------------------------------- | |
279 # REANALYZE COMPOUND MICROSATELLITES FOR PRESENCE OF SIMPLE ONES WITHIN THEM.. | |
280 $species_counter = 0; | |
281 | |
282 foreach my $sp (@sp_tags){ | |
283 my $presp = $presp_tags[$species_counter]; | |
284 my $compound_microsats = $pipedir.$presp."_sput_op4_compound_pure"; | |
285 my $compound_interrupted = $pipedir.$presp."_sput_op4_compound_clarifiedInterrupted"; | |
286 my $compound_compound = $pipedir.$presp."_sput_op4_compound_compound"; | |
287 my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match"; | |
288 multiSpecies_compoundClarifyer($compound_microsats,$seq_file,$compound_interrupted,$compound_compound,$orgs[$species_counter], scalar(@sp_tags), "2_4_6_8", "3_4_6_8", "2_4_6_8"); | |
289 $species_counter++; | |
290 } | |
291 #------------------------------------------------------------------------------- | |
292 # READING AND FILTERING SIMPLE AND COMPOUND MICROSATELLITES | |
293 $species_counter = 0; | |
294 | |
295 foreach my $sp (@sp_tags){ | |
296 my $presp = $presp_tags[$species_counter]; | |
297 | |
298 my $simple_microsats=$pipedir.$presp."_sput_op4_compound_clarifiedInterrupted"; | |
299 my $simple_filterout = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered"; | |
300 my $simple_residue = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_residue"; | |
301 multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags)); | |
302 | |
303 my $simple_microsats2 = $pipedir.$presp."_sput_op4_compound_interrupted"; | |
304 my $simple_filterout2 = $pipedir.$sp."_sput_op4_compound_interrupted_filtered"; | |
305 my $simple_residue2 = $pipedir.$sp."_sput_op4_compound_interrupted_residue"; | |
306 multiSpecies_filtering_interrupted_microsats($simple_microsats2, $simple_filterout2, $simple_residue2,$threshold_array,$threshold_array,scalar(@sp_tags)); | |
307 | |
308 my $compound_microsats=$pipedir.$presp."_sput_op4_compound_compound"; | |
309 my $compound_filterout = $pipedir.$sp."_sput_op4_compound_compound_filtered"; | |
310 my $compound_residue = $pipedir.$sp."_sput_op4_compound_compound_residue"; | |
311 multispecies_filtering_compound_microsats($compound_microsats, $compound_filterout, $compound_residue,$threshold_array,$threshold_array,scalar(@sp_tags)); | |
312 $species_counter++; | |
313 } | |
314 #print "done filtering both simple and compound microsatellites \n"; | |
315 | |
316 #------------------------------------------------------------------------------- | |
317 | |
318 my @combinedarray = (); | |
319 my @combinedarray_indicators = ("mononucleotide", "dinucleotide", "trinucleotide", "tetranucleotide"); | |
320 my @combinedarray_tags = ("mono", "di", "tri", "tetra"); | |
321 $species_counter = 0; | |
322 | |
323 foreach my $sp (@sp_tags){ | |
324 my $simple_interrupted = $pipedir.$sp."_simple_analyzed_simple"; | |
325 push @{$combinedarray[$species_counter]}, $pipedir.$sp."_simple_analyzed_simple_mono", $pipedir.$sp."_simple_analyzed_simple_di", $pipedir.$sp."_simple_analyzed_simple_tri", $pipedir.$sp."_simple_analyzed_simple_tetra"; | |
326 $species_counter++; | |
327 } | |
328 | |
329 #------------------------------------------------------------------------------- | |
330 # PUT TOGETHER THE INTERRUPTED AND SIMPLE MICROSATELLITES BASED ON THEIR MOTIF SIZE FOR FURTHER EXTENTION | |
331 my $sp_counter = 0; | |
332 foreach my $sp (@sp_tags){ | |
333 my $analyzed_simple = $pipedir.$sp."_sput_op4_compound_interrupted_filtered"; | |
334 my $clarifyed_simple = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered"; | |
335 my $simple = $pipedir.$sp."_sput_op4_simple_filtered"; | |
336 my $simple_analyzed_simple = $pipedir.$sp."_simple_analyzed_simple"; | |
337 `cat $analyzed_simple $clarifyed_simple $simple > $simple_analyzed_simple`; | |
338 for my $i (0 ... 3){ | |
339 `grep "$combinedarray_indicators[$i]" $simple_analyzed_simple > $combinedarray[$sp_counter][$i]`; | |
340 } | |
341 $sp_counter++; | |
342 } | |
343 #print "\ndone grouping interrupted & simple microsats based on their motif size for further extention\n"; | |
344 | |
345 #------------------------------------------------------------------------------- | |
346 # BREAK CHROMOSOME INTO PARTS OF CERTAIN NO. CONTIGS EACH, FOR FUTURE SEARCHING OF INTERRUPTED MICROSATELLITES | |
347 # ESPECIALLY DI, TRI AND TETRANUCLEOTIDE MICROSATELLITES | |
348 @temp_tags = @sp_tags; | |
349 my $increment = 1000000; | |
350 my @splist = (); | |
351 my $targetdir = $pipedir; | |
352 $species_counter=0; | |
353 | |
354 foreach my $sp (@sp_tags){ | |
355 my $presp = $presp_tags[$species_counter]; | |
356 $species_counter++; | |
357 my $localtag = shift @temp_tags; | |
358 my $locallist = $targetdir.$localtag."_".$p_chr."_list"; | |
359 push(@splist, $locallist); | |
360 my $input = $pipedir.$presp."_nogap_op_unrand2_match"; | |
361 chromosome_unrand_breaker($input,$targetdir,$locallist,$increment, $localtag, $pchr); | |
362 } | |
363 | |
364 | |
365 my @unionarray = (); | |
366 #print "splist=@splist\n"; | |
367 #------------------------------------------------------------------------------- | |
368 # FIND INTERRUPTED MICROSATELLITES | |
369 | |
370 $species_counter = 0; | |
371 | |
372 for my $i (0 .. $#combinedarray){ | |
373 | |
374 @jobIDs = (); | |
375 open (JLIST1, "$splist[$i]") or die "Cannot open file $splist[$i]: $!"; | |
376 | |
377 while (my $sp1 = <JLIST1>){ | |
378 #print "$splist[$i]: sp1=$sp1\n"; | |
379 chomp $sp1; | |
380 | |
381 for my $j (0 ... $#combinedarray_tags){ | |
382 my $interr = $sp1."_interr_".$combinedarray_tags[$j]; | |
383 my $simple = $sp1."_simple_".$combinedarray_tags[$j]; | |
384 push @{$unionarray[$i]}, $interr, $simple; | |
385 multiSpecies_interruptedMicrosatHunter($combinedarray[$i][$j],$sp1,$interr ,$simple, $orgs[$species_counter], scalar(@sp_tags), "3_4_6_8"); | |
386 } | |
387 } | |
388 $species_counter++; | |
389 } | |
390 close JLIST1; | |
391 #------------------------------------------------------------------------------- | |
392 # REUNION AND ZIPPING BEFORE T10.PL | |
393 | |
394 my @allarray = (); | |
395 | |
396 for my $i (0 ... $#sp_tags){ | |
397 my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats"; | |
398 unlink $localfile if -e $localfile; | |
399 push(@allarray, $localfile); | |
400 | |
401 my $unfiltered_localfile= $localfile."_unfiltered"; | |
402 my $residue_localfile= $localfile."_residue"; | |
403 | |
404 unlink $unfiltered_localfile; | |
405 #unlink $unfiltered_localfile; | |
406 for my $j (0 ... $#{$unionarray[$i]}){ | |
407 #print "listing files for species $i and list number $j= \n$unionarray[$i][$j] \n"; | |
408 `cat $unionarray[$i][$j] >> $unfiltered_localfile`; | |
409 unlink $unionarray[$i][$j]; | |
410 } | |
411 | |
412 multiSpecies_filtering_interrupted_microsats($unfiltered_localfile, $localfile, $residue_localfile,$threshold_array,$threshold_array,scalar(@sp_tags) ); | |
413 my $analyzed_compound = $pipedir.$sp_tags[$i]."_sput_op4_compound_compound_filtered"; | |
414 my $simple_residue = $pipedir.$sp_tags[$i]."_sput_op4_simple_residue"; | |
415 my $compound_residue = $pipedir.$sp_tags[$i]."_sput_op4_compound_residue"; | |
416 | |
417 `cat $analyzed_compound >> $localfile`; | |
418 } | |
419 #------------------------------------------------------------------------------- | |
420 # MERGING MICROSATELLITES THAT ARE VERY CLOSE TO EACH OTHER, INCLUDING THOSE FOUND BY SEARCHING IN 2 OPPOSIT DIRECTIONS | |
421 | |
422 my $toescape=0; | |
423 | |
424 | |
425 for my $i (0 ... $#sp_tags){ | |
426 my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats"; | |
427 $localfile =~ /$focalspec\-(chr[0-9a-zA-Z]+)\./; | |
428 my $direction = $1; | |
429 #print "localfile = $localfile , direction = $direction\n"; | |
430 # `gzip $reverse_chr_name` if $direction =~ /chr[0-9a-zA-Z]+r/ && $switchboard{"deleting_processFiles"} != 1; | |
431 $toescape =1 if $direction =~ /chr[0-9a-zA-Z]+r/; | |
432 last if $direction =~ /chr[0-9a-zA-Z]+r/; | |
433 my $nogap_sequence = $pipedir.$presp_tags[$i]."_nogap_op_unrand2_match"; | |
434 my $gap_sequence = $pipedir.$presp_tags[$i]."_gap_op_unrand_match"; | |
435 my $reverselocal = $localfile; | |
436 $reverselocal =~ s/\-chr([0-9a-zA-Z]+)\./-chr$1r./g; | |
437 merge_interruptedMicrosats($nogap_sequence,$localfile, $reverselocal ,scalar(@sp_tags)); | |
438 #------------------------------------------------------------------------------- | |
439 my $forward_separate = $localfile."_separate"; | |
440 my $reverse_separate = $reverselocal."_separate"; | |
441 my $diff = $forward_separate."_diff"; | |
442 my $miss = $forward_separate."_miss"; | |
443 my $common = $forward_separate."_common"; | |
444 forward_reverse_sputoutput_comparer($nogap_sequence,$forward_separate, $reverse_separate, $diff, $miss, $common ,scalar(@sp_tags)); | |
445 #------------------------------------------------------------------------------- | |
446 my $symmetrical_file = $localfile."_symmetrical"; | |
447 my $merged_file = $localfile."_merged"; | |
448 #print "cating: $merged_file $common into -> $symmetrical_file \n"; | |
449 `cat $merged_file $common > $symmetrical_file`; | |
450 #------------------------------------------------------------------------------- | |
451 my $t10output = $symmetrical_file."_fin_hit_all_2"; | |
452 new_multispecies_t10($gap_sequence, $symmetrical_file, $t10output, join(".", @orgs)); | |
453 #------------------------------------------------------------------------------- | |
454 } | |
455 next if $toescape == 1; | |
456 #------------------------------------------------------------------------------------------------ | |
457 # BRINGING IT ALL TOGETHER: FINDING ORTHOLOGOUS MICROSATELLITES AMONG THE SPECIES | |
458 | |
459 | |
460 my @micros_array = (); | |
461 my $sampletag = (); | |
462 for my $i (0 ... $#sp_tags){ | |
463 my $finhitFile = $pipedir.$sp_tags[$i]."_allmicrosats_symmetrical_fin_hit_all_2"; | |
464 push(@micros_array, $finhitFile); | |
465 $sampletag = $sp_tags[$i]; | |
466 } | |
467 #$sampletag =~ s/^([A-Z]+\.)/ORTH_/; | |
468 #$sampletag = $sampletag."_monoThresh-".$mono_threshold."bp"; | |
469 my $orthfiletemp = $ptag."_orthfile"; | |
470 my $orthanswer = multiSpecies_orthFinder4($t1input, join(":",@micros_array), $orthfiletemp, join(":", @orgs), $separation); | |
471 | |
472 my $maskedorthfiletemp = $ptag."_orthfile_masked"; | |
473 qualityFilter ($orthfiletemp, $chr_name_sputt, $maskedorthfiletemp); | |
474 | |
475 push @outputfiles , $maskedorthfiletemp; | |
476 } | |
477 $date = `date`; | |
478 } | |
479 | |
480 `cat @outputfiles > $orthfile`; | |
481 | |
482 my $rootdir = $dir; | |
483 $rootdir =~ s/\/[A-Za-z0-9\-_]+$//; | |
484 chdir $rootdir; | |
485 remove_tree($dir); | |
486 | |
487 #print "date = $date\n"; | |
488 #remove_tree($tdir); | |
489 #------------------------------------------------------------------------------------------------ | |
490 #------------------------------------------------------------------------------------------------ | |
491 #------------------------------------------------------------------------------------------------ | |
492 #------------------------------------------------------------------------------------------------ | |
493 | |
494 #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx | |
495 | |
496 sub maftoAxt_multispecies { | |
497 #print "in maftoAxt_multispecies : got @_\n"; | |
498 my $fname=$_[0]; | |
499 open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n"; | |
500 my $treedefinition = $_[1]; | |
501 open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n"; | |
502 my $counter = 0; | |
503 my $exactspeciesset = $_[3]; | |
504 my @exactspeciesset_unarranged = split(/,/,$exactspeciesset); | |
505 | |
506 $treedefinition=~s/[\)\(, ]/\t/g; | |
507 my @species=split(/\t+/,$treedefinition); | |
508 my @exactspecies=(); | |
509 | |
510 foreach my $spec (@species){ | |
511 foreach my $espec (@exactspeciesset_unarranged){ | |
512 push @exactspecies, $spec if $spec eq $espec; | |
513 } | |
514 } | |
515 #print "exactspecies=@exactspecies\n"; | |
516 | |
517 ########### | |
518 my $select = 2; | |
519 #select = 1 if all species need sequences to be present for each block otherwise, it is 0 | |
520 #select = 2 only the allowed set make up the alignment. use the removeset | |
521 # information to detect alignmenets that have other important genomes aligned. | |
522 ########### | |
523 my @allowedset = (); | |
524 @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0; | |
525 @allowedset = join("_",0,@species) if $select == 1; | |
526 #print "species = @species , allowedset =",join("\n", @allowedset) ," \n"; | |
527 @allowedset = join("_",0,@exactspecies) if $select == 2; | |
528 #print "allowedset = @allowedset and exactspecies = @exactspecies\n"; | |
529 | |
530 my $start = 0; | |
531 my @sequences = (); | |
532 my @titles = (); | |
533 my $species_counter = "0"; | |
534 my $countermatch = 0; | |
535 my $outsideSpecies=0; | |
536 | |
537 while(my $line = <IN>){ | |
538 # print $line; | |
539 next if $line =~ /^#/; | |
540 next if $line =~ /^i/; | |
541 chomp $line; | |
542 my @fields = split(/\s+/,$line); | |
543 chomp $line; | |
544 if ($line =~ /^a /){ | |
545 $start = 1; | |
546 } | |
547 | |
548 if ($line =~ /^s /){ | |
549 | |
550 foreach my $sp (@allspecies){ | |
551 # print "checking species $sp\n"; | |
552 if ($fields[1] =~ /$sp/){ | |
553 $species_counter = $species_counter."_".$sp; | |
554 push(@sequences, $fields[6]); | |
555 my @sp_info = split(/\./,$fields[1]); | |
556 my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]); | |
557 push(@titles, $title); | |
558 # print "species_counter = $species_counter\n"; | |
559 } | |
560 } | |
561 } | |
562 | |
563 if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){ | |
564 # print "species_counter = $species_counter\n"; | |
565 my $arranged = reorderSpecies($species_counter, @allspecies); | |
566 my $stopper = 1; | |
567 my $arrno = 0; | |
568 | |
569 # print "checking if ", scalar(@sequences), " match @exactspecies allowedset=@allowedset\n"; | |
570 if (scalar(@sequences) == scalar(@exactspecies)){ | |
571 foreach my $set (@allowedset){ | |
572 # print "testing $arranged against $set\n"; | |
573 if ($arranged eq $set){ | |
574 $stopper = 0; last; | |
575 } | |
576 $arrno++; | |
577 } | |
578 } | |
579 else{ | |
580 $stopper = 1; | |
581 } | |
582 | |
583 | |
584 if ($stopper == 0) { | |
585 @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged; | |
586 @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged; | |
587 my $filteredseq = filter_gaps(@sequences); | |
588 | |
589 if ($filteredseq ne "SHORT"){ | |
590 #print "printing"; <STDIN>; | |
591 $counter++; | |
592 print OUT join (" ",$counter, @titles), "\n"; | |
593 print OUT $filteredseq, "\n"; | |
594 print OUT "\n"; | |
595 $countermatch++; | |
596 } | |
597 } | |
598 else{ #print "nexting\n";<STDIN>; | |
599 } | |
600 | |
601 @sequences = (); @titles = (); $start = 0;$species_counter = "0"; | |
602 next; | |
603 | |
604 } | |
605 } | |
606 # print "countermatch = $countermatch\n"; | |
607 } | |
608 | |
609 sub reorderSpecies{ | |
610 my @inarr=@_; | |
611 my $currSpecies = shift (@inarr); | |
612 my $ordered_species = 0; | |
613 my @species=@inarr; | |
614 #print "species = @species\n"; | |
615 foreach my $order (@species){ | |
616 $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/; | |
617 } | |
618 return $ordered_species; | |
619 | |
620 } | |
621 | |
622 sub filter_gaps{ | |
623 my @sequences = @_; | |
624 # print "sequences sent are @sequences\n"; | |
625 my $seq_length = length($sequences[0]); | |
626 my $seq_no = scalar(@sequences); | |
627 my $allgaps = (); | |
628 for (1 ... $seq_no){ | |
629 $allgaps = $allgaps."-"; | |
630 } | |
631 | |
632 my @seq_array = (); | |
633 my $seq_counter = 0; | |
634 foreach my $seq (@sequences){ | |
635 # my @sequence = split(/\s*/,$seq); | |
636 $seq_array[$seq_counter] = [split(/\s*/,$seq)]; | |
637 # push @seq_array, [@sequence]; | |
638 $seq_counter++; | |
639 } | |
640 my $g = 0; | |
641 while ( $g < $seq_length){ | |
642 last if (!exists $seq_array[0][$g]); | |
643 my $bases = (); | |
644 for my $u (0 ... $#seq_array){ | |
645 $bases = $bases.$seq_array[$u][$g]; | |
646 } | |
647 # print $bases, "\n"; | |
648 if ($bases eq $allgaps){ | |
649 # print "bases are $bases, position is $g \n"; | |
650 for my $seq (@seq_array){ | |
651 splice(@$seq , $g, 1); | |
652 } | |
653 } | |
654 else { | |
655 $g++; | |
656 } | |
657 } | |
658 | |
659 my @outs = (); | |
660 | |
661 foreach my $seq (@seq_array){ | |
662 push(@outs, join("",@$seq)); | |
663 } | |
664 return "SHORT" if length($outs[0]) <=100; | |
665 return (join("\n", @outs)); | |
666 } | |
667 | |
668 | |
669 sub allowedSetOfSpecies{ | |
670 my @allowed_species = split(/_/,$_[0]); | |
671 unshift @allowed_species, 0; | |
672 # print "allowed set = @allowed_species \n"; | |
673 my @output = (); | |
674 for (0 ... scalar(@allowed_species) - 4){ | |
675 push(@output, join("_",@allowed_species)); | |
676 pop @allowed_species; | |
677 } | |
678 return join(";",reverse(@output)); | |
679 | |
680 } | |
681 | |
682 | |
683 sub orderInfo{ | |
684 my @info = split(/;/,$_[0]); | |
685 # print "info = @info"; | |
686 my @old = split(/_/,$_[1]); | |
687 my @new = split(/_/,$_[2]); | |
688 shift @old; shift @new; | |
689 my @outinfo = (); | |
690 foreach my $spe (@new){ | |
691 for my $no (0 ... $#old){ | |
692 if ($spe eq $old[$no]){ | |
693 push(@outinfo, $info[$no]); | |
694 } | |
695 } | |
696 } | |
697 # print "outinfo = @outinfo \n"; | |
698 return join(";", @outinfo); | |
699 } | |
700 | |
701 #xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx | |
702 | |
703 #xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx | |
704 sub artificial_axdata_inverter{ | |
705 open(IN,"<$_[0]") or die "Cannot open file $_[0]: $!"; | |
706 open(OUT,">$_[1]") or die "Cannot open file $_[1]: $!"; | |
707 my $linecounter=0; | |
708 while (my $line = <IN>){ | |
709 $linecounter++; | |
710 #print "$linecounter\n"; | |
711 chomp $line; | |
712 my $final_line = $line; | |
713 my $trycounter = 0; | |
714 if ($line =~ /^[a-zA-Z\-]/){ | |
715 # while ($final_line eq $line){ | |
716 my @fields = split(/\s*/,$line); | |
717 | |
718 $final_line = join("",reverse(@fields)); | |
719 # print colored ['red'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/; | |
720 # $trycounter++; | |
721 # print "trying again....$trycounter : $final_line\n" if $final_line eq $line; | |
722 # } | |
723 } | |
724 | |
725 # print colored ['yellow'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/; | |
726 if ($line =~ /^[0-9]/){ | |
727 $line =~ s/chr([A-Z0-9a-b]+)/chr$1r/g; | |
728 $final_line = $line; | |
729 } | |
730 print OUT $final_line,"\n"; | |
731 #print "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/; | |
732 } | |
733 close OUT; | |
734 } | |
735 #xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx | |
736 | |
737 | |
738 #xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx | |
739 | |
740 sub multi_species_t1 { | |
741 | |
742 my $input1 = $_[0]; | |
743 #print "@_\n"; <STDIN>; | |
744 my @tags = split(/_/, $_[1]); | |
745 my @outputs = split(/,/, $_[2]); | |
746 my $title_query = $_[3]; | |
747 my @handles = (); | |
748 | |
749 open(FILEB,"<$input1")or die "Cannot open file: $input1 $!"; | |
750 my $i = 0; | |
751 foreach my $path (@outputs){ | |
752 $handles[$i] = IO::Handle->new(); | |
753 open ($handles[$i], ">$path") or die "Can't open $path : $!"; | |
754 $i++; | |
755 } | |
756 | |
757 my $curdef; | |
758 my $start = 0; | |
759 | |
760 while (my $line = <FILEB> ) { | |
761 if ($line =~ /^\d/){ | |
762 $line =~ s/ +/\t/g; | |
763 my @fields = split(/\s+/, $line); | |
764 if (($line =~ /$title_query/)){ | |
765 my $title = $line; | |
766 my $counter = 0; | |
767 foreach my $tag (@tags){ | |
768 $line = <FILEB>; | |
769 print {$handles[$counter]} ">",$tag,"\t",$title, " ",$line; | |
770 $counter++; | |
771 } | |
772 } | |
773 else{ | |
774 foreach my $tag (@tags){ | |
775 my $tine = <FILEB>; | |
776 } | |
777 } | |
778 | |
779 } | |
780 } | |
781 | |
782 foreach my $hand (@handles){ | |
783 $hand->close(); | |
784 } | |
785 | |
786 close FILEB; | |
787 } | |
788 | |
789 #xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx | |
790 | |
791 #xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx | |
792 | |
793 sub multi_species_t2{ | |
794 | |
795 my $input = $_[0]; | |
796 my $species = $_[1]; | |
797 my $output1 = $input."_unr"; | |
798 | |
799 #------------------------------------------------------------------------------------------ | |
800 open (FILEF1, "<$input") or die "Cannot open file $input :$!"; | |
801 open (FILEF2, ">$output1") or die "Cannot open file $output1 :$!"; | |
802 | |
803 my $line1 = <FILEF1>; | |
804 | |
805 while($line1){ | |
806 { | |
807 # chomp($line); | |
808 if ($line1 =~ (m/^\>$species/)){ | |
809 chomp($line1); | |
810 print FILEF2 $line1; | |
811 $line1 = <FILEF1>; | |
812 chomp($line1); | |
813 print FILEF2 "\t", $line1,"\n"; | |
814 } | |
815 } | |
816 $line1 = <FILEF1>; | |
817 } | |
818 | |
819 close FILEF1; | |
820 close FILEF2; | |
821 #------------------------------------------------------------------------------------------ | |
822 | |
823 my $output2 = $output1."and"; | |
824 my $output3 = $output1."and2"; | |
825 open(IN,"<$output1"); | |
826 open (FILEF3, ">$output2"); | |
827 open (FILEF4, ">$output3"); | |
828 | |
829 | |
830 while (<IN>){ | |
831 my $line = $_; | |
832 chomp($line); | |
833 my @fields=split (/\t/, $line); | |
834 # print $line,"\n"; <STDIN>; | |
835 if($line !~ /random/){ | |
836 print FILEF3 join ("\t",@fields[0 ... scalar(@fields)-2]), "\n", $fields[scalar(@fields)-1], "\n"; | |
837 print FILEF4 join ("\t",@fields[0 ... scalar(@fields)-2]), "\t", $fields[scalar(@fields)-1], "\n"; | |
838 } | |
839 } | |
840 | |
841 | |
842 close IN; | |
843 close FILEF3; | |
844 close FILEF4; | |
845 unlink $output1; | |
846 | |
847 #------------------------------------------------------------------------------------------ | |
848 # OLD T3.PL RUDIMENT | |
849 | |
850 my $t3output = $output2; | |
851 $t3output =~ s/gap_op_unrand/nogap_op_unrand/g; | |
852 | |
853 open(IN,"<$output2"); | |
854 open(OUTA,">$t3output"); | |
855 | |
856 | |
857 while (<IN>){ | |
858 s/-//g unless /^>/; | |
859 print OUTA; | |
860 } | |
861 | |
862 close IN; | |
863 close OUTA; | |
864 #------------------------------------------------------------------------------------------ | |
865 } | |
866 #xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx | |
867 | |
868 | |
869 #xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxxmulti_species_t2_2 xxxxxxx | |
870 sub multi_species_t2_2{ | |
871 #print "IN multi_species_t2_2 : @_\n"; | |
872 my $input = $_[0]; | |
873 my $species = $_[1]; | |
874 my $output1 = $input."2"; | |
875 | |
876 | |
877 open (FILEF1, "<$input"); | |
878 open (FILEF2, ">$output1"); | |
879 | |
880 my $line1 = <FILEF1>; | |
881 | |
882 while($line1){ | |
883 { | |
884 # chomp($line); | |
885 if ($line1 =~ (m/^\>$species/)){ | |
886 chomp($line1); | |
887 print FILEF2 $line1; | |
888 $line1 = <FILEF1>; | |
889 chomp($line1); | |
890 print FILEF2 "\t", $line1,"\n"; | |
891 } | |
892 } | |
893 $line1 = <FILEF1>; | |
894 } | |
895 | |
896 close FILEF1; | |
897 close FILEF2; | |
898 } | |
899 | |
900 #xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx | |
901 | |
902 | |
903 #xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx | |
904 sub sputnikoutput_corrector{ | |
905 my $input = $_[0]; | |
906 my $output = $_[1]; | |
907 open(IN,"<$input") or die "Cannot open file $input :$!"; | |
908 open(OUT,">$output") or die "Cannot open file $output :$!"; | |
909 my $tine; | |
910 while (my $line=<IN>){ | |
911 if($line =~/length /){ | |
912 $tine = $line; | |
913 $tine =~ s/\s+/\t/g; | |
914 my @fields = split(/\t/,$tine); | |
915 if ($fields[6] > 60){ | |
916 print OUT $line; | |
917 $line = <IN>; | |
918 | |
919 while (($line !~ /nucleotide/) && ($line !~ /^>/)){ | |
920 chomp $line; | |
921 print OUT $line; | |
922 $line = <IN>; | |
923 } | |
924 print OUT "\n"; | |
925 print OUT $line; | |
926 } | |
927 else{ | |
928 print OUT $line; | |
929 } | |
930 } | |
931 else{ | |
932 print OUT $line; | |
933 } | |
934 } | |
935 close IN; | |
936 close OUT; | |
937 } | |
938 #xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx | |
939 | |
940 | |
941 #xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx | |
942 sub multi_species_t4{ | |
943 # print "multi_species_t4 : @_\n"; | |
944 my $input = $_[0]; | |
945 my $output = $_[1]; | |
946 open (FILEA, "<$input"); | |
947 open (FILEB, ">$output"); | |
948 | |
949 my $line = <FILEA>; | |
950 | |
951 while ($line) { | |
952 # chomp $line; | |
953 if ($line =~ />/) { | |
954 chomp $line; | |
955 print FILEB $line, "\n"; | |
956 } | |
957 | |
958 | |
959 if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){ | |
960 chomp $line; | |
961 print FILEB $line, " " ; | |
962 $line = <FILEA>; | |
963 chomp $line; | |
964 print FILEB $line,"\n"; | |
965 } | |
966 | |
967 $line = <FILEA>; | |
968 } | |
969 | |
970 | |
971 close FILEA; | |
972 close FILEB; | |
973 | |
974 } | |
975 | |
976 #xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx | |
977 | |
978 | |
979 #xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx | |
980 sub multi_species_t5{ | |
981 | |
982 my $input = $_[0]; | |
983 my $output = $_[1]; | |
984 | |
985 open(FILEB,"<$input"); | |
986 open(FILEC,">$output"); | |
987 | |
988 my $curdef; | |
989 | |
990 while (my $line = <FILEB> ) { | |
991 | |
992 if ($line =~ /^>/){ | |
993 chomp $line; | |
994 $curdef = $line; | |
995 next; | |
996 } | |
997 | |
998 if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){ | |
999 print FILEC $curdef," ",$line; | |
1000 } | |
1001 | |
1002 } | |
1003 | |
1004 | |
1005 close FILEB; | |
1006 close FILEC; | |
1007 | |
1008 } | |
1009 #xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx | |
1010 | |
1011 | |
1012 #xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx | |
1013 sub multi_species_t6{ | |
1014 my $input = $_[0]; | |
1015 my $output = $_[1]; | |
1016 my $focalstrand=$_[3]; | |
1017 # print "inpput = @_\n"; | |
1018 open (FILE, "<$input"); | |
1019 open (FILE_MICRO, ">$output"); | |
1020 my $linecounter=0; | |
1021 while (my $line = <FILE>){ | |
1022 $linecounter++; | |
1023 chomp $line; | |
1024 #print "line = $line\n"; | |
1025 #MONO# | |
1026 $line =~ /$focalspec\s[a-zA-Z]+[0-9a-zA-Z]+\s[0-9]+\s[0-9]+\s([+\-])/; | |
1027 my $strand=$1; | |
1028 my $no_of_species = ($line =~ s/\s+[+\-]\s+/ /g); | |
1029 #print "line = $line\n"; | |
1030 my $specfieldsend = 2 + ($no_of_species*4) - 1; | |
1031 my @fields = split(/\s+/, $line); | |
1032 my @speciesdata = @fields[0 ... $specfieldsend]; | |
1033 $line =~ /([a-z]+nucleotide)\s([0-9]+)\s:\s([0-9]+)/; | |
1034 my ($tide, $start, $end) = ($1, $2, $3); | |
1035 #print "no_of_species=$no_of_species.. speciesdata = @speciesdata and ($tide, $start, $end)\n"; | |
1036 if($line =~ /mononucleotide/){ | |
1037 print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], mono($fields[$#fields]),),"\n"; | |
1038 } | |
1039 #DI# | |
1040 elsif($line =~ /dinucleotide/){ | |
1041 print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], di($fields[$#fields]),),"\n"; | |
1042 } | |
1043 #TRI# | |
1044 elsif($line =~ /trinucleotide/ ){ | |
1045 print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tri($fields[$#fields]),),"\n"; | |
1046 } | |
1047 #TETRA# | |
1048 elsif($line =~ /tetranucleotide/){ | |
1049 print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tetra($fields[$#fields]),),"\n"; | |
1050 } | |
1051 #PENTA# | |
1052 elsif($line =~ /pentanucleotide/){ | |
1053 #print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], penta($fields[$#fields]),),"\n"; | |
1054 } | |
1055 else{ | |
1056 # print "not: @fields\n"; | |
1057 } | |
1058 } | |
1059 # print "linecounter=$linecounter\n"; | |
1060 close FILE; | |
1061 close FILE_MICRO; | |
1062 } | |
1063 | |
1064 sub mono { | |
1065 my $st = $_[0]; | |
1066 my $tp = unpack "A1"x(length($st)/1),$st; | |
1067 my $var1 = substr($tp, 0, 1); | |
1068 return join ("\t", $var1); | |
1069 } | |
1070 sub di { | |
1071 my $st = $_[0]; | |
1072 my $tp = unpack "A2"x(length($st)/2),$st; | |
1073 my $var1 = substr($tp, 0, 2); | |
1074 return join ("\t", $var1); | |
1075 } | |
1076 sub tri { | |
1077 my $st = $_[0]; | |
1078 my $tp = unpack "A3"x(length($st)/3),$st; | |
1079 my $var1 = substr($tp, 0, 3); | |
1080 return join ("\t", $var1); | |
1081 } | |
1082 sub tetra { | |
1083 my $st = $_[0]; | |
1084 my $tp = unpack "A4"x(length($st)/4),$st; | |
1085 my $var1 = substr($tp, 0, 4); | |
1086 return join ("\t", $var1); | |
1087 } | |
1088 sub penta { | |
1089 my $st = $_[0]; | |
1090 my $tp = unpack "A5"x(length($st)/5),$st; | |
1091 my $var1 = substr($tp, 0, 5); | |
1092 return join ("\t", $var1); | |
1093 } | |
1094 | |
1095 #xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx | |
1096 | |
1097 | |
1098 #xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx | |
1099 sub t9{ | |
1100 my $input1 = $_[0]; | |
1101 my $input2 = $_[1]; | |
1102 my $output = $_[2]; | |
1103 | |
1104 | |
1105 open(IN1,"<$input1") if -e $input1; | |
1106 open(IN2,"<$input2") or die "cannot open file $_[1] : $!"; | |
1107 open(OUT,">$output") or die "cannot open file $_[2] : $!"; | |
1108 | |
1109 | |
1110 my %seen = (); | |
1111 my $prevkey = 0; | |
1112 | |
1113 if (-e $input1){ | |
1114 while (my $line = <IN1>){ | |
1115 chomp($line); | |
1116 my @fields = split(/\t/,$line); | |
1117 my $key1 = join ("_K10K1_",@fields[0,1,3,4,5]); | |
1118 # print "key in t9 = $key1\n"; | |
1119 $seen{$key1}++ if ($prevkey ne $key1) ; | |
1120 $prevkey = $key1; | |
1121 } | |
1122 # print "done first hash\n"; | |
1123 close IN1; | |
1124 } | |
1125 | |
1126 while (my $line = <IN2>){ | |
1127 # print $line, "**\n"; | |
1128 if (-e $input1){ | |
1129 chomp($line); | |
1130 my @fields = split(/\t/,$line); | |
1131 my $key2 = join ("_K10K1_",@fields[0,1,3,4,5]); | |
1132 if (exists $seen{$key2}){ | |
1133 print OUT "$line\n" ; | |
1134 delete $seen{$key2}; | |
1135 } | |
1136 } | |
1137 else { | |
1138 print OUT "$line\n" ; | |
1139 # print "$line\n" ; | |
1140 } | |
1141 } | |
1142 | |
1143 close IN2; | |
1144 close OUT; | |
1145 } | |
1146 #xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx | |
1147 | |
1148 | |
1149 #xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx | |
1150 | |
1151 | |
1152 sub multiSpecies_compound_microsat_hunter3{ | |
1153 | |
1154 my $input1 = $_[0]; ###### the *_sput_op4_ii file | |
1155 my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2" | |
1156 my $output1 = $_[2]; ###### plain microsatellite file | |
1157 my $output2 = $_[3]; ###### compound microsatellite file | |
1158 my $org = $_[4]; ###### 1 or 2 | |
1159 $no_of_species = $_[5]; | |
1160 #print "IN multiSpecies_compound_microsat_hunter3: @_\n"; | |
1161 #my @tags = split(/\t/,$info); | |
1162 sub compoundify; | |
1163 open(IN,"<$input1") or die "Cannot open file $input1 $!"; | |
1164 open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; | |
1165 open(OUT,">$output1") or die "Cannot open file $output1 $!"; | |
1166 open(OUT2,">$output2") or die "Cannot open file $output2 $!"; | |
1167 $infocord = 2 + (4*$no_of_species) - 1; | |
1168 $startcord = 2 + (4*$no_of_species) + 2 - 1; | |
1169 $strandcord = 2 + (4*$no_of_species) + 3 - 1; | |
1170 $endcord = 2 + (4*$no_of_species) + 4 - 1; | |
1171 $microsatcord = 2 + (4*$no_of_species) + 5 - 1; | |
1172 $motifcord = 2 + (4*$no_of_species) + 6 - 1; | |
1173 my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; | |
1174 | |
1175 my @thresholds = ("0"); | |
1176 push(@thresholds, split(/_/,$_[6])); | |
1177 sub thresholdCheck; | |
1178 my %micros = (); | |
1179 while (my $line = <IN>){ | |
1180 # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n"; | |
1181 next if $line =~ /\t\t/; | |
1182 if ($line =~ /^>[A-Za-z0-9_]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { | |
1183 my $key = join("\t",$1, $2, $3, $4, $5); | |
1184 # print $key, "#-#-#-#-#-#-#-#\n"; | |
1185 push (@{$micros{$key}},$line); | |
1186 } | |
1187 else{ | |
1188 } | |
1189 } | |
1190 close IN; | |
1191 my @deletedlines = (); | |
1192 | |
1193 my $linecount = 0; | |
1194 | |
1195 while(my $sine = <SEQ>){ | |
1196 my %microstart=(); | |
1197 my %microend=(); | |
1198 | |
1199 my @sields = split(/\t/,$sine); | |
1200 | |
1201 my $key = (); | |
1202 | |
1203 if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { | |
1204 $key = join("\t",$1, $2, $3, $4, $5); | |
1205 # print $key, "<-<-<-<-<-<-<-<\n"; | |
1206 } | |
1207 else{ | |
1208 } | |
1209 | |
1210 if (exists $micros{$key}){ | |
1211 $linecount++; | |
1212 my @microstring = @{$micros{$key}}; | |
1213 my @tempmicrostring = @{$micros{$key}}; | |
1214 | |
1215 foreach my $line (@tempmicrostring){ | |
1216 my @fields = split(/\t/,$line); | |
1217 my $start = $fields[$startcord]; | |
1218 my $end = $fields[$endcord]; | |
1219 push (@{$microstart{$start}},$line); | |
1220 push (@{$microend{$end}},$line); | |
1221 } | |
1222 my $firstflag = 'down'; | |
1223 while( my $line =shift(@microstring)){ | |
1224 # print "-----------\nline = $line "; | |
1225 chomp $line; | |
1226 my @fields = split(/\t/,$line); | |
1227 my $start = $fields[$startcord]; | |
1228 my $end = $fields[$endcord]; | |
1229 my $startmicro = $line; | |
1230 my $endmicro = $line; | |
1231 | |
1232 # print "fields=@fields, start = $start end=$end, startcord=$startcord, endcord=$endcord\n"; | |
1233 | |
1234 delete ($microstart{$start}); | |
1235 delete ($microend{$end}); | |
1236 my $flag = 'down'; | |
1237 my $startflag = 'down'; | |
1238 my $endflag = 'down'; | |
1239 my $prestart = $start - $distance; | |
1240 my $postend = $end + $distance; | |
1241 my @compoundlines = (); | |
1242 my %compoundhash = (); | |
1243 push (@compoundlines, $line); | |
1244 push (@{$compoundhash{$line}},$line); | |
1245 my $startrank = 1; | |
1246 my $endrank = 1; | |
1247 | |
1248 while( ($startflag eq "down") || ($endflag eq "down") ){ | |
1249 if ((($prestart < 0) && $firstflag eq "up") || (($postend > length($sields[$sequencepos])) && $firstflag eq "up") ) { | |
1250 # print "coming to the end of sequence,prestart = $prestart & post end = $postend and sequence length =", length($sields[$sequencepos])," so exiting\n"; | |
1251 last; | |
1252 } | |
1253 | |
1254 $firstflag = "up"; | |
1255 if ($startflag eq "down"){ | |
1256 for my $i ($prestart ... $start){ | |
1257 | |
1258 if(exists $microend{$i}){ | |
1259 chomp $microend{$i}[0]; | |
1260 if(exists $compoundhash{$microend{$i}[0]}) {next;} | |
1261 # print "sending from microend $startmicro, $microend{$i}[0] |||\n"; | |
1262 if (identityMatch_thresholdCheck($startmicro, $microend{$i}[0], $startrank) eq "proceed"){ | |
1263 push(@compoundlines, $microend{$i}[0]); | |
1264 # print "accepted\n"; | |
1265 my @tields = split(/\t/,$microend{$i}[0]); | |
1266 $startmicro = $microend{$i}[0]; | |
1267 chomp $startmicro; | |
1268 $start = $tields[$startcord]; | |
1269 $flag = 'down'; | |
1270 $startrank++; | |
1271 # print "startcompund = $microend{$i}[0]\n"; | |
1272 delete $microend{$i}; | |
1273 delete $microstart{$start}; | |
1274 $startflag = 'down'; | |
1275 $prestart = $start - $distance; | |
1276 last; | |
1277 } | |
1278 else{ | |
1279 $flag = 'up'; | |
1280 $startflag = 'up'; | |
1281 } | |
1282 } | |
1283 else{ | |
1284 $flag = 'up'; | |
1285 $startflag = 'up'; | |
1286 } | |
1287 } | |
1288 } | |
1289 | |
1290 $endrank = $startrank; | |
1291 | |
1292 if ($endflag eq "down"){ | |
1293 for my $i ($end ... $postend){ | |
1294 | |
1295 if(exists $microstart{$i} ){ | |
1296 chomp $microstart{$i}[0]; | |
1297 if(exists $compoundhash{$microstart{$i}[0]}) {next;} | |
1298 # print "sending from microstart $endmicro, $microstart{$i}[0] |||\n"; | |
1299 | |
1300 if(identityMatch_thresholdCheck($endmicro,$microstart{$i}[0], $endrank) eq "proceed"){ | |
1301 push(@compoundlines, $microstart{$i}[0]); | |
1302 # print "accepted\n"; | |
1303 my @tields = split(/\t/,$microstart{$i}[0]); | |
1304 $end = $tields[$endcord]-0; | |
1305 $endmicro = $microstart{$i}[0]; | |
1306 $endrank++; | |
1307 chomp $endmicro; | |
1308 $flag = 'down'; | |
1309 # print "endcompund = $microstart{$i}[0]\n"; | |
1310 delete $microstart{$i}; | |
1311 delete $microend{$end}; | |
1312 shift @microstring; | |
1313 $postend = $end + $distance; | |
1314 $endflag = 'down'; | |
1315 last; | |
1316 } | |
1317 else{ | |
1318 $flag = 'up'; | |
1319 $endflag = 'up'; | |
1320 } | |
1321 } | |
1322 else{ | |
1323 $flag = 'up'; | |
1324 $endflag = 'up'; | |
1325 } | |
1326 } | |
1327 } | |
1328 # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n"; | |
1329 } #end while( $flag eq "down") | |
1330 # print "compoundlines = @compoundlines \n"; | |
1331 if (scalar (@compoundlines) == 1){ | |
1332 print OUT $line,"\n"; | |
1333 } | |
1334 if (scalar (@compoundlines) > 1){ | |
1335 my $compoundline = compoundify(\@compoundlines, $sields[$sequencepos]); | |
1336 # print $compoundline,"\n"; | |
1337 print OUT2 $compoundline,"\n"; | |
1338 } | |
1339 } #end foreach my $line (@microstring){ | |
1340 } #if (exists $micros{$key}){ | |
1341 | |
1342 | |
1343 } | |
1344 | |
1345 close OUT; | |
1346 close OUT2; | |
1347 } | |
1348 | |
1349 | |
1350 #------------------------------------------------------------------------------------------------ | |
1351 sub compoundify{ | |
1352 my ($compoundlines, $sequence) = @_; | |
1353 # print "\nfound to compound : @$compoundlines and$sequence \n"; | |
1354 my $noOfComps = @$compoundlines; | |
1355 # print "Number of elements in hash is $noOfComps\n"; | |
1356 my @starts; | |
1357 my @ends; | |
1358 foreach my $line (@$compoundlines){ | |
1359 # print "compoundify.. line = $line \n"; | |
1360 chomp $line; | |
1361 my @fields = split(/\t/,$line); | |
1362 my $start = $fields[$startcord]; | |
1363 my $end = $fields[$endcord]; | |
1364 # print "start = $start, end = $end \n"; | |
1365 push(@starts, $start); | |
1366 push(@ends,$end); | |
1367 } | |
1368 my @temp = @$compoundlines; | |
1369 my $startline=$temp[0]; | |
1370 my @mields = split(/\t/,$startline); | |
1371 my $startcoord = $mields[$startcord]; | |
1372 my $startgapsign=$mields[$endcord]; | |
1373 my @startsorted = sort { $a <=> $b } @starts; | |
1374 my @endsorted = sort { $a <=> $b } @ends; | |
1375 my @intervals; | |
1376 for my $end (0 ... (scalar(@endsorted)-2)){ | |
1377 my $interval = substr($sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1)); | |
1378 push(@intervals,$interval); | |
1379 # print "interval = $interval =\n"; | |
1380 # print "substr(sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1))\n"; | |
1381 } | |
1382 push(@intervals,""); | |
1383 my $compoundmicrosat=(); | |
1384 my $multiunit=""; | |
1385 foreach my $line (@$compoundlines){ | |
1386 my @fields = split(/\t/,$line); | |
1387 my $component="[".$fields[$microsatcord]."]".shift(@intervals); | |
1388 $compoundmicrosat=$compoundmicrosat.$component; | |
1389 $multiunit=$multiunit."[".$fields[$motifcord]."]"; | |
1390 # print "multiunit = $multiunit\n"; | |
1391 } | |
1392 my $compoundcopy = $compoundmicrosat; | |
1393 $compoundcopy =~ s/\[|\]//g; | |
1394 my $compoundlength = $mields[$startcord] + length($compoundcopy) - 1; | |
1395 | |
1396 | |
1397 my $compoundline = join("\t",(@mields[0 ... $infocord], "compound",@mields[$startcord ... $startcord+1],$compoundlength,$compoundmicrosat, $multiunit)); | |
1398 return $compoundline; | |
1399 } | |
1400 | |
1401 #------------------------------------------------------------------------------------------------ | |
1402 | |
1403 sub identityMatch_thresholdCheck{ | |
1404 my $line1 = $_[0]; | |
1405 my $line2 = $_[1]; | |
1406 my $rank = $_[2]; | |
1407 my @lields1 = split(/\t/,$line1); | |
1408 my @lields2 = split(/\t/,$line2); | |
1409 # print "recieved $line1 && $line2\n motif comparison: ", length($lields1[$motifcord])," : ",length($lields2[$motifcord]),"\n"; | |
1410 | |
1411 if (length($lields1[$motifcord]) == length($lields2[$motifcord])){ | |
1412 my $probe = $lields1[$motifcord].$lields1[$motifcord]; | |
1413 #print "$probe :: $lields2[$motifcord]\n"; | |
1414 return "proceed" if $probe =~ /$lields2[$motifcord]/; | |
1415 #print "line recieved\n"; | |
1416 if ($rank ==1){ | |
1417 return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed"; | |
1418 } | |
1419 else { | |
1420 return "proceed" if thresholdCheck($line2) eq "proceed"; | |
1421 return "stop"; | |
1422 } | |
1423 } | |
1424 else{ | |
1425 if ($rank ==1){ | |
1426 return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed"; | |
1427 } | |
1428 else { | |
1429 return "proceed" if thresholdCheck($line2) eq "proceed"; | |
1430 return "stop"; | |
1431 } | |
1432 } | |
1433 return "stop"; | |
1434 } | |
1435 #------------------------------------------------------------------------------------------------ | |
1436 | |
1437 sub thresholdCheck{ | |
1438 my @checkthresholds=(0,@thresholds); | |
1439 #print "IN thresholdCheck: @_\n"; | |
1440 my $line = $_[0]; | |
1441 my @lields = split(/\t/,$line); | |
1442 return "proceed" if length($lields[$microsatcord]) >= $checkthresholds[length($lields[$motifcord])]; | |
1443 return "stop"; | |
1444 } | |
1445 #xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx | |
1446 | |
1447 | |
1448 #xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx | |
1449 | |
1450 sub multiSpecies_filtering_interrupted_microsats{ | |
1451 # print "IN multiSpecies_filtering_interrupted_microsats: @_\n"; | |
1452 my $unfiltered = $_[0]; | |
1453 my $filtered = $_[1]; | |
1454 my $residue = $_[2]; | |
1455 my $no_of_species = $_[5]; | |
1456 open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!"; | |
1457 open(FIL,">$filtered") or die "Cannot open file $filtered: $!"; | |
1458 open(RES,">$residue") or die "Cannot open file $residue: $!"; | |
1459 | |
1460 $infocord = 2 + (4*$no_of_species) - 1; | |
1461 $startcord = 2 + (4*$no_of_species) + 2 - 1; | |
1462 $strandcord = 2 + (4*$no_of_species) + 3 - 1; | |
1463 $endcord = 2 + (4*$no_of_species) + 4 - 1; | |
1464 $microsatcord = 2 + (4*$no_of_species) + 5 - 1; | |
1465 $motifcord = 2 + (4*$no_of_species) + 6 - 1; | |
1466 | |
1467 | |
1468 my @sub_thresholds = (0); | |
1469 | |
1470 push(@sub_thresholds, split(/_/,$_[3])); | |
1471 my @thresholds = (0); | |
1472 | |
1473 push(@thresholds, split(/_/,$_[4])); | |
1474 | |
1475 while (my $line = <UNF>) { | |
1476 next if $line !~ /[a-z]/; | |
1477 #print $line; | |
1478 chomp $line; | |
1479 my @fields = split(/\t/,$line); | |
1480 my $motif = $fields[$motifcord]; | |
1481 my $realmotif = $motif; | |
1482 #print "motif = $motif\n"; | |
1483 if ($motif =~ /^\[/){ | |
1484 $motif =~ s/^\[//g; | |
1485 my @motifs = split(/\]/,$motif); | |
1486 $realmotif = $motifs[0]; | |
1487 } | |
1488 # print "realmotif = $realmotif"; | |
1489 my $motif_size = length($realmotif); | |
1490 | |
1491 my $microsat = $fields[$microsatcord]; | |
1492 # print "microsat = $microsat\n"; | |
1493 $microsat =~ s/^\[|\]$//sg; | |
1494 my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat); | |
1495 | |
1496 $microsat = join("",@microsats); | |
1497 if (length($microsat) < $thresholds[$motif_size]) { | |
1498 # print length($microsat)," < ",$thresholds[$motif_size],"\n"; | |
1499 print RES $line,"\n"; next; | |
1500 } | |
1501 my @lengths = (); | |
1502 foreach my $mic (@microsats){ | |
1503 push(@lengths, length($mic)); | |
1504 } | |
1505 if (largest_microsat(@lengths) < $sub_thresholds[$motif_size]) { | |
1506 # print largest_microsat(@lengths)," < ",$sub_thresholds[$motif_size],"\n"; | |
1507 print RES $line,"\n"; next;} | |
1508 else {print FIL $line,"\n"; next; | |
1509 } | |
1510 } | |
1511 close FIL; | |
1512 close RES; | |
1513 | |
1514 } | |
1515 | |
1516 sub largest_microsat{ | |
1517 my $counter = 0; | |
1518 my($max) = shift(@_); | |
1519 foreach my $temp (@_) { | |
1520 #print "finding largest array: $maxcounter \n"; | |
1521 if($temp > $max){ | |
1522 $max = $temp; | |
1523 } | |
1524 } | |
1525 return($max); | |
1526 } | |
1527 | |
1528 #xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx | |
1529 | |
1530 | |
1531 #xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx | |
1532 sub multiSpecies_compound_microsat_analyzer{ | |
1533 ####### PARAMETER ######## | |
1534 ########################## | |
1535 | |
1536 my $input1 = $_[0]; ###### the *_sput_op4_ii file | |
1537 my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" | |
1538 my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format | |
1539 my $output2 = $_[3]; ###### the pure compound microsatellites | |
1540 my $org = $_[4]; | |
1541 my $no_of_species = $_[5]; | |
1542 # print "IN multiSpecies_compound_microsat_analyzer: $input1\n $input2\n $output1\n $output2\n $org\n $no_of_species\n"; | |
1543 $infocord = 2 + (4*$no_of_species) - 1; | |
1544 $typecord = 2 + (4*$no_of_species) + 1 - 1; | |
1545 $startcord = 2 + (4*$no_of_species) + 2 - 1; | |
1546 $strandcord = 2 + (4*$no_of_species) + 3 - 1; | |
1547 $endcord = 2 + (4*$no_of_species) + 4 - 1; | |
1548 $microsatcord = 2 + (4*$no_of_species) + 5 - 1; | |
1549 $motifcord = 2 + (4*$no_of_species) + 6 - 1; | |
1550 | |
1551 open(IN,"<$input1") or die "Cannot open file $input1 $!"; | |
1552 open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; | |
1553 | |
1554 open(OUT,">$output1") or die "Cannot open file $output1 $!"; | |
1555 open(OUT2,">$output2") or die "Cannot open file $output2 $!"; | |
1556 | |
1557 | |
1558 # print "opened files \n"; | |
1559 my %micros = (); | |
1560 my $keycounter=0; | |
1561 my $linecounter=0; | |
1562 while (my $line = <IN>){ | |
1563 $linecounter++; | |
1564 if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { | |
1565 my $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12); | |
1566 push (@{$micros{$key}},$line); | |
1567 $keycounter++; | |
1568 } | |
1569 else{ | |
1570 # print "no key\n"; | |
1571 } | |
1572 } | |
1573 close IN; | |
1574 my @deletedlines = (); | |
1575 # print "done hash . linecounter=$linecounter, keycounter=$keycounter\n"; | |
1576 #--------------------------------------------------------------------------------------------------- | |
1577 # NOW READING THE SEQUENCE FILE | |
1578 my $keyfound=0; | |
1579 my $keyexists=0; | |
1580 my $inter=0; | |
1581 my $pure=0; | |
1582 | |
1583 while(my $sine = <SEQ>){ | |
1584 my %microstart=(); | |
1585 my %microend=(); | |
1586 my @sields = split(/\t/,$sine); | |
1587 my $key = 0; | |
1588 if ($sine =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { | |
1589 $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12); | |
1590 # print $sine; | |
1591 # print $key; | |
1592 $keyfound++; | |
1593 } | |
1594 else{ | |
1595 | |
1596 } | |
1597 #<STDIN> if !defined $key; | |
1598 | |
1599 if (exists $micros{$key}){ | |
1600 $keyexists++; | |
1601 my @microstring = @{$micros{$key}}; | |
1602 | |
1603 my @filteredmicrostring; | |
1604 | |
1605 foreach my $line (@microstring){ | |
1606 chomp $line; | |
1607 my $copy_line = $line; | |
1608 my @fields = split(/\t/,$line); | |
1609 my $start = $fields[$startcord]; | |
1610 my $end = $fields[$endcord]; | |
1611 # FOR COMPOUND MICROSATELLITES | |
1612 if ($fields[$typecord] eq "compound"){ | |
1613 $line = compound_microsat_analyser($line); | |
1614 if ($line eq "NULL") { | |
1615 print OUT2 "$copy_line\n"; | |
1616 $pure++; | |
1617 next; | |
1618 } | |
1619 else{ | |
1620 print OUT "$line\n"; | |
1621 $inter++; | |
1622 next; | |
1623 } | |
1624 } | |
1625 } | |
1626 | |
1627 } #if (exists $micros{$key}){ | |
1628 } | |
1629 close OUT; | |
1630 close OUT2; | |
1631 # print "keyfound=$keyfound, keyexists=$keyexists, pure=$pure, inter=$inter\n"; | |
1632 } | |
1633 | |
1634 sub compound_microsat_analyser{ | |
1635 my $line = $_[0]; | |
1636 my @fields = split(/\t/,$line); | |
1637 my $motifline = $fields[$motifcord]; | |
1638 my $microsat = $fields[$microsatcord]; | |
1639 $motifline =~ s/^\[|\]$//g; | |
1640 $microsat =~ s/^\[|\]$//g; | |
1641 $microsat =~ s/-//g; | |
1642 my @interruptions = (); | |
1643 my @motields = split(/\]\[/,$motifline); | |
1644 my @microields = split(/\][a-zA-Z|-]*\[/,$microsat); | |
1645 my @inields = split(/[.*]/,$microsat); | |
1646 shift @inields; | |
1647 my @motifcount = scalar(@motields); | |
1648 my $prevmotif = $motields[0]; | |
1649 my $prevmicro = $microields[0]; | |
1650 my $prevphase = substr($microields[0],-(length($motields[0])),length($motields[0])); | |
1651 my $localflag = 'down'; | |
1652 my @infoarray = (); | |
1653 | |
1654 for my $l (1 ... (scalar(@motields)-1)){ | |
1655 my $probe = $prevmotif.$prevmotif; | |
1656 if (length $prevmotif != length $motields[$l]) {$localflag = "up"; last;} | |
1657 | |
1658 if ($probe =~ /$motields[$l]/i){ | |
1659 my $curr_endphase = substr($microields[$l],-length($motields[$l]),length($motields[$l])); | |
1660 my $curr_startphase = substr($microields[$l],0,length($motields[$l])); | |
1661 if ($curr_startphase =~ /$prevphase/i) { | |
1662 $infoarray[$l-1] = "insertion"; | |
1663 } | |
1664 else { | |
1665 $infoarray[$l-1] = "indel/substitution"; | |
1666 } | |
1667 | |
1668 $prevmotif = $motields[$l]; $prevmicro = $microields[$l]; $prevphase = $curr_endphase; | |
1669 next; | |
1670 } | |
1671 else {$localflag = "up"; last;} | |
1672 } | |
1673 if ($localflag eq 'up') {return "NULL";} | |
1674 | |
1675 if (length($prevmotif) == 1) {$fields[$typecord] = "mononucleotide";} | |
1676 if (length($prevmotif) == 2) {$fields[$typecord] = "dinucleotide";} | |
1677 if (length($prevmotif) == 3) {$fields[$typecord] = "trinucleotide";} | |
1678 if (length($prevmotif) == 4) {$fields[$typecord] = "tetranucleotide";} | |
1679 if (length($prevmotif) == 5) {$fields[$typecord] = "pentanucleotide";} | |
1680 | |
1681 @microields = split(/[\[|\]]/,$microsat); | |
1682 my @microsats = (); | |
1683 my @positions = (); | |
1684 my $lengthtracker = 0; | |
1685 | |
1686 for my $i (0 ... (scalar(@microields ) - 1)){ | |
1687 if ($i%2 == 0){ | |
1688 push(@microsats,$microields[$i]); | |
1689 $lengthtracker = $lengthtracker + length($microields[$i]); | |
1690 | |
1691 } | |
1692 else{ | |
1693 push(@interruptions,$microields[$i]); | |
1694 push(@positions, $lengthtracker+1); | |
1695 $lengthtracker = $lengthtracker + length($microields[$i]); | |
1696 } | |
1697 | |
1698 } | |
1699 my $returnline = join("\t",(join("\t",@fields),join(",",(@infoarray)),join(",",(@interruptions)),join(",",(@positions)),scalar(@interruptions))); | |
1700 return($returnline); | |
1701 } | |
1702 | |
1703 #xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx | |
1704 | |
1705 | |
1706 #xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx | |
1707 | |
1708 sub multiSpecies_compoundClarifyer{ | |
1709 # print "IN multiSpecies_compoundClarifyer: @_\n"; | |
1710 my $input1 = $_[0]; ###### the *_sput_compound | |
1711 my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" | |
1712 my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format | |
1713 my $output2 = $_[3]; ###### compound file | |
1714 my $org = $_[4]; | |
1715 my $no_of_species = $_[5]; | |
1716 @thresholds = "0"; | |
1717 push(@thresholds, split(/_/,$_[6])); | |
1718 | |
1719 | |
1720 $infocord = 2 + (4*$no_of_species) - 1; | |
1721 $typecord = 2 + (4*$no_of_species) + 1 - 1; | |
1722 $startcord = 2 + (4*$no_of_species) + 2 - 1; | |
1723 $strandcord = 2 + (4*$no_of_species) + 3 - 1; | |
1724 $endcord = 2 + (4*$no_of_species) + 4 - 1; | |
1725 $microsatcord = 2 + (4*$no_of_species) + 5 - 1; | |
1726 $motifcord = 2 + (4*$no_of_species) + 6 - 1; | |
1727 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; | |
1728 | |
1729 $interr_poscord = $motifcord + 3; | |
1730 $no_of_interruptionscord = $motifcord + 4; | |
1731 $interrcord = $motifcord + 2; | |
1732 $interrtypecord = $motifcord + 1; | |
1733 | |
1734 | |
1735 open(IN,"<$input1") or die "Cannot open file $input1 $!"; | |
1736 open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; | |
1737 | |
1738 open(INT,">$output1") or die "Cannot open file $output2 $!"; | |
1739 open(COMP,">$output2") or die "Cannot open file $output2 $!"; | |
1740 #open(CH,">changed") or die "Cannot open file changed $!"; | |
1741 | |
1742 # print "opened files \n"; | |
1743 my $linecounter = 0; | |
1744 my $microcounter = 0; | |
1745 | |
1746 my %micros = (); | |
1747 while (my $line = <IN>){ | |
1748 # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; | |
1749 $linecounter++; | |
1750 if ($line =~ /($focalspec)\s+([0-9a-zA-Z_\-]+)\s+([0-9]+)\s+([0-9]+)/ ) { | |
1751 my $key = join("\t",$1, $2, $3, $4); | |
1752 # print $key, "#-#-#-#-#-#-#-#\n"; | |
1753 # print "key = $key\n"; | |
1754 push (@{$micros{$key}},$line); | |
1755 $microcounter++; | |
1756 } | |
1757 else {#print $line," key not made\n"; <STDIN>; | |
1758 } | |
1759 } | |
1760 # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; | |
1761 close IN; | |
1762 my @deletedlines = (); | |
1763 # print "done hash \n"; | |
1764 $linecounter = 0; | |
1765 #--------------------------------------------------------------------------------------------------- | |
1766 # NOW READING THE SEQUENCE FILE | |
1767 my @microsat_types = qw(_ mononucleotide dinucleotide trinucleotide tetranucleotide); | |
1768 $printer = 0; | |
1769 | |
1770 while(my $sine = <SEQ>){ | |
1771 my %microstart=(); | |
1772 my %microend=(); | |
1773 my @sields = split(/\t/,$sine); | |
1774 my $key = (); | |
1775 | |
1776 # print "sine = $sine. focalspec = $focalspec \n"; #<STDIN>; | |
1777 | |
1778 if ($sine =~ /($focalspec)\s+([0-9a-zA-Z_\-]+)\s+([0-9]+)\s+([0-9]+)/ ) { | |
1779 | |
1780 # if ($sine =~ /([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s/ ) { | |
1781 $key = join("\t",$1, $2, $3, $4); | |
1782 # print "key = $key\n"; | |
1783 } | |
1784 else{ | |
1785 # print "no key in $sine\nfor pattern ([a-z0-9A-Z]+) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) / \n"; | |
1786 } | |
1787 | |
1788 if (exists $micros{$key}){ | |
1789 my @microstring = @{$micros{$key}}; | |
1790 delete $micros{$key}; | |
1791 | |
1792 foreach my $line (@microstring){ | |
1793 # print "#---------#---------#---------#---------#---------#---------#---------#---------\n" if $printer == 1; | |
1794 # print "microsat = $line" if $printer == 1; | |
1795 $linecounter++; | |
1796 my $copy_line = $line; | |
1797 my @mields = split(/\t/,$line); | |
1798 my @fields = @mields; | |
1799 my $start = $fields[$startcord]; | |
1800 my $end = $fields[$endcord]; | |
1801 my $microsat = $fields[$microsatcord]; | |
1802 my $motifline = $fields[$motifcord]; | |
1803 my $microsatcopy = $microsat; | |
1804 my $positioner = $microsat; | |
1805 $positioner =~ s/[a-zA-Z|-]/_/g; | |
1806 $microsatcopy =~ s/^\[|\]$//gs; | |
1807 chomp $microsatcopy; | |
1808 my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy); | |
1809 my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat); | |
1810 my $absolutstart = 1; my $absolutend = $absolutstart + ($end-$start); | |
1811 # print "absolut: start = $absolutstart, end = $absolutend\n" if $printer == 1; | |
1812 shift @inields; | |
1813 #print "inields =@inields<\n"; | |
1814 $motifline =~ s/^\[|\]$//gs; | |
1815 chomp $motifline; | |
1816 #print "microsat = $microsat, its copy = $microsatcopy motifline = $motifline<\n"; | |
1817 my @motields = split(/\]\[/,$motifline); | |
1818 my $seq = $microsatcopy; | |
1819 $seq =~ s/\[|\]//g; | |
1820 my $seqlen = length($seq); | |
1821 $seq = " ".$seq; | |
1822 | |
1823 my $longestmotif_no = longest_array_element(@motields); | |
1824 my $shortestmotif_no = shortest_array_element(@motields); | |
1825 #print "shortest motif = $motields[$shortestmotif_no], longest motif = $motields[$longestmotif_no] \n"; | |
1826 | |
1827 my $search = $motields[$longestmotif_no].$motields[$longestmotif_no]; | |
1828 if ((length($motields[$longestmotif_no]) == length($motields[$shortestmotif_no])) && ($search !~ /$motields[$shortestmotif_no]/) ){ | |
1829 print COMP $line; | |
1830 next; | |
1831 } | |
1832 | |
1833 my @shortestmotif_nos = (); | |
1834 for my $m (0 ... $#motields){ | |
1835 push(@shortestmotif_nos, $m) if (length($motields[$m]) == length($motields[$shortestmotif_no]) ); | |
1836 } | |
1837 ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------ | |
1838 my $newleft =(); | |
1839 my $leftstopper = 0; my $rightstopper = 0; | |
1840 foreach my $shortmotif_no (@shortestmotif_nos){ | |
1841 next if $shortmotif_no == 0; | |
1842 my $last_left = $shortmotif_no; #$#motields; | |
1843 my $last_hitter = 0; | |
1844 for (my $i =($shortmotif_no-1); $i>=0; $i--){ | |
1845 my $search = $motields[$shortmotif_no]; | |
1846 if (length($motields[$shortmotif_no]) == 1){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;} | |
1847 if( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){ | |
1848 $last_hitter = 1; | |
1849 $last_left = $i+1; last; | |
1850 } | |
1851 my $probe = $motields[$i]; | |
1852 if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];} | |
1853 | |
1854 if ($probe !~ /$search/){ | |
1855 $last_hitter = 1; | |
1856 $last_left = $i+1; | |
1857 # print "hit the last match: before $microields[$i]..last left = $last_left.. exiting.\n"; | |
1858 last; | |
1859 } | |
1860 $last_left--;$last_hitter = 1; | |
1861 # print "passed tests, last left = $last_left\n"; | |
1862 } | |
1863 # print "comparing whether $last_left < $shortmotif_no, lasthit = $last_hitter\n"; | |
1864 if (($last_left) < $shortmotif_no && $last_hitter == 1) {$leftstopper=0; last;} | |
1865 else {$leftstopper = 1; | |
1866 # print "leftstopper = 1\n"; | |
1867 } | |
1868 } | |
1869 | |
1870 ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------ | |
1871 my $newright =(); | |
1872 foreach my $shortmotif_no (@shortestmotif_nos){ | |
1873 next if $shortmotif_no == $#motields; | |
1874 my $last_right = $shortmotif_no;# -1; | |
1875 for my $i ($shortmotif_no+1 ... $#motields){ | |
1876 my $search = $motields[$shortmotif_no]; | |
1877 if (length($motields[$shortmotif_no]) == 1 ){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;} | |
1878 if ( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){ | |
1879 $last_right = $i-1; last; | |
1880 } | |
1881 my $probe = $motields[$i]; | |
1882 if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];} | |
1883 if ( $probe !~ /$search/){ | |
1884 $last_right = $i-1; last; | |
1885 } | |
1886 $last_right++; | |
1887 } | |
1888 if (($last_right) > $shortmotif_no) {$rightstopper=0; last;# print "rightstopper = 0\n"; | |
1889 } | |
1890 else {$rightstopper = 1; | |
1891 } | |
1892 } | |
1893 | |
1894 | |
1895 if ($rightstopper == 1 && $leftstopper == 1){ | |
1896 print COMP $line; | |
1897 # print "rightstopper == 1 && leftstopper == 1\n" if $printer == 1; | |
1898 next; | |
1899 } | |
1900 | |
1901 # print "pased initial testing phase \n" if $printer == 1; | |
1902 my @outputs = (); | |
1903 my @orig_starts = (); | |
1904 my @orig_ends = (); | |
1905 for my $mic (0 ... $#microields){ | |
1906 my $miclen = length($microields[$mic]); | |
1907 my $microleftlen = 0; | |
1908 #print "\nmic = $mic\n"; | |
1909 if($mic > 0){ | |
1910 for my $submin (0 ... $mic-1){ | |
1911 my $interval = (); | |
1912 if (!exists $inields[$submin]) {$interval = "";} | |
1913 else {$interval = $inields[$submin];} | |
1914 #print "inield =$interval< and microield =$microields[$submin]<\n "; | |
1915 $microleftlen = $microleftlen + length($microields[$submin]) + length($interval); | |
1916 } | |
1917 } | |
1918 push(@orig_starts,($microleftlen+1)); | |
1919 push(@orig_ends, ($microleftlen+1 + $miclen -1)); | |
1920 } | |
1921 | |
1922 ############# F I N A L L Y S T U D Y I N G S E Q U E N C E S #########@@@@#########@@@@#########@@@@#########@@@@#########@@@@ | |
1923 | |
1924 | |
1925 for my $mic (0 ... $#microields){ | |
1926 my $miclen = length($microields[$mic]); | |
1927 my $microleftlen = 0; | |
1928 if($mic > 0){ | |
1929 for my $submin (0 ... $mic-1){ | |
1930 # if(!exists $inields[$submin]) {$inields[$submin] = "";} | |
1931 my $interval = (); | |
1932 if (!exists $inields[$submin]) {$interval = "";} | |
1933 else {$interval = $inields[$submin];} | |
1934 #print "inield =$interval< and microield =$microields[$submin]<\n "; | |
1935 $microleftlen = $microleftlen + length($microields[$submin]) + length($interval); | |
1936 } | |
1937 } | |
1938 $fields[$startcord] = $microleftlen+1; | |
1939 $fields[$endcord] = $fields[$startcord] + $miclen -1; | |
1940 $fields[$typecord] = $microsat_types[length($motields[$mic])]; | |
1941 $fields[$microsatcord] = $microields[$mic]; | |
1942 $fields[$motifcord] = $motields[$mic]; | |
1943 my $templine = join("\t", (@fields[0 .. $motifcord]) ); | |
1944 my $orig_templine = join("\t", (@fields[0 .. $motifcord]) ); | |
1945 my $newline; | |
1946 my $lefter = 1; my $righter = 1; | |
1947 if ( $fields[$startcord] < 2){$lefter = 0;} | |
1948 if ($fields[$endcord] == $seqlen){$righter = 0;} | |
1949 | |
1950 while($lefter == 1){ | |
1951 $newline = left_extender($templine, $seq,$org); | |
1952 # print "returned line from left extender= $newline \n" if $printer == 1; | |
1953 if ($newline eq $templine){$templine = $newline; last;} | |
1954 else {$templine = $newline;} | |
1955 | |
1956 if (left_extention_permission_giver($templine) eq "no") {last;} | |
1957 } | |
1958 while($righter == 1){ | |
1959 $newline = right_extender($templine, $seq,$org); | |
1960 # print "returned line from right extender= $newline \n" if $printer == 1; | |
1961 if ($newline eq $templine){$templine = $newline; last;} | |
1962 else {$templine = $newline;} | |
1963 if (right_extention_permission_giver($templine) eq "no") {last;} | |
1964 } | |
1965 my @tempfields = split(/\t/,$templine); | |
1966 $tempfields[$microsatcord] =~ s/\]|\[//g; | |
1967 $tempfields[$motifcord] =~ s/^\[|\]$//gs; | |
1968 my @tempmotields = split(/\]\[/,$tempfields[$motifcord]); | |
1969 | |
1970 if (scalar(@tempmotields) == 1 && $templine eq $orig_templine) { | |
1971 # print "scalar ( tempmotields) = 1\n" if $printer == 1; | |
1972 next; | |
1973 } | |
1974 my $prevmotif = shift(@tempmotields); | |
1975 my $stopper = 0; | |
1976 | |
1977 foreach my $tempmot (@tempmotields){ | |
1978 if (length($tempmot) != length($prevmotif)) {$stopper = 1; last;} | |
1979 my $search = $prevmotif.$prevmotif; | |
1980 if ($search !~ /$tempmot/) {$stopper = 1; last;} | |
1981 $prevmotif = $tempmot; | |
1982 } | |
1983 if ( $stopper == 1) { | |
1984 # print "length tempmot != length prevmotif\n" if $printer == 1; | |
1985 next; | |
1986 } | |
1987 my $lastend = 0; | |
1988 #---------------------------------------------------------- | |
1989 my $left_captured = (); my $right_captured = (); | |
1990 my $left_bp = (); my $right_bp = (); | |
1991 # print "new startcord = $tempfields[$startcord] , new endcord = $tempfields[$endcord].. orig strts = @orig_starts and orig ends = @orig_ends\n"; | |
1992 for my $o (0 ... $#orig_starts){ | |
1993 # print "we are talking abut tempstart:$tempfields[$startcord] >= origstart:$lastend && tempstart:$tempfields[$startcord] <= origend: $orig_ends[$o] \n" if $printer == 1; | |
1994 # print "we are talking abut tempend:$tempfields[$endcord] >= origstart:$lastend && tempstart:$tempfields[$endcord] >= origend: $orig_ends[$o] \n" if $printer == 1; | |
1995 | |
1996 if (($tempfields[$startcord] > $lastend) && ($tempfields[$startcord] <= $orig_ends[$o])){ # && ($tempfields[$startcord] != $fields[$startcord]) | |
1997 # print "motif captured on left is $microields[$o] from $microsat\n" if $printer == 1; | |
1998 $left_captured = $o; | |
1999 $left_bp = $orig_ends[$o] - $tempfields[$startcord] + 1; | |
2000 } | |
2001 elsif ($tempfields[$endcord] > $lastend && $tempfields[$endcord] <= $orig_ends[$o]){ #&& $tempfields[$endcord] != $fields[$endcord]) | |
2002 # print "motif captured on right is $microields[$o] from $microsat\n" if $printer == 1; | |
2003 $right_captured = $o; | |
2004 $right_bp = $tempfields[$endcord] - $orig_starts[$o] + 1; | |
2005 } | |
2006 $lastend = $orig_ends[$o] | |
2007 } | |
2008 # print "leftcaptured = $left_captured, right = $right_captured\n" if $printer==1; | |
2009 my $leftmotif = (); my $left_trashed = (); | |
2010 if ($tempfields[$startcord] != $fields[$startcord]) { | |
2011 $leftmotif = $motields[$left_captured]; | |
2012 # print "$left_captured in @microields: $motields[$left_captured]\n" if $printer == 1; | |
2013 if ( $left_captured !~ /[0-9]+/) {#print $line,"\n", $templine,"\n"; | |
2014 } | |
2015 $left_trashed = length($microields[$left_captured]) - $left_bp; | |
2016 } | |
2017 my $rightmotif = (); my $right_trashed = (); | |
2018 if ($tempfields[$endcord] != $fields[$endcord]) { | |
2019 # print "$right_captured in @microields: $motields[$right_captured]\n" if $printer == 1; | |
2020 $rightmotif = $motields[$right_captured]; | |
2021 $right_trashed = length($microields[$right_captured]) - $right_bp; | |
2022 } | |
2023 | |
2024 ########## P A R A M S #####################@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@ | |
2025 $stopper = 0; | |
2026 my $deletioner = 0; | |
2027 #if($tempfields[$startcord] != $fields[$startcord]){ | |
2028 # print "enter left: tempfields,startcord : $tempfields[$startcord] != $absolutstart && left_captured: $left_captured != 0 \n" if $printer==1; | |
2029 if ($left_captured != 0){ | |
2030 # print "at line 370, going: 0 ... $left_captured-1 \n" if $printer == 1; | |
2031 for my $e (0 ... $left_captured-1){ | |
2032 if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e]) )){ | |
2033 # print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1; | |
2034 $deletioner++; last; | |
2035 } | |
2036 if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){ | |
2037 # print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1; | |
2038 $deletioner++; last; | |
2039 } | |
2040 if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){ | |
2041 # print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1; | |
2042 $deletioner++; last; | |
2043 } | |
2044 } | |
2045 } | |
2046 #} | |
2047 # print "after left search, deletioner = $deletioner\n" if $printer == 1; | |
2048 if ($deletioner >= 1) { | |
2049 # print "deletioner = $deletioner\n" if $printer == 1; | |
2050 next; | |
2051 } | |
2052 | |
2053 $deletioner = 0; | |
2054 | |
2055 #if($tempfields[$endcord] != $fields[$endcord]){ | |
2056 # print "if tempfields endcord: $tempfields[$endcord] != absolutend: $absolutend\n and $right_captured != $#microields\n" if $printer==1; | |
2057 if ($right_captured != $#microields){ | |
2058 # print "at line 394, going: $right_captured+1 ... $#microields \n" if $printer == 1; | |
2059 for my $e ($right_captured+1 ... $#microields){ | |
2060 if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e])) ){ | |
2061 # print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1; | |
2062 $deletioner++; last; | |
2063 } | |
2064 if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){ | |
2065 # print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1; | |
2066 $deletioner++; last; | |
2067 } | |
2068 if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){ | |
2069 # print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1; | |
2070 $deletioner++; last; | |
2071 } | |
2072 } | |
2073 } | |
2074 #} | |
2075 # print "deletioner = $deletioner\n" if $printer == 1; | |
2076 if ($deletioner >= 1) { | |
2077 next; | |
2078 } | |
2079 my $leftMotifs_notCaptured = (); | |
2080 my $rightMotifs_notCaptured = (); | |
2081 | |
2082 if ($tempfields[$startcord] != $fields[$startcord] ){ | |
2083 #print "in left params: (length($leftmotif) == 1 && $tempfields[$startcord] != $fields[$startcord]) ... and .... $left_trashed > (1.5* length($leftmotif]) && ($tempfields[$startcord] != $fields[$startcord])\n"; | |
2084 if (length($leftmotif) == 1 && $left_trashed > 3){ | |
2085 # print "invaded left motif is long mononucleotide" if $printer == 1; | |
2086 next; | |
2087 | |
2088 } | |
2089 elsif ((length($leftmotif) != 1 && $left_trashed > ( thrashallow($leftmotif)) && ($tempfields[$startcord] != $fields[$startcord]) ) ){ | |
2090 # print "invaded left motif too long" if $printer == 1; | |
2091 next; | |
2092 } | |
2093 } | |
2094 if ($tempfields[$endcord] != $fields[$endcord] ){ | |
2095 #print "in right params: after $tempfields[$endcord] != $fields[$endcord] ..... (length($rightmotif)==1 && $tempfields[$endcord] != $fields[$endcord]) ... and ... $right_trashed > (1.5* length($rightmotif))\n"; | |
2096 if (length($rightmotif)==1 && $right_trashed){ | |
2097 # print "invaded right motif is long mononucleotide" if $printer == 1; | |
2098 next; | |
2099 | |
2100 } | |
2101 elsif (length($rightmotif) !=1 && ($right_trashed > ( thrashallow($rightmotif)) && $tempfields[$endcord] != $fields[$endcord])){ | |
2102 # print "invaded right motif too long" if $printer == 1; | |
2103 next; | |
2104 | |
2105 } | |
2106 } | |
2107 push @outputs, $templine; | |
2108 } | |
2109 if (scalar(@outputs) == 0){ print COMP $line; next;} | |
2110 # print "outputs are:", join("\n",@outputs),"\n"; | |
2111 if (scalar(@outputs) == 1){ | |
2112 my @oields = split(/\t/,$outputs[0]); | |
2113 my $start = $oields[$startcord]+$mields[$startcord]-1; | |
2114 my $end = $start+($oields[$endcord]-$oields[$startcord]); | |
2115 $oields[$startcord] = $start; $oields[$endcord] = $end; | |
2116 print INT join("\t",@oields), "\n"; | |
2117 # print CH $line,; | |
2118 } | |
2119 if (scalar(@outputs) > 1){ | |
2120 my $motif_min = 10; | |
2121 my $chosen_one = $outputs[0]; | |
2122 foreach my $micro (@outputs){ | |
2123 my @oields = split(/\t/,$micro); | |
2124 my $tempmotif = $oields[$motifcord]; | |
2125 $tempmotif =~ s/^\[|\]$//gs; | |
2126 my @omots = split(/\]\[/, $tempmotif); | |
2127 # print "motif_min = $motif_min, current motif = $tempmotif\n"; | |
2128 my $start = $oields[$startcord]+$mields[$startcord]-1; | |
2129 my $end = $start+($oields[$endcord]-$oields[$startcord]); | |
2130 $oields[$startcord] = $start; $oields[$endcord] = $end; | |
2131 if(length($omots[0]) < $motif_min) { | |
2132 $chosen_one = join("\t",@oields); | |
2133 $motif_min = length($omots[0]); | |
2134 } | |
2135 } | |
2136 print INT $chosen_one, "\n"; | |
2137 # print "chosen one is ".$chosen_one, "\n"; | |
2138 # print CH $line; | |
2139 | |
2140 | |
2141 } | |
2142 | |
2143 } | |
2144 | |
2145 } #if (exists $micros{$key}){ | |
2146 else{ | |
2147 } | |
2148 } | |
2149 close INT; | |
2150 close COMP; | |
2151 } | |
2152 sub left_extender{ | |
2153 #print "left extender\n"; | |
2154 my ($line, $seq, $org) = @_; | |
2155 # print "in left extender... line passed = $line and sequence is $seq\n"; | |
2156 chomp $line; | |
2157 my @fields = split(/\t/,$line); | |
2158 my $rstart = $fields[$startcord]; | |
2159 my $microsat = $fields[$microsatcord]; | |
2160 $microsat =~ s/\[|\]//g; | |
2161 my $rend = $rstart + length($microsat)-1; | |
2162 $microsat =~ s/-//g; | |
2163 my $motif = $fields[$motifcord]; | |
2164 my $firstmotif = (); | |
2165 | |
2166 if ($motif =~ /^\[/){ | |
2167 $motif =~ s/^\[//g; | |
2168 $motif =~ /([a-zA-Z]+)\].*/; | |
2169 $firstmotif = $1; | |
2170 } | |
2171 else {$firstmotif = $motif;} | |
2172 | |
2173 #print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n"; | |
2174 my $leftphase = substr($microsat, 0,length($firstmotif)); | |
2175 my $phaser = $leftphase.$leftphase; | |
2176 my @phase = split(/\s*/,$leftphase); | |
2177 my @phases; | |
2178 my @copy_phases = @phases; | |
2179 my $crawler=0; | |
2180 for (0 ... (length($leftphase)-1)){ | |
2181 push(@phases, substr($phaser, $crawler, length($leftphase))); | |
2182 $crawler++; | |
2183 } | |
2184 | |
2185 my $start = $rstart; | |
2186 my $end = $rend; | |
2187 | |
2188 my $leftseq = substr($seq, 0, $start); | |
2189 # print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n"; | |
2190 my @extentions = (); | |
2191 my @trappeds = (); | |
2192 my @intervalposs = (); | |
2193 my @trappedposs = (); | |
2194 my @trappedphases = (); | |
2195 my @intervals = (); | |
2196 my $firstmotif_length = length($firstmotif); | |
2197 foreach my $phase (@phases){ | |
2198 # print "left phase\t",substr($leftseq, -10),"\t$phase\n"; | |
2199 # print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n"; | |
2200 if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){ | |
2201 # print "in left pattern\n"; | |
2202 my $trapped = $1; | |
2203 my $trappedpos = length($leftseq)-length($trapped); | |
2204 my $interval = $3; | |
2205 my $intervalpos = index($trapped, $interval) + 1; | |
2206 # print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n"; | |
2207 | |
2208 my $extention = substr($trapped, 0, length($trapped)-length($interval)); | |
2209 my $leftpeep = substr($seq, 0, ($start-length($trapped))); | |
2210 my @passed_overhangs; | |
2211 | |
2212 for my $i (1 ... length($phase)-1){ | |
2213 my $overhang = substr($phase, -length($phase)+$i); | |
2214 # print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n"; | |
2215 #TEMPORARY... BETTER METHOD NEEDED | |
2216 $leftpeep =~ s/-//g; | |
2217 if ($leftpeep =~ /$overhang$/i){ | |
2218 push(@passed_overhangs,$overhang); | |
2219 # print "l overhang\n"; | |
2220 } | |
2221 } | |
2222 | |
2223 if(scalar(@passed_overhangs)>0){ | |
2224 my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)]; | |
2225 $extention = $overhang.$extention; | |
2226 $trapped = $overhang.$trapped; | |
2227 #print "trapped extended to $trapped \n"; | |
2228 $trappedpos = length($leftseq)-length($trapped); | |
2229 } | |
2230 | |
2231 push(@extentions,$extention); | |
2232 # print "extentions = @extentions \n"; | |
2233 | |
2234 push(@trappeds,$trapped ); | |
2235 push(@intervalposs,length($extention)+1); | |
2236 push(@trappedposs, $trappedpos); | |
2237 # print "trappeds = @trappeds\n"; | |
2238 push(@trappedphases, substr($extention,0,length($phase))); | |
2239 push(@intervals, $interval); | |
2240 } | |
2241 } | |
2242 if (scalar(@trappeds == 0)) {return $line;} | |
2243 | |
2244 my $nikaal = shortest_array_element(@intervals); | |
2245 | |
2246 if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} | |
2247 $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord]; | |
2248 ##print "new fields 9 = $fields[9]\n"; | |
2249 $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]); | |
2250 | |
2251 if($fields[$microsatcord] !~ /^\[/i){ | |
2252 $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; | |
2253 } | |
2254 | |
2255 $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord]; | |
2256 | |
2257 if (exists ($fields[$motifcord+1])){ | |
2258 $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1]; | |
2259 } | |
2260 else{$fields[$motifcord+1] = "indel/deletion";} | |
2261 ##print "new fields 14 = $fields[14]\n"; | |
2262 | |
2263 if (exists ($fields[$motifcord+2])){ | |
2264 $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2]; | |
2265 } | |
2266 else{$fields[$motifcord+2] = $intervals[$nikaal];} | |
2267 my @seventeen=(); | |
2268 if (exists ($fields[$motifcord+3])){ | |
2269 @seventeen = split(/,/,$fields[$motifcord+3]); | |
2270 # #print "scalarseventeen =@seventeen<-\n"; | |
2271 for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);} | |
2272 $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen); | |
2273 $fields[$motifcord+4] = $fields[$motifcord+4]+1; | |
2274 } | |
2275 | |
2276 else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1} | |
2277 | |
2278 ##print "new fields 16 = $fields[16]\n"; | |
2279 ##print "new fields 17 = $fields[17]\n"; | |
2280 | |
2281 | |
2282 my $returnline = join("\t",@fields); | |
2283 my $pastline = $returnline; | |
2284 if ($fields[$microsatcord] =~ /\[/){ | |
2285 $returnline = multiSpecies_compoundClarifyer_merge($returnline); | |
2286 } | |
2287 return $returnline; | |
2288 } | |
2289 sub right_extender{ | |
2290 my ($line, $seq, $org) = @_; | |
2291 chomp $line; | |
2292 my @fields = split(/\t/,$line); | |
2293 my $rstart = $fields[$startcord]; | |
2294 my $microsat = $fields[$microsatcord]; | |
2295 $microsat =~ s/\[|\]//g; | |
2296 my $rend = $rstart + length($microsat)-1; | |
2297 $microsat =~ s/-//g; | |
2298 my $motif = $fields[$motifcord]; | |
2299 my $temp_lastmotif = (); | |
2300 | |
2301 if ($motif =~ /\]$/s){ | |
2302 $motif =~ s/\]$//sg; | |
2303 $motif =~ /.*\[([a-zA-Z]+)/; | |
2304 $temp_lastmotif = $1; | |
2305 } | |
2306 else {$temp_lastmotif = $motif;} | |
2307 my $lastmotif = substr($microsat,-length($temp_lastmotif)); | |
2308 ##print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n"; | |
2309 my $rightphase = substr($microsat, -length($lastmotif)); | |
2310 my $phaser = $rightphase.$rightphase; | |
2311 my @phase = split(/\s*/,$rightphase); | |
2312 my @phases; | |
2313 my @copy_phases = @phases; | |
2314 my $crawler=0; | |
2315 for (0 ... (length($rightphase)-1)){ | |
2316 push(@phases, substr($phaser, $crawler, length($rightphase))); | |
2317 $crawler++; | |
2318 } | |
2319 | |
2320 my $start = $rstart; | |
2321 my $end = $rend; | |
2322 | |
2323 my $rightseq = substr($seq, $end+1); | |
2324 my @extentions = (); | |
2325 my @trappeds = (); | |
2326 my @intervalposs = (); | |
2327 my @trappedposs = (); | |
2328 my @trappedphases = (); | |
2329 my @intervals = (); | |
2330 my $lastmotif_length = length($lastmotif); | |
2331 foreach my $phase (@phases){ | |
2332 if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){ | |
2333 my $trapped = $1; | |
2334 my $trappedpos = $end+1; | |
2335 my $interval = $2; | |
2336 my $intervalpos = index($trapped, $interval) + 1; | |
2337 | |
2338 my $extention = substr($trapped, length($interval)); | |
2339 my $rightpeep = substr($seq, ($end+length($trapped))+1); | |
2340 my @passed_overhangs = ""; | |
2341 | |
2342 #TEMPORARY... BETTER METHOD NEEDED | |
2343 $rightpeep =~ s/-//g; | |
2344 | |
2345 for my $i (1 ... length($phase)-1){ | |
2346 my $overhang = substr($phase,0, $i); | |
2347 # #print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n"; | |
2348 if ($rightpeep =~ /^$overhang/i){ | |
2349 push(@passed_overhangs, $overhang); | |
2350 # #print "r overhang\n"; | |
2351 } | |
2352 } | |
2353 if (scalar(@passed_overhangs) > 0){ | |
2354 my $overhang = @passed_overhangs[longest_array_element(@passed_overhangs)]; | |
2355 $extention = $extention.$overhang; | |
2356 $trapped = $trapped.$overhang; | |
2357 # #print "trapped extended to $trapped \n"; | |
2358 } | |
2359 | |
2360 push(@extentions,$extention); | |
2361 ##print "extentions = @extentions \n"; | |
2362 | |
2363 push(@trappeds,$trapped ); | |
2364 push(@intervalposs,$intervalpos); | |
2365 push(@trappedposs, $trappedpos); | |
2366 # #print "trappeds = @trappeds\n"; | |
2367 push(@trappedphases, substr($extention,0,length($phase))); | |
2368 push(@intervals, $interval); | |
2369 } | |
2370 } | |
2371 if (scalar(@trappeds == 0)) {return $line;} | |
2372 | |
2373 # my $nikaal = longest_array_element(@trappeds); | |
2374 my $nikaal = shortest_array_element(@intervals); | |
2375 | |
2376 # #print "longest element found = $nikaal \n"; | |
2377 | |
2378 if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} | |
2379 $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]"; | |
2380 ##print "new fields 9 = $fields[9]"; | |
2381 $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]); | |
2382 | |
2383 ##print "new fields 11 = $fields[11]\n"; | |
2384 | |
2385 if($fields[$microsatcord] !~ /^\[/i){ | |
2386 $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; | |
2387 } | |
2388 | |
2389 $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]"; | |
2390 ##print "new fields 12 = $fields[12]\n"; | |
2391 | |
2392 ##print "scalar of fields = ",scalar(@fields),"\n"; | |
2393 if (exists ($fields[$motifcord+1])){ | |
2394 # print " print fields = @fields.. scalar=", scalar(@fields),".. motifcord+1 = $motifcord + 1 \n " if !exists $fields[$motifcord+1]; | |
2395 # <STDIN> if !exists $fields[$motifcord+1]; | |
2396 $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion"; | |
2397 } | |
2398 else{$fields[$motifcord+1] = "indel/deletion";} | |
2399 ##print "new fields 14 = $fields[14]\n"; | |
2400 | |
2401 if (exists ($fields[$motifcord+2])){ | |
2402 $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal]; | |
2403 } | |
2404 else{$fields[$motifcord+2] = $intervals[$nikaal];} | |
2405 ##print "new fields 15 = $fields[15]\n"; | |
2406 | |
2407 my @seventeen=(); | |
2408 if (exists ($fields[$motifcord+3])){ | |
2409 ##print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n"; | |
2410 # print " print fields = @fields\n " if !exists $fields[$motifcord+3]; | |
2411 <STDIN> if !exists $fields[$motifcord+3]; | |
2412 my $currpos = length($microsat)+$intervalposs[$nikaal]; | |
2413 $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos; | |
2414 $fields[$motifcord+4] = $fields[$motifcord+4]+1; | |
2415 | |
2416 } | |
2417 | |
2418 else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1} | |
2419 | |
2420 ##print "new fields 16 = $fields[16]\n"; | |
2421 | |
2422 ##print "new fields 17 = $fields[17]\n"; | |
2423 my $returnline = join("\t",@fields); | |
2424 my $pastline = $returnline; | |
2425 if ($fields[$microsatcord] =~ /\[/){ | |
2426 $returnline = multiSpecies_compoundClarifyer_merge($returnline); | |
2427 } | |
2428 #print "finally right-extended line = ",$returnline,"\n"; | |
2429 return $returnline; | |
2430 } | |
2431 sub longest_array_element{ | |
2432 my $counter = 0; | |
2433 my($max) = shift(@_); | |
2434 my $maxcounter = 0; | |
2435 foreach my $temp (@_) { | |
2436 $counter++; | |
2437 #print "finding largest array: $maxcounter \n" if $prinkter == 1; | |
2438 if(length($temp) > length($max)){ | |
2439 $max = $temp; | |
2440 $maxcounter = $counter; | |
2441 } | |
2442 } | |
2443 return($maxcounter); | |
2444 } | |
2445 sub shortest_array_element{ | |
2446 my $counter = 0; | |
2447 my($min) = shift(@_); | |
2448 my $mincounter = 0; | |
2449 foreach my $temp (@_) { | |
2450 $counter++; | |
2451 #print "finding largest array: $mincounter \n" if $prinkter == 1; | |
2452 if(length($temp) < length($min)){ | |
2453 $min = $temp; | |
2454 $mincounter = $counter; | |
2455 } | |
2456 } | |
2457 return($mincounter); | |
2458 } | |
2459 | |
2460 | |
2461 sub left_extention_permission_giver{ | |
2462 my @fields = split(/\t/,$_[0]); | |
2463 my $microsat = $fields[$microsatcord]; | |
2464 $microsat =~ s/(^\[)|-//g; | |
2465 my $motif = $fields[$motifcord]; | |
2466 my $firstmotif = (); | |
2467 my $firststretch = (); | |
2468 my @stretches=(); | |
2469 if ($motif =~ /^\[/){ | |
2470 $motif =~ s/^\[//g; | |
2471 $motif =~ /([a-zA-Z]+)\].*/; | |
2472 $firstmotif = $1; | |
2473 @stretches = split(/\]/,$microsat); | |
2474 $firststretch = $stretches[0]; | |
2475 ##print "firststretch = $firststretch\n"; | |
2476 } | |
2477 else {$firstmotif = $motif;$firststretch = $microsat;} | |
2478 | |
2479 if (length($firststretch) < $thresholds[length($firstmotif)]){ | |
2480 return "no"; | |
2481 } | |
2482 else {return "yes";} | |
2483 | |
2484 } | |
2485 sub right_extention_permission_giver{ | |
2486 my @fields = split(/\t/,$_[0]); | |
2487 my $microsat = $fields[$microsatcord]; | |
2488 $microsat =~ s/-|(\]$)//sg; | |
2489 my $motif = $fields[$motifcord]; | |
2490 my $temp_lastmotif = (); | |
2491 my $laststretch = (); | |
2492 my @stretches=(); | |
2493 | |
2494 | |
2495 if ($motif =~ /\]/){ | |
2496 $motif =~ s/\]$//gs; | |
2497 $motif =~ /.*\[([a-zA-Z]+)$/; | |
2498 $temp_lastmotif = $1; | |
2499 @stretches = split(/\[/,$microsat); | |
2500 $laststretch = pop(@stretches); | |
2501 ##print "last stretch = $laststretch\n"; | |
2502 } | |
2503 else {$temp_lastmotif = $motif; $laststretch = $microsat;} | |
2504 | |
2505 if (length($laststretch) < $thresholds[length($temp_lastmotif)]){ | |
2506 return "no"; | |
2507 } | |
2508 else { return "yes";} | |
2509 | |
2510 | |
2511 } | |
2512 sub multiSpecies_compoundClarifyer_merge{ | |
2513 my $line = $_[0]; | |
2514 #print "sent for mering: $line \n"; | |
2515 my @mields = split(/\t/,$line); | |
2516 my @fields = @mields; | |
2517 my $microsat = $fields[$microsatcord]; | |
2518 my $motifline = $fields[$motifcord]; | |
2519 my $microsatcopy = $microsat; | |
2520 $microsatcopy =~ s/^\[|\]$//sg; | |
2521 my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy); | |
2522 my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat); | |
2523 shift @inields; | |
2524 #print "inields =@inields<\n"; | |
2525 $motifline =~ s/^\[|\]$//sg; | |
2526 my @motields = split(/\]\[/,$motifline); | |
2527 my @firstmotifs = (); | |
2528 my @lastmotifs = (); | |
2529 for my $i (0 ... $#microields){ | |
2530 $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i])); | |
2531 $lastmotifs[$i] = substr($microields[$i],-length($motields[$i])); | |
2532 } | |
2533 #print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n"; | |
2534 my @mergelist = (); | |
2535 my @inter_poses = split(/,/,$fields[$interr_poscord]); | |
2536 my $no_of_interruptions = $fields[$no_of_interruptionscord]; | |
2537 my @interruptions = split(/,/,$fields[$interrcord]); | |
2538 my @interrtypes = split(/,/,$fields[$interrtypecord]); | |
2539 my $stopper = 0; | |
2540 for my $i (0 ... $#motields-1){ | |
2541 #print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n"; | |
2542 if (($lastmotifs[$i] eq $firstmotifs[$i+1]) && !exists $inields[$i]){ | |
2543 $stopper = 1; | |
2544 push(@mergelist, ($i)."_".($i+1)); | |
2545 } | |
2546 } | |
2547 | |
2548 return $line if scalar(@mergelist) == 0; | |
2549 | |
2550 foreach my $merging (@mergelist){ | |
2551 my @sets = split(/_/, $merging); | |
2552 my @tempmicro = (); | |
2553 my @tempmot = (); | |
2554 for my $i (0 ... $sets[0]-1){ | |
2555 push(@tempmicro, "[".$microields[$i]."]"); | |
2556 push(@tempmicro, $inields[$i]); | |
2557 push(@tempmot, "[".$motields[$i]."]"); | |
2558 #print "adding pre-motifs number $i\n"; | |
2559 } | |
2560 my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]"; | |
2561 push (@tempmicro, $pusher); | |
2562 push(@tempmot, "[".$motields[$sets[0]]."]"); | |
2563 my $outcoming = -2; | |
2564 for my $i ($sets[1]+1 ... $#microields-1){ | |
2565 push(@tempmicro, "[".$microields[$i]."]"); | |
2566 push(@tempmicro, $inields[$i]); | |
2567 push(@tempmot, "[".$motields[$i]."]"); | |
2568 #print "adding post-motifs number $i\n"; | |
2569 $outcoming = $i; | |
2570 } | |
2571 if ($outcoming != -2){ | |
2572 #print "outcoming = $outcoming \n"; | |
2573 push(@tempmicro, "[".$microields[$outcoming+1 ]."]"); | |
2574 push(@tempmot,"[". $motields[$outcoming+1]."]"); | |
2575 } | |
2576 $fields[$microsatcord] = join("",@tempmicro); | |
2577 $fields[$motifcord] = join("",@tempmot); | |
2578 | |
2579 splice(@interrtypes, $sets[0], 1); | |
2580 $fields[$interrtypecord] = join(",",@interrtypes); | |
2581 splice(@interruptions, $sets[0], 1); | |
2582 $fields[$interrcord] = join(",",@interruptions); | |
2583 splice(@inter_poses, $sets[0], 1); | |
2584 $fields[$interr_poscord] = join(",",@inter_poses); | |
2585 $no_of_interruptions = $no_of_interruptions - 1; | |
2586 } | |
2587 | |
2588 if ($no_of_interruptions == 0){ | |
2589 $fields[$microsatcord] =~ s/^\[|\]$//sg; | |
2590 $fields[$motifcord] =~ s/^\[|\]$//sg; | |
2591 $line = join("\t", @fields[0 ... $motifcord]); | |
2592 } | |
2593 else{ | |
2594 $line = join("\t", @fields); | |
2595 } | |
2596 return $line; | |
2597 } | |
2598 | |
2599 sub thrashallow{ | |
2600 my $motif = $_[0]; | |
2601 return 4 if length($motif) == 2; | |
2602 return 6 if length($motif) == 3; | |
2603 return 8 if length($motif) == 4; | |
2604 | |
2605 } | |
2606 | |
2607 #xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx | |
2608 | |
2609 | |
2610 #xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx | |
2611 sub multispecies_filtering_compound_microsats{ | |
2612 my $unfiltered = $_[0]; | |
2613 my $filtered = $_[1]; | |
2614 my $residue = $_[2]; | |
2615 my $no_of_species = $_[5]; | |
2616 open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!"; | |
2617 open(FIL,">$filtered") or die "Cannot open file $filtered: $!"; | |
2618 open(RES,">$residue") or die "Cannot open file $residue: $!"; | |
2619 | |
2620 $infocord = 2 + (4*$no_of_species) - 1; | |
2621 $startcord = 2 + (4*$no_of_species) + 2 - 1; | |
2622 $strandcord = 2 + (4*$no_of_species) + 3 - 1; | |
2623 $endcord = 2 + (4*$no_of_species) + 4 - 1; | |
2624 $microsatcord = 2 + (4*$no_of_species) + 5 - 1; | |
2625 $motifcord = 2 + (4*$no_of_species) + 6 - 1; | |
2626 | |
2627 my @sub_thresholds = ("0"); | |
2628 push(@sub_thresholds, split(/_/,$_[3])); | |
2629 my @thresholds = ("0"); | |
2630 push(@thresholds, split(/_/,$_[4])); | |
2631 | |
2632 while (my $line = <UNF>) { | |
2633 if ($line !~ /compound/){ | |
2634 print FIL $line,"\n"; next; | |
2635 } | |
2636 chomp $line; | |
2637 my @fields = split(/\t/,$line); | |
2638 my $motifline = $fields[$motifcord]; | |
2639 $motifline =~ s/^\[|\]$//g; | |
2640 my @motifs = split(/\]\[/,$motifline); | |
2641 my $microsat = $fields[$microsatcord]; | |
2642 $microsat =~ s/^\[|\]$|-//g; | |
2643 my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat); | |
2644 | |
2645 my $stopper = 0; | |
2646 for my $i (0 ... $#motifs){ | |
2647 my @common = (); | |
2648 my $probe = $motifs[$i].$motifs[$i]; | |
2649 my $motif_size = length($motifs[$i]); | |
2650 | |
2651 for my $j (0 ... $#motifs){ | |
2652 next if length($motifs[$i]) != length($motifs[$j]); | |
2653 push(@common, length($microsats[$j])) if $probe =~ /$motifs[$j]/i; | |
2654 } | |
2655 | |
2656 if (largest_microsat(@common) < $sub_thresholds[$motif_size]) {$stopper = 1; last;} | |
2657 else {next;} | |
2658 } | |
2659 | |
2660 if ($stopper == 1){ | |
2661 print RES $line,"\n"; | |
2662 } | |
2663 else { print FIL $line,"\n"; } | |
2664 } | |
2665 close FIL; | |
2666 close RES; | |
2667 } | |
2668 | |
2669 #xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx | |
2670 | |
2671 | |
2672 #xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx | |
2673 | |
2674 sub chromosome_unrand_breaker{ | |
2675 # print "IN chromosome_unrand_breaker: @_\n "; | |
2676 my $input1 = $_[0]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" | |
2677 my $dir = $_[1]; ###### directory where subsets are put | |
2678 my $output2 = $_[2]; ###### list of subset files | |
2679 my $increment = $_[3]; | |
2680 my $info = $_[4]; | |
2681 my $chr = $_[5]; | |
2682 open(SEQ,"<$input1") or die "Cannot open file $input1 $!"; | |
2683 | |
2684 open(OUT,">$output2") or die "Cannot open file $output2 $!"; | |
2685 | |
2686 #--------------------------------------------------------------------------------------------------- | |
2687 # NOW READING THE SEQUENCE FILE | |
2688 | |
2689 my $seed = 0; | |
2690 my $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment); | |
2691 print OUT $subset,"\n"; | |
2692 open(SUB,">$subset"); | |
2693 | |
2694 while(my $sine = <SEQ>){ | |
2695 $seed++; | |
2696 print SUB $sine; | |
2697 | |
2698 if ($seed%$increment == 0 ){ | |
2699 close SUB; | |
2700 $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment); | |
2701 open(SUB,">$subset"); | |
2702 print SUB $sine; | |
2703 print OUT $subset,"\n"; | |
2704 # print $subset,"\n"; | |
2705 } | |
2706 } | |
2707 close OUT; | |
2708 close SUB; | |
2709 } | |
2710 #xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx | |
2711 | |
2712 | |
2713 #xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx | |
2714 sub multiSpecies_interruptedMicrosatHunter{ | |
2715 # print "IN multiSpecies_interruptedMicrosatHunter: @_\n"; | |
2716 my $input1 = $_[0]; ###### the *_sput_op4_ii file | |
2717 my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" | |
2718 my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format | |
2719 my $output2 = $_[3]; ###### uninterrupted microsatellite file | |
2720 my $org = $_[4]; | |
2721 my $no_of_species = $_[5]; | |
2722 | |
2723 my @thresholds = "0"; | |
2724 push(@thresholds, split(/_/,$_[6])); | |
2725 | |
2726 # print "thresholds = @thresholds \n"; | |
2727 $infocord = 2 + (4*$no_of_species) - 1; | |
2728 $typecord = 2 + (4*$no_of_species) + 1 - 1; | |
2729 $startcord = 2 + (4*$no_of_species) + 2 - 1; | |
2730 $strandcord = 2 + (4*$no_of_species) + 3 - 1; | |
2731 $endcord = 2 + (4*$no_of_species) + 4 - 1; | |
2732 $microsatcord = 2 + (4*$no_of_species) + 5 - 1; | |
2733 $motifcord = 2 + (4*$no_of_species) + 6 - 1; | |
2734 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; | |
2735 | |
2736 $interr_poscord = $motifcord + 3; | |
2737 $no_of_interruptionscord = $motifcord + 4; | |
2738 $interrcord = $motifcord + 2; | |
2739 $interrtypecord = $motifcord + 1; | |
2740 | |
2741 | |
2742 $prinkter = 0; | |
2743 # print "prionkytet = $prinkter\n"; | |
2744 | |
2745 open(IN,"<$input1") or die "Cannot open file $input1 $!"; | |
2746 open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; | |
2747 | |
2748 open(INT,">$output1") or die "Cannot open file $output2 $!"; | |
2749 open(UNINT,">$output2") or die "Cannot open file $output2 $!"; | |
2750 | |
2751 # print "opened files !!\n"; | |
2752 my $linecounter = 0; | |
2753 my $microcounter = 0; | |
2754 | |
2755 my %micros = (); | |
2756 while (my $line = <IN>){ | |
2757 # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; | |
2758 $linecounter++; | |
2759 if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s+([0-9a-zA-Z_]+)\s([0-9]+)\s+([0-9]+)\s/ ) { | |
2760 my $key = join("\t",$1, $2, $3, $4, $5); | |
2761 # print $key, "#-#-#-#-#-#-#-#\n" if $prinkter == 1; | |
2762 push (@{$micros{$key}},$line); | |
2763 $microcounter++; | |
2764 } | |
2765 else {#print $line if $prinkter == 1; | |
2766 } | |
2767 } | |
2768 # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; | |
2769 close IN; | |
2770 my @deletedlines = (); | |
2771 # print "done hash \n"; | |
2772 $linecounter = 0; | |
2773 #--------------------------------------------------------------------------------------------------- | |
2774 # NOW READING THE SEQUENCE FILE | |
2775 while(my $sine = <SEQ>){ | |
2776 #print $linecounter,"\n" if $linecounter % 1000 == 0; | |
2777 my %microstart=(); | |
2778 my %microend=(); | |
2779 my @sields = split(/\t/,$sine); | |
2780 my $key = (); | |
2781 if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { | |
2782 $key = join("\t",$1, $2, $3, $4, $5); | |
2783 # print $key, "<-<-<-<-<-<-<-<\n"; | |
2784 } | |
2785 | |
2786 # $prinkter = 1 if $sine =~ /^>H\t499\t/; | |
2787 | |
2788 if (exists $micros{$key}){ | |
2789 my @microstring = @{$micros{$key}}; | |
2790 delete $micros{$key}; | |
2791 my @filteredmicrostring; | |
2792 # print "sequence = $sields[$sequencepos]" if $prinkter == 1; | |
2793 foreach my $line (@microstring){ | |
2794 $linecounter++; | |
2795 my $copy_line = $line; | |
2796 my @fields = split(/\t/,$line); | |
2797 my $start = $fields[$startcord]; | |
2798 my $end = $fields[$endcord]; | |
2799 | |
2800 # print $line if $prinkter == 1; | |
2801 #LOOKING FOR LEFTWARD EXTENTION OF MICROSATELLITE | |
2802 my $newline; | |
2803 while(1){ | |
2804 # print "\n before left sequence = $sields[$sequencepos]\n" if $prinkter == 1; | |
2805 if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;} | |
2806 | |
2807 $newline = multiSpecies_interruptedMicrosatHunter_left_extender($line, $sields[$sequencepos],$org); | |
2808 if ($newline eq $line){$line = $newline; last;} | |
2809 else {$line = $newline;} | |
2810 | |
2811 if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;} | |
2812 # print "returned line from left extender= $line \n" if $prinkter == 1; | |
2813 } | |
2814 while(1){ | |
2815 # print "sequence = $sields[$sequencepos]\n" if $prinkter == 1; | |
2816 if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;} | |
2817 | |
2818 $newline = multiSpecies_interruptedMicrosatHunter_right_extender($line, $sields[$sequencepos],$org); | |
2819 if ($newline eq $line){$line = $newline; last;} | |
2820 else {$line = $newline;} | |
2821 | |
2822 if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;} | |
2823 # print "returned line from right extender= $line \n" if $prinkter == 1; | |
2824 } | |
2825 # print "\n>>>>>>>>>>>>>>>>\n In the end, the line is: \n$line\n<<<<<<<<<<<<<<<<\n" if $prinkter == 1; | |
2826 | |
2827 my @tempfields = split(/\t/,$line); | |
2828 if ($tempfields[$microsatcord] =~ /\[/){ | |
2829 print INT $line,"\n"; | |
2830 } | |
2831 else{ | |
2832 print UNINT $line,"\n"; | |
2833 } | |
2834 | |
2835 if ($line =~ /NULL/){ next; } | |
2836 push(@filteredmicrostring, $line); | |
2837 push (@{$microstart{$start}},$line); | |
2838 push (@{$microend{$end}},$line); | |
2839 } | |
2840 | |
2841 my $firstflag = 'down'; | |
2842 | |
2843 } #if (exists $micros{$key}){ | |
2844 } | |
2845 close INT; | |
2846 close UNINT; | |
2847 # print "final number of lines = $linecounter\n"; | |
2848 } | |
2849 | |
2850 sub multiSpecies_interruptedMicrosatHunter_left_extender{ | |
2851 my ($line, $seq, $org) = @_; | |
2852 # print "left extender, like passed = $line\n" if $prinkter == 1; | |
2853 # print "in left extender... line passed = $line and sequence is $seq\n" if $prinkter == 1; | |
2854 chomp $line; | |
2855 my @fields = split(/\t/,$line); | |
2856 my $rstart = $fields[$startcord]; | |
2857 my $microsat = $fields[$microsatcord]; | |
2858 $microsat =~ s/\[|\]//g; | |
2859 my $rend = $rstart + length($microsat)-1; | |
2860 $microsat =~ s/-//g; | |
2861 my $motif = $fields[$motifcord]; | |
2862 my $firstmotif = (); | |
2863 | |
2864 if ($motif =~ /^\[/){ | |
2865 $motif =~ s/^\[//g; | |
2866 $motif =~ /([a-zA-Z]+)\].*/; | |
2867 $firstmotif = $1; | |
2868 } | |
2869 else {$firstmotif = $motif;} | |
2870 | |
2871 # print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n" if $prinkter == 1; | |
2872 my $leftphase = substr($microsat, 0,length($firstmotif)); | |
2873 my $phaser = $leftphase.$leftphase; | |
2874 my @phase = split(/\s*/,$leftphase); | |
2875 my @phases; | |
2876 my @copy_phases = @phases; | |
2877 my $crawler=0; | |
2878 for (0 ... (length($leftphase)-1)){ | |
2879 push(@phases, substr($phaser, $crawler, length($leftphase))); | |
2880 $crawler++; | |
2881 } | |
2882 | |
2883 my $start = $rstart; | |
2884 my $end = $rend; | |
2885 | |
2886 my $leftseq = substr($seq, 0, $start); | |
2887 # print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n" if $prinkter == 1; | |
2888 my @extentions = (); | |
2889 my @trappeds = (); | |
2890 my @intervalposs = (); | |
2891 my @trappedposs = (); | |
2892 my @trappedphases = (); | |
2893 my @intervals = (); | |
2894 my $firstmotif_length = length($firstmotif); | |
2895 foreach my $phase (@phases){ | |
2896 # print "left phase\t",substr($leftseq, -10),"\t$phase\n" if $prinkter == 1; | |
2897 # print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n" if $prinkter == 1; | |
2898 if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){ | |
2899 # print "in left pattern\n" if $prinkter == 1; | |
2900 my $trapped = $1; | |
2901 my $trappedpos = length($leftseq)-length($trapped); | |
2902 my $interval = $3; | |
2903 my $intervalpos = index($trapped, $interval) + 1; | |
2904 # print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n" if $prinkter == 1; | |
2905 | |
2906 my $extention = substr($trapped, 0, length($trapped)-length($interval)); | |
2907 my $leftpeep = substr($seq, 0, ($start-length($trapped))); | |
2908 my @passed_overhangs; | |
2909 | |
2910 for my $i (1 ... length($phase)-1){ | |
2911 my $overhang = substr($phase, -length($phase)+$i); | |
2912 # print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n" if $prinkter == 1; | |
2913 #TEMPORARY... BETTER METHOD NEEDED | |
2914 $leftpeep =~ s/-//g; | |
2915 if ($leftpeep =~ /$overhang$/i){ | |
2916 push(@passed_overhangs,$overhang); | |
2917 # print "l overhang\n" if $prinkter == 1; | |
2918 } | |
2919 } | |
2920 | |
2921 if(scalar(@passed_overhangs)>0){ | |
2922 my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)]; | |
2923 $extention = $overhang.$extention; | |
2924 $trapped = $overhang.$trapped; | |
2925 # print "trapped extended to $trapped \n" if $prinkter == 1; | |
2926 $trappedpos = length($leftseq)-length($trapped); | |
2927 } | |
2928 | |
2929 push(@extentions,$extention); | |
2930 # print "extentions = @extentions \n" if $prinkter == 1; | |
2931 | |
2932 push(@trappeds,$trapped ); | |
2933 push(@intervalposs,length($extention)+1); | |
2934 push(@trappedposs, $trappedpos); | |
2935 # print "trappeds = @trappeds\n" if $prinkter == 1; | |
2936 push(@trappedphases, substr($extention,0,length($phase))); | |
2937 push(@intervals, $interval); | |
2938 } | |
2939 } | |
2940 if (scalar(@trappeds == 0)) {return $line;} | |
2941 | |
2942 ############################ my $nikaal = longest_array_element(@trappeds); | |
2943 my $nikaal = shortest_array_element(@intervals); | |
2944 | |
2945 # print "longest element found = $nikaal \n" if $prinkter == 1; | |
2946 | |
2947 if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} | |
2948 $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord]; | |
2949 #print "new fields 9 = $fields[9]\n" if $prinkter == 1; | |
2950 $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]); | |
2951 | |
2952 #print "new fields 9 = $fields[9]\n" if $prinkter == 1; | |
2953 | |
2954 if($fields[$microsatcord] !~ /^\[/i){ | |
2955 $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; | |
2956 } | |
2957 | |
2958 $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord]; | |
2959 #print "new fields 14 = $fields[12]\n" if $prinkter == 1; | |
2960 | |
2961 #print "scalar of fields = ",scalar(@fields),"\n" if $prinkter == 1; | |
2962 | |
2963 | |
2964 if (scalar(@fields) > $motifcord+1){ | |
2965 $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1]; | |
2966 } | |
2967 else{$fields[$motifcord+1] = "indel/deletion";} | |
2968 #print "new fields 14 = $fields[14]\n" if $prinkter == 1; | |
2969 | |
2970 if (scalar(@fields)>$motifcord+2){ | |
2971 $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2]; | |
2972 } | |
2973 else{$fields[$motifcord+2] = $intervals[$nikaal];} | |
2974 #print "new fields 15 = $fields[15]\n" if $prinkter == 1; | |
2975 | |
2976 my @seventeen=(); | |
2977 | |
2978 if (scalar(@fields)>$motifcord+3){ | |
2979 @seventeen = split(/,/,$fields[$motifcord+3]); | |
2980 # print "scalarseventeen =@seventeen<-\n" if $prinkter == 1; | |
2981 for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);} | |
2982 $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen); | |
2983 $fields[$motifcord+4] = $fields[$motifcord+4]+1; | |
2984 } | |
2985 | |
2986 else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1} | |
2987 | |
2988 #print "new fields 16 = $fields[16]\n" if $prinkter == 1; | |
2989 #print "new fields 17 = $fields[17]\n" if $prinkter == 1; | |
2990 | |
2991 # return join("\t",@fields); | |
2992 my $returnline = join("\t",@fields); | |
2993 my $pastline = $returnline; | |
2994 if ($fields[$microsatcord] =~ /\[/){ | |
2995 $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline); | |
2996 } | |
2997 # print "finally left-extended line = ",$returnline,"\n" if $prinkter == 1; | |
2998 return $returnline; | |
2999 } | |
3000 | |
3001 sub multiSpecies_interruptedMicrosatHunter_right_extender{ | |
3002 # print "right extender\n" if $prinkter == 1; | |
3003 my ($line, $seq, $org) = @_; | |
3004 # print "in right extender... line passed = $line\n" if $prinkter == 1; | |
3005 # print "line = $line, sequence = ",$seq, "\n" if $prinkter == 1; | |
3006 chomp $line; | |
3007 my @fields = split(/\t/,$line); | |
3008 my $rstart = $fields[$startcord]; | |
3009 my $microsat = $fields[$microsatcord]; | |
3010 $microsat =~ s/\[|\]//g; | |
3011 my $rend = $rstart + length($microsat)-1; | |
3012 $microsat =~ s/-//g; | |
3013 my $motif = $fields[$motifcord]; | |
3014 my $temp_lastmotif = (); | |
3015 | |
3016 if ($motif =~ /\]$/){ | |
3017 $motif =~ s/\]$//g; | |
3018 $motif =~ /.*\[([a-zA-Z]+)/; | |
3019 $temp_lastmotif = $1; | |
3020 } | |
3021 else {$temp_lastmotif = $motif;} | |
3022 my $lastmotif = substr($microsat,-length($temp_lastmotif)); | |
3023 # print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n" if $prinkter == 1; | |
3024 my $rightphase = substr($microsat, -length($lastmotif)); | |
3025 my $phaser = $rightphase.$rightphase; | |
3026 my @phase = split(/\s*/,$rightphase); | |
3027 my @phases; | |
3028 my @copy_phases = @phases; | |
3029 my $crawler=0; | |
3030 for (0 ... (length($rightphase)-1)){ | |
3031 push(@phases, substr($phaser, $crawler, length($rightphase))); | |
3032 $crawler++; | |
3033 } | |
3034 | |
3035 my $start = $rstart; | |
3036 my $end = $rend; | |
3037 | |
3038 my $rightseq = substr($seq, $end+1); | |
3039 # print "length of sequence = " ,length($seq), "the coordinate to start from = ", $end+1, "\n" if $prinkter == 1; | |
3040 # print "right phases are @phases , end = $end right sequence = ",substr($rightseq,0,10),"\n" if $prinkter == 1; | |
3041 my @extentions = (); | |
3042 my @trappeds = (); | |
3043 my @intervalposs = (); | |
3044 my @trappedposs = (); | |
3045 my @trappedphases = (); | |
3046 my @intervals = (); | |
3047 my $lastmotif_length = length($lastmotif); | |
3048 foreach my $phase (@phases){ | |
3049 # print "right phase\t$phase\t",substr($rightseq,0,10),"\n" if $prinkter == 1; | |
3050 # print "search patter = (([a-zA-Z|-]{0,$lastmotif_length})($phase)+) \n" if $prinkter == 1; | |
3051 if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){ | |
3052 # print "in right pattern\n" if $prinkter == 1; | |
3053 my $trapped = $1; | |
3054 my $trappedpos = $end+1; | |
3055 my $interval = $2; | |
3056 my $intervalpos = index($trapped, $interval) + 1; | |
3057 # print "trapped = $trapped, interval = $interval\n" if $prinkter == 1; | |
3058 | |
3059 my $extention = substr($trapped, length($interval)); | |
3060 my $rightpeep = substr($seq, ($end+length($trapped))+1); | |
3061 my @passed_overhangs = ""; | |
3062 | |
3063 #TEMPORARY... BETTER METHOD NEEDED | |
3064 $rightpeep =~ s/-//g; | |
3065 | |
3066 for my $i (1 ... length($phase)-1){ | |
3067 my $overhang = substr($phase,0, $i); | |
3068 # print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n" if $prinkter == 1; | |
3069 if ($rightpeep =~ /^$overhang/i){ | |
3070 push(@passed_overhangs, $overhang); | |
3071 # print "r overhang\n" if $prinkter == 1; | |
3072 } | |
3073 } | |
3074 if (scalar(@passed_overhangs) > 0){ | |
3075 my $overhang = @passed_overhangs[longest_array_element(@passed_overhangs)]; | |
3076 $extention = $extention.$overhang; | |
3077 $trapped = $trapped.$overhang; | |
3078 # print "trapped extended to $trapped \n" if $prinkter == 1; | |
3079 } | |
3080 | |
3081 push(@extentions,$extention); | |
3082 #print "extentions = @extentions \n" if $prinkter == 1; | |
3083 | |
3084 push(@trappeds,$trapped ); | |
3085 push(@intervalposs,$intervalpos); | |
3086 push(@trappedposs, $trappedpos); | |
3087 # print "trappeds = @trappeds\n" if $prinkter == 1; | |
3088 push(@trappedphases, substr($extention,0,length($phase))); | |
3089 push(@intervals, $interval); | |
3090 } | |
3091 } | |
3092 if (scalar(@trappeds == 0)) {return $line;} | |
3093 | |
3094 ################################### my $nikaal = longest_array_element(@trappeds); | |
3095 my $nikaal = shortest_array_element(@intervals); | |
3096 | |
3097 # print "longest element found = $nikaal \n" if $prinkter == 1; | |
3098 | |
3099 if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} | |
3100 $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]"; | |
3101 $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]); | |
3102 | |
3103 | |
3104 if($fields[$microsatcord] !~ /^\[/i){ | |
3105 $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; | |
3106 } | |
3107 | |
3108 $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]"; | |
3109 | |
3110 | |
3111 if (scalar(@fields) > $motifcord+1){ | |
3112 $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion"; | |
3113 } | |
3114 else{$fields[$motifcord+1] = "indel/deletion";} | |
3115 | |
3116 if (scalar(@fields)>$motifcord+2){ | |
3117 $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal]; | |
3118 } | |
3119 else{$fields[$motifcord+2] = $intervals[$nikaal];} | |
3120 | |
3121 my @seventeen=(); | |
3122 if (scalar(@fields)>$motifcord+3){ | |
3123 #print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n" if $prinkter == 1; | |
3124 my $currpos = length($microsat)+$intervalposs[$nikaal]; | |
3125 $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos; | |
3126 $fields[$motifcord+4] = $fields[$motifcord+4]+1; | |
3127 | |
3128 } | |
3129 | |
3130 else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1} | |
3131 | |
3132 # print "finally right-extended line = ",join("\t",@fields),"\n" if $prinkter == 1; | |
3133 # return join("\t",@fields); | |
3134 | |
3135 my $returnline = join("\t",@fields); | |
3136 my $pastline = $returnline; | |
3137 if ($fields[$microsatcord] =~ /\[/){ | |
3138 $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline); | |
3139 } | |
3140 # print "finally right-extended line = ",$returnline,"\n" if $prinkter == 1; | |
3141 return $returnline; | |
3142 | |
3143 } | |
3144 | |
3145 sub multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver{ | |
3146 my @fields = split(/\t/,$_[0]); | |
3147 my $microsat = $fields[$microsatcord]; | |
3148 $microsat =~ s/(^\[)|-//sg; | |
3149 my $motif = $fields[$motifcord]; | |
3150 chomp $motif; | |
3151 # print $motif, "\n" if $motif !~ /^\[/; | |
3152 my $firstmotif = (); | |
3153 my $firststretch = (); | |
3154 my @stretches=(); | |
3155 | |
3156 # print "motif = $motif, microsat = $microsat\n" if $prinkter == 1; | |
3157 if ($motif =~ /^\[/){ | |
3158 $motif =~ s/^\[//sg; | |
3159 $motif =~ /([a-zA-Z]+)\].*/; | |
3160 $firstmotif = $1; | |
3161 @stretches = split(/\]/,$microsat); | |
3162 $firststretch = $stretches[0]; | |
3163 #print "firststretch = $firststretch\n" if $prinkter == 1; | |
3164 } | |
3165 else {$firstmotif = $motif;$firststretch = $microsat;} | |
3166 # print "if length:firststretch - length($firststretch) < threshes length :firstmotif ($firstmotif) - $thresholds[length($firstmotif)]\n" if $prinkter == 1; | |
3167 if (length($firststretch) < $thresholds[length($firstmotif)]){ | |
3168 return "no"; | |
3169 } | |
3170 else {return "yes";} | |
3171 | |
3172 } | |
3173 sub multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver{ | |
3174 my @fields = split(/\t/,$_[0]); | |
3175 my $microsat = $fields[$microsatcord]; | |
3176 $microsat =~ s/-|(\]$)//sg; | |
3177 my $motif = $fields[$motifcord]; | |
3178 chomp $motif; | |
3179 my $temp_lastmotif = (); | |
3180 my $laststretch = (); | |
3181 my @stretches=(); | |
3182 | |
3183 | |
3184 if ($motif =~ /\]/){ | |
3185 $motif =~ s/\]$//sg; | |
3186 $motif =~ /.*\[([a-zA-Z]+)$/; | |
3187 $temp_lastmotif = $1; | |
3188 @stretches = split(/\[/,$microsat); | |
3189 $laststretch = pop(@stretches); | |
3190 #print "last stretch = $laststretch\n" if $prinkter == 1; | |
3191 } | |
3192 else {$temp_lastmotif = $motif; $laststretch = $microsat;} | |
3193 | |
3194 if (length($laststretch) < $thresholds[length($temp_lastmotif)]){ | |
3195 return "no"; | |
3196 } | |
3197 else { return "yes";} | |
3198 | |
3199 | |
3200 } | |
3201 sub checking_substitutions{ | |
3202 | |
3203 my ($line, $seq, $startprobes, $endprobes) = @_; | |
3204 #print "sequence = $seq \n" if $prinkter == 1; | |
3205 #print "COMMAND = \n $line, \n $seq, \n $startprobes \n, $endprobes\n"; | |
3206 # <STDIN>; | |
3207 my @seqarray = split(/\s*/,$seq); | |
3208 my @startsubst_probes = split(/\|/,$startprobes); | |
3209 my @endsubst_probes = split(/\|/,$endprobes); | |
3210 chomp $line; | |
3211 my @fields = split(/\t/,$line); | |
3212 my $start = $fields[11] - $fields[10]; | |
3213 my $end = $fields[13] - $fields[10]; | |
3214 my $motif = $fields[9]; #IN FUTURE, USE THIS AS A PROBE, LIKE MOTIF = $FIELDS[9].$FIELDS[9] | |
3215 $motif =~ s/\[|\]//g; | |
3216 my $microsat = $fields[14]; | |
3217 $microsat =~ s/\[|\]//g; | |
3218 #------------------------------------------------------------------------ | |
3219 # GETTING START AND END PHASES | |
3220 my $startphase = substr($microsat,0, length($motif)); | |
3221 my $endphase = substr($microsat,-length($motif), length($motif)); | |
3222 #print "start and end phases are - $startphase and $endphase\n"; | |
3223 my $startflag = 'down'; | |
3224 my $endflag = 'down'; | |
3225 my $substitution_distance = length($motif); | |
3226 my $prestart = $start - $substitution_distance; | |
3227 my $postend = $end + $substitution_distance; | |
3228 my @endadds = (); | |
3229 my @startadds = (); | |
3230 if (($prestart < 0) || ($postend > scalar(@seqarray))) { | |
3231 last; | |
3232 } | |
3233 #------------------------------------------------------------------------#------------------------------------------------------------------------ | |
3234 # CHECKING FOR SUBSTITUTION PROBES NOW | |
3235 | |
3236 if ($fields[8] ne "mononucleotide"){ | |
3237 while ($startflag eq "down"){ | |
3238 my $search = join("",@seqarray[$prestart...($start-1)]); | |
3239 #print "search is from $prestart...($start-1) = $search\n"; | |
3240 foreach my $probe (@startsubst_probes){ | |
3241 #print "\t\tprobe = $probe\n"; | |
3242 if ($search =~ /^$probe/){ | |
3243 #print "\tfound addition to the left - $search \n"; | |
3244 my $copyprobe = $probe; | |
3245 my $type; | |
3246 my $subspos = 0; | |
3247 my $interruption = ""; | |
3248 if ($search eq $startphase) { $type = "NONE";} | |
3249 else{ | |
3250 $copyprobe =~ s/\[a-zA-Z\]/^/g; | |
3251 $subspos = index($copyprobe,"^") + 1; | |
3252 $type = "substitution"; | |
3253 $interruption = substr($search, $subspos,1); | |
3254 } | |
3255 my $addinfo = join("\t",$prestart, $start, $search, $type, $interruption, $subspos); | |
3256 #print "adding information: $addinfo \n"; | |
3257 push(@startadds, $addinfo); | |
3258 $prestart = $prestart - $substitution_distance; | |
3259 $start = $start-$substitution_distance; | |
3260 $startflag = 'down'; | |
3261 | |
3262 last; | |
3263 } | |
3264 else{ | |
3265 $startflag = 'up'; | |
3266 } | |
3267 } | |
3268 } | |
3269 #<STDIN>; | |
3270 while ($endflag eq "down"){ | |
3271 my $search = join("",@seqarray[($end+1)...$postend]); | |
3272 #print "search is from ($end+1)...$postend] = $search\n"; | |
3273 | |
3274 foreach my $probe (@endsubst_probes){ | |
3275 #print "\t\tprobe = $probe\n"; | |
3276 if ($search =~ /$probe$/){ | |
3277 my $copyprobe = $probe; | |
3278 my $type; | |
3279 my $subspos = 0; | |
3280 my $interruption = ""; | |
3281 if ($search eq $endphase) { $type = "NONE";} | |
3282 else{ | |
3283 $copyprobe =~ s/\[a-zA-Z\]/^/g; | |
3284 $subspos = index($copyprobe,"^") + 1; | |
3285 $type = "substitution"; | |
3286 $interruption = substr($search, $subspos,1); | |
3287 } | |
3288 my $addinfo = join("\t",$end, $postend, $search, $type, $interruption, $subspos); | |
3289 #print "adding information: $addinfo \n"; | |
3290 push(@endadds, $addinfo); | |
3291 $postend = $postend + $substitution_distance; | |
3292 $end = $end+$substitution_distance; | |
3293 push(@endadds, $search); | |
3294 $endflag = 'down'; | |
3295 last; | |
3296 } | |
3297 else{ | |
3298 $endflag = 'up'; | |
3299 } | |
3300 } | |
3301 } | |
3302 #print "startadds = @startadds, endadds = @endadds \n"; | |
3303 | |
3304 } | |
3305 } | |
3306 sub microsat_packer{ | |
3307 my $microsat = $_[0]; | |
3308 my $addition = $_[1]; | |
3309 | |
3310 | |
3311 | |
3312 } | |
3313 sub multiSpecies_interruptedMicrosatHunter_merge{ | |
3314 $prinkter = 0; | |
3315 # print "~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~\n"; | |
3316 my $line = $_[0]; | |
3317 # print "sent for mering: $line \n" if $prinkter ==1; | |
3318 my @mields = split(/\t/,$line); | |
3319 my @fields = @mields; | |
3320 my $microsat = allCaps($fields[$microsatcord]); | |
3321 my $motifline = allCaps($fields[$motifcord]); | |
3322 my $microsatcopy = $microsat; | |
3323 # print "microsat = $microsat|\n" if $prinkter ==1; | |
3324 $microsatcopy =~ s/^\[|\]$//sg; | |
3325 chomp $microsatcopy; | |
3326 my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy); | |
3327 my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat); | |
3328 shift @inields; | |
3329 # print "inields =",join("|",@inields)," microields = ",join("|",@microields)," and count of microields = ", $#microields,"\n" if $prinkter ==1; | |
3330 $motifline =~ s/^\[|\]$//sg; | |
3331 my @motields = split(/\]\[/,$motifline); | |
3332 my @firstmotifs = (); | |
3333 my @lastmotifs = (); | |
3334 for my $i (0 ... $#microields){ | |
3335 $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i])); | |
3336 $lastmotifs[$i] = substr($microields[$i],-length($motields[$i])); | |
3337 } | |
3338 # print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n" if $prinkter ==1; | |
3339 my @mergelist = (); | |
3340 my @inter_poses = split(/,/,$fields[$interr_poscord]); | |
3341 my $no_of_interruptions = $fields[$no_of_interruptionscord]; | |
3342 my @interruptions = split(/,/,$fields[$interrcord]); | |
3343 my @interrtypes = split(/,/,$fields[$interrtypecord]); | |
3344 my $stopper = 0; | |
3345 for my $i (0 ... $#motields-1){ | |
3346 # print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n:$lastmotifs[$i] eq $firstmotifs[$i+1]?\n" if $prinkter ==1; | |
3347 if ((allCaps($lastmotifs[$i]) eq allCaps($firstmotifs[$i+1])) && (!exists $inields[$i] || $inields[$i] !~ /[a-zA-Z]/)){ | |
3348 $stopper = 1; | |
3349 push(@mergelist, ($i)."_".($i+1)); #<STDIN> if $prinkter ==1; | |
3350 } | |
3351 } | |
3352 | |
3353 # print "mergelist = @mergelist\n" if $prinkter ==1; | |
3354 return $line if scalar(@mergelist) == 0; | |
3355 # print "merging @mergelist\n" if $prinkter ==1; | |
3356 # <STDIN> if $prinkter ==1; | |
3357 | |
3358 foreach my $merging (@mergelist){ | |
3359 my @sets = split(/_/, $merging); | |
3360 # print "sets = @sets\n" if $prinkter ==1; | |
3361 my @tempmicro = (); | |
3362 my @tempmot = (); | |
3363 # print "for loop going from 0 ... ", $sets[0]-1, "\n" if $prinkter ==1; | |
3364 for my $i (0 ... $sets[0]-1){ | |
3365 # print " adding pre- i = $i adding: microields= $microields[$i]. motields = $motields[$i], inields = |$inields[$i]|\n" if $prinkter ==1; | |
3366 push(@tempmicro, "[".$microields[$i]."]"); | |
3367 push(@tempmicro, $inields[$i]); | |
3368 push(@tempmot, "[".$motields[$i]."]"); | |
3369 # print "adding pre-motifs number $i\n" if $prinkter ==1; | |
3370 # print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1; | |
3371 } | |
3372 # print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1; | |
3373 # print "now pushing ", "[",$microields[$sets[0]]," and ",$microields[$sets[1]],"]\n" if $prinkter ==1; | |
3374 my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]"; | |
3375 # print "middle is, from @motields - @sets, number 0 which is is\n"; | |
3376 # print ": $motields[$sets[0]]\n"; | |
3377 push (@tempmicro, $pusher); | |
3378 push(@tempmot, "[".$motields[$sets[0]]."]"); | |
3379 push (@tempmicro, $inields[$sets[1]]) if $sets[1] != $#microields && exists $sets[1] && exists $inields[$sets[1]]; | |
3380 my $outcoming = -2; | |
3381 # print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1; | |
3382 # print "for loop going from ",$sets[1]+1, " ... ", $#microields, "\n" if $prinkter ==1; | |
3383 for my $i ($sets[1]+1 ... $#microields){ | |
3384 # print " adding post- i = $i adding: microields= $microields[$i]. motields = $motields[$i]\n" if $prinkter ==1; | |
3385 push(@tempmicro, "[".$microields[$i]."]") if exists $microields[$i]; | |
3386 push(@tempmicro, $inields[$i]) unless $i == $#microields || !exists $inields[$i]; | |
3387 push(@tempmot, "[".$motields[$i]."]"); | |
3388 # print "adding post-motifs number $i\n" if $prinkter ==1; | |
3389 $outcoming = $i; | |
3390 } | |
3391 # print "____________________________________________________________________________\n"; | |
3392 $prinkter = 0; | |
3393 $fields[$microsatcord] = join("",@tempmicro); | |
3394 $fields[$motifcord] = join("",@tempmot); | |
3395 # print "tempmot = @tempmot, tempmicro = @tempmicro . microsat = $fields[$microsatcord] and motif = $fields[$motifcord] \n" if $prinkter ==1; | |
3396 | |
3397 splice(@interrtypes, $sets[0], 1); | |
3398 $fields[$interrtypecord] = join(",",@interrtypes); | |
3399 splice(@interruptions, $sets[0], 1); | |
3400 $fields[$interrcord] = join(",",@interruptions); | |
3401 splice(@inter_poses, $sets[0], 1); | |
3402 $fields[$interr_poscord] = join(",",@inter_poses); | |
3403 $no_of_interruptions = $no_of_interruptions - 1; | |
3404 } | |
3405 | |
3406 if ($no_of_interruptions == 0 && $line !~ /compound/){ | |
3407 $fields[$microsatcord] =~ s/^\[|\]$//sg; | |
3408 $fields[$motifcord] =~ s/^\[|\]$//sg; | |
3409 $line = join("\t", @fields[0 ... $motifcord]); | |
3410 } | |
3411 else{ | |
3412 $line = join("\t", @fields); | |
3413 } | |
3414 # print "post merging, the line is $line\n" if $prinkter ==1; | |
3415 #<STDIN> if $stopper ==1; | |
3416 return $line; | |
3417 } | |
3418 sub interval_asseser{ | |
3419 my $pre_phase = $_[0]; my $post_phase = $_[1]; my $inter = $_[3]; | |
3420 } | |
3421 #--------------------------------------------------------------------------------------------------- | |
3422 sub allCaps{ | |
3423 my $motif = $_[0]; | |
3424 $motif =~ s/a/A/g; | |
3425 $motif =~ s/c/C/g; | |
3426 $motif =~ s/t/T/g; | |
3427 $motif =~ s/g/G/g; | |
3428 return $motif; | |
3429 } | |
3430 | |
3431 | |
3432 #xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx chromosome_unrand_breamultiSpecies_interruptedMicrosatHunterker xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx | |
3433 | |
3434 | |
3435 #xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx | |
3436 sub merge_interruptedMicrosats{ | |
3437 # print "IN merge_interruptedMicrosats: @_\n"; | |
3438 my $input0 = $_[0]; ######looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2" | |
3439 my $input1 = $_[1]; ###### the *_sput_op4_ii file | |
3440 my $input2 = $_[2]; ###### the *_sput_op4_ii file | |
3441 $no_of_species = $_[3]; | |
3442 | |
3443 my $output1 = $_[1]."_separate"; #$_[3]; ###### plain microsatellite file forward | |
3444 my $output2 = $_[2]."_separate"; ##$_[4]; ###### plain microsatellite file reverse | |
3445 my $output3 = $_[1]."_merged"; ##$_[5]; ###### plain microsatellite file forward | |
3446 #my $output4 = $_[2]."_merged"; ##$_[6]; ###### plain microsatellite file reverse | |
3447 #my $info = $_[4]; | |
3448 #my @tags = split(/\t/,$info); | |
3449 | |
3450 open(SEQ,"<$input0") or die "Cannot open file $input0 $!"; | |
3451 open(INF,"<$input1") or die "Cannot open file $input1 $!"; | |
3452 open(INR,"<$input2") or die "Cannot open file $input2 $!"; | |
3453 open(OUTF,">$output1") or die "Cannot open file $output1 $!"; | |
3454 open(OUTR,">$output2") or die "Cannot open file $output2 $!"; | |
3455 open(MER,">$output3") or die "Cannot open file $output3 $!"; | |
3456 #open(MERR,">$output4") or die "Cannot open file $output4 $!"; | |
3457 | |
3458 | |
3459 | |
3460 $printer = 0; | |
3461 | |
3462 # print "files opened \n"; | |
3463 $infocord = 2 + (4*$no_of_species) - 1; | |
3464 $startcord = 2 + (4*$no_of_species) + 2 - 1; | |
3465 $strandcord = 2 + (4*$no_of_species) + 3 - 1; | |
3466 $endcord = 2 + (4*$no_of_species) + 4 - 1; | |
3467 $microsatcord = 2 + (4*$no_of_species) + 5 - 1; | |
3468 $motifcord = 2 + (4*$no_of_species) + 6 - 1; | |
3469 $typecord = $infocord + 1; | |
3470 my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; | |
3471 | |
3472 $interrtypecord = $motifcord + 1; | |
3473 $interrcord = $motifcord + 2; | |
3474 $interr_poscord = $motifcord + 3; | |
3475 $no_of_interruptionscord = $motifcord + 4; | |
3476 $mergestarts = $no_of_interruptionscord+ 1; | |
3477 $mergeends = $no_of_interruptionscord+ 2; | |
3478 $mergemicros = $no_of_interruptionscord+ 3; | |
3479 | |
3480 # NOW ADDING FORWARD MICROSATELLITES TO HASH | |
3481 my %fmicros = (); | |
3482 my $microcounter=0; | |
3483 my $linecounter = 0; | |
3484 while (my $line = <INF>){ | |
3485 # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; | |
3486 $linecounter++; | |
3487 if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { | |
3488 my $key = join("\t",$1, $2, $4, $5); | |
3489 # print $key, "#-#-#-#-#-#-#-#\n"; | |
3490 push (@{$fmicros{$key}},$line); | |
3491 $microcounter++; | |
3492 } | |
3493 else { | |
3494 #print $line; | |
3495 } | |
3496 } | |
3497 # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; | |
3498 close INF; | |
3499 my @deletedlines = (); | |
3500 # print "done forward hash \n"; | |
3501 $linecounter = 0; | |
3502 #--------------------------------------------------------------------------------------------------- | |
3503 # NOW ADDING REVERSE MICROSATELLITES TO HASH | |
3504 my %rmicros = (); | |
3505 $microcounter=0; | |
3506 while (my $line = <INR>){ | |
3507 # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; | |
3508 $linecounter++; | |
3509 if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { | |
3510 my $key = join("\t",$1, $2, $4, $5); | |
3511 # print $key, "#-#-#-#-#-#-#-#\n"; | |
3512 push (@{$rmicros{$key}},$line); | |
3513 $microcounter++; | |
3514 } | |
3515 else { | |
3516 #print "cant make key\n"; | |
3517 } | |
3518 } | |
3519 # print "number of reverse microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; | |
3520 close INR; | |
3521 # print "done reverse hash \n"; | |
3522 $linecounter = 0; | |
3523 | |
3524 #------------------------------------------------------------------------------------------------ | |
3525 | |
3526 while(my $sine = <SEQ>){ | |
3527 #<STDIN> if $sine =~ /16349128/; | |
3528 next if $sine !~ /[a-zA-Z0-9]/; | |
3529 # print "-" x 150, "\n" if $printer == 1; | |
3530 my @sields = split(/\t/,$sine); | |
3531 my @merged = (); | |
3532 | |
3533 my $key = (); | |
3534 | |
3535 if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { | |
3536 $key = join("\t",$1, $2, $4, $5); | |
3537 # print $key, "<-<-<-<-<-<-<-<\n"; | |
3538 } | |
3539 # print "key = $key\n"; | |
3540 | |
3541 my @sets1; | |
3542 my @sets2; | |
3543 chomp $sields[$sequencepos]; | |
3544 my $rev_sequence = reverse($sields[$sequencepos]); | |
3545 $rev_sequence =~ s/ //g; | |
3546 $rev_sequence = " ".$rev_sequence; | |
3547 next if (!exists $fmicros{$key} && !exists $rmicros{$key}); | |
3548 | |
3549 if (exists $fmicros{$key}){ | |
3550 # print "line no : $linecount\n"; | |
3551 my @raw_microstring = @{$fmicros{$key}}; | |
3552 my %starts = (); my %ends = (); | |
3553 # print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;} | |
3554 my @microstring=(); | |
3555 for my $u (0 ... $#raw_microstring){ | |
3556 my @tields = split(/\t/,$raw_microstring[$u]); | |
3557 next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]}; | |
3558 push(@microstring, $raw_microstring[$u]); | |
3559 $starts{$tields[$startcord]} = $tields[$startcord]; | |
3560 $ends{$tields[$endcord]} = $tields[$endcord]; | |
3561 } | |
3562 | |
3563 # print "founf microstring in forward\n: @microstring\n"; | |
3564 chomp @microstring; | |
3565 my $clusterresult = (find_clusters(@microstring, $sields[$sequencepos])); | |
3566 @sets1 = split("\=", $clusterresult); | |
3567 my @temp = split(/_X0X_/,$sets1[0]) ; $microscanned+= scalar(@temp); | |
3568 # print "sets = ", join("<all\nmerged>", @sets1), "\n<<-sets1\n"; <STDIN>; | |
3569 } #if (exists $micros{$key}){ | |
3570 | |
3571 if (exists $rmicros{$key}){ | |
3572 # print "line no : $linecount\n"; | |
3573 my @raw_microstring = @{$rmicros{$key}}; | |
3574 my %starts = (); my %ends = (); | |
3575 # print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;} | |
3576 my @microstring=(); | |
3577 for my $u (0 ... $#raw_microstring){ | |
3578 my @tields = split(/\t/,$raw_microstring[$u]); | |
3579 next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]}; | |
3580 push(@microstring, $raw_microstring[$u]); | |
3581 $starts{$tields[$startcord]} = $tields[$startcord]; | |
3582 $ends{$tields[$endcord]} = $tields[$endcord]; | |
3583 } | |
3584 # print "founf microstring in reverse\n: @microstring\n"; <STDIN>; | |
3585 chomp @microstring; | |
3586 # print "sending reversed sequence\n"; | |
3587 my $clusterresult = (find_clusters(@microstring, $rev_sequence ) ); | |
3588 @sets2 = split("\=", $clusterresult); | |
3589 my @temp = split(/_X0X_/,$sets2[0]) ; $microscanned+= scalar(@temp); | |
3590 } #if (exists $micros{$key}){ | |
3591 | |
3592 my @popout1 = (); | |
3593 my @popout2 = (); | |
3594 my @forwardset = (); | |
3595 if (exists $sets2[1] ){ | |
3596 if(exists $sets1[0]) { | |
3597 push (@popout1, $sets1[0],$sets2[1]); | |
3598 my @forwardset = split("=", popOuter(@popout1, $rev_sequence ));# | |
3599 print OUTF join("\n",split("_X0X_", $forwardset[0])), "\n"; | |
3600 my @localmerged = split("_X0X_", $forwardset[1]); | |
3601 my $sequence = $sields[$sequencepos]; | |
3602 $sequence =~ s/ //g; | |
3603 # print "\nforwardset = @forwardset\n"; | |
3604 for my $j (0 ... $#localmerged){ | |
3605 $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence)); | |
3606 } | |
3607 | |
3608 push (@merged, @localmerged); | |
3609 | |
3610 } | |
3611 else{ | |
3612 my @localmerged = split("_X0X_", $sets2[1]); | |
3613 my $sequence = $sields[$sequencepos]; | |
3614 $sequence =~ s/ //g; | |
3615 for my $j (0 ... $#localmerged){ | |
3616 # print "\nlocalmerged = @localmerged\n"; | |
3617 $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence)); | |
3618 } | |
3619 | |
3620 push (@merged, @localmerged); | |
3621 } | |
3622 } | |
3623 elsif (exists $sets1[0]){ | |
3624 print OUTF join("\n",split("_X0X_", $sets1[0])), "\n"; | |
3625 } | |
3626 | |
3627 my @reverseset= (); | |
3628 if (exists $sets1[1]){ | |
3629 if (exists $sets2[0]){ | |
3630 push (@popout2, $sets2[0],$sets1[1]); | |
3631 # print "popout2 = @popout2\n"; | |
3632 my @reverseset = split("=", popOuter(@popout2, $sields[$sequencepos])); | |
3633 #print "reverseset = $reverseset[1] < --- reverseset1\n"; | |
3634 print OUTR join("\n",split("_X0X_", $reverseset[0])), "\n"; | |
3635 push(@merged, (split("_X0X_", $reverseset[1]))); | |
3636 } | |
3637 else{ | |
3638 push(@merged, (split("_X0X_", $sets1[1]))); | |
3639 } | |
3640 } | |
3641 elsif (exists $sets2[0]){ | |
3642 print OUTR join("\n",split("_X0X_", $sets2[0])), "\n"; | |
3643 | |
3644 } | |
3645 | |
3646 if (scalar @merged > 0){ | |
3647 my @filtered_merged = split("__",(filterDuplicates_merged(@merged))); | |
3648 print MER join("\n", @filtered_merged),"\n"; | |
3649 } | |
3650 # <STDIN> if $sine =~ /16349128/; | |
3651 | |
3652 } | |
3653 close(SEQ); | |
3654 close(INF); | |
3655 close(INR); | |
3656 close(OUTF); | |
3657 close(OUTR); | |
3658 close(MER); | |
3659 | |
3660 } | |
3661 sub find_clusters{ | |
3662 my @input = @_; | |
3663 my $sequence = pop(@input); | |
3664 $sequence =~ s/ //g; | |
3665 my @microstring0 = @input; | |
3666 # print "IN: find_clusters:\n"; | |
3667 my %microstart=(); | |
3668 my %microend=(); | |
3669 my @nonmerged = (); | |
3670 my @mergedSet = (); | |
3671 # print "set of microsats = @microstring \n"; | |
3672 my @microstring = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @microstring0; | |
3673 # print "microstring = ", join("\n",@microstring0) ," \n---->\n", join("\n", @microstring),"\n ,,+." if $printer == 1; | |
3674 #<STDIN> if $printer == 1; | |
3675 my @tempmicrostring = @microstring; | |
3676 foreach my $line (@tempmicrostring){ | |
3677 my @fields = split(/\t/,$line); | |
3678 my $start = $fields[$startcord]; | |
3679 my $end = $fields[$endcord]; | |
3680 next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/; | |
3681 # print " starts >>> start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n"; | |
3682 push (@{$microstart{$start}},$line); | |
3683 push (@{$microend{$end}},$line); | |
3684 } | |
3685 my $firstflag = 'down'; | |
3686 while( my $line =shift(@microstring)){ | |
3687 # print "-----------\nline = $line \n" if $printer == 1; | |
3688 chomp $line; | |
3689 my @fields = split(/\t/,$line); | |
3690 my $start = $fields[$startcord]; | |
3691 my $end = $fields[$endcord]; | |
3692 next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/ || $distance !~ /[0-9]+/ ; | |
3693 my $startmicro = $line; | |
3694 my $endmicro = $line; | |
3695 # print "start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n"; | |
3696 | |
3697 delete ($microstart{$start}); | |
3698 delete ($microend{$end}); | |
3699 my $flag = 'down'; | |
3700 my $startflag = 'down'; | |
3701 my $endflag = 'down'; | |
3702 my $prestart = $start - $distance; | |
3703 my $postend = $end + $distance; | |
3704 my @compoundlines = (); | |
3705 my %compoundhash = (); | |
3706 push (@compoundlines, $line); | |
3707 push (@{$compoundhash{$line}},$line); | |
3708 my $startrank = 1; | |
3709 my $endrank = 1; | |
3710 | |
3711 while( ($startflag eq "down") || ($endflag eq "down") ){ | |
3712 # print "prestart=$prestart, post end =$postend.. seqlen =", length($sequence)," firstflag = $firstflag \n" if $printer == 1; | |
3713 if ( (($prestart < 0) && $firstflag eq "up") || (($postend > length($sequence) && $firstflag eq "up")) ){ | |
3714 # print "coming to the end of sequence,post end = $postend and sequence length =", length($sequence)," so exiting\n" if $printer == 1; | |
3715 last; | |
3716 } | |
3717 | |
3718 $firstflag = "up"; | |
3719 if ($startflag eq "down"){ | |
3720 for my $i ($prestart ... $end){ | |
3721 if(exists $microend{$i}){ | |
3722 chomp $microend{$i}[0]; | |
3723 if(exists $compoundhash{$microend{$i}[0]}) {next;} | |
3724 chomp $microend{$i}[0]; | |
3725 push(@compoundlines, $microend{$i}[0]); | |
3726 my @tields = split(/\t/,$microend{$i}[0]); | |
3727 $startmicro = $microend{$i}[0]; | |
3728 chomp $startmicro; | |
3729 $flag = 'down'; | |
3730 $startrank++; | |
3731 # print "deleting $microend{$i}[0] and $microstart{$tields[$startcord]}[0]\n" if $printer == 1; | |
3732 delete $microend{$i}; | |
3733 delete $microstart{$tields[$startcord]}; | |
3734 $end = $tields[$endcord]; | |
3735 $startflag = 'down'; | |
3736 $prestart = $tields[$startcord] - $distance; | |
3737 last; | |
3738 } | |
3739 else{ | |
3740 $flag = 'up'; | |
3741 $startflag = 'up'; | |
3742 } | |
3743 } | |
3744 } | |
3745 | |
3746 if ($endflag eq "down"){ | |
3747 | |
3748 for my $i ($start ... $postend){ | |
3749 # print "$start ----> $i -----> $postend\n" if $printer == 1; | |
3750 if(exists $microstart{$i} ){ | |
3751 chomp $microstart{$i}[0]; | |
3752 if(exists $compoundhash{$microstart{$i}[0]}) {next;} | |
3753 chomp $microstart{$i}[0]; | |
3754 push(@compoundlines, $microstart{$i}[0]); | |
3755 my @tields = split(/\t/,$microstart{$i}[0]); | |
3756 $endmicro = $microstart{$i}[0]; | |
3757 $endrank++; | |
3758 chomp $endmicro; | |
3759 $flag = 'down'; | |
3760 # print "deleting $microend{$tields[$endcord]}[0]\n" if $printer == 1; | |
3761 | |
3762 delete $microstart{$i} if exists $microstart{$i} ; | |
3763 delete $microend{$tields[$endcord]} if exists $microend{$tields[$endcord]}; | |
3764 # print "done\n" if $printer == 1; | |
3765 | |
3766 shift @microstring; | |
3767 $end = $tields[$endcord]; | |
3768 $postend = $tields[$endcord] + $distance; | |
3769 $endflag = 'down'; | |
3770 last; | |
3771 } | |
3772 else{ | |
3773 $flag = 'up'; | |
3774 $endflag = 'up'; | |
3775 } | |
3776 # print "out of the if\n" if $printer == 1; | |
3777 } | |
3778 # print "out of the for\n" if $printer == 1; | |
3779 | |
3780 } | |
3781 # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n"; | |
3782 } #end while( $flag eq "down") | |
3783 # print "compoundlines = @compoundlines \n" if $printer == 1; | |
3784 | |
3785 if (scalar (@compoundlines) == 1){ | |
3786 push(@nonmerged, $line); | |
3787 | |
3788 } | |
3789 if (scalar (@compoundlines) > 1){ | |
3790 # print "FROM CLUSTERER\n" if $printer == 1; | |
3791 push(@mergedSet,merge_microsats(@compoundlines, $sequence) ); | |
3792 } | |
3793 } #end foreach my $line (@microstring){ | |
3794 # print join("\n",@mergedSet),"<-----mergedSet\n" if $printer == 1; | |
3795 #<STDIN> if scalar(@mergedSet) > 0; | |
3796 # print "EXIT: find_clusters\n"; | |
3797 return (join("_X0X_",@nonmerged). "=".join("_X0X_",@mergedSet)); | |
3798 } | |
3799 | |
3800 sub custom { | |
3801 $a->[$startcord+1] <=> $b->[$startcord+1]; | |
3802 } | |
3803 | |
3804 sub popOuter { | |
3805 # print "\nIN: popOuter @_\n"; <STDIN>; | |
3806 my @all = split ("_X0X_",$_[0]); | |
3807 # <STDIN> if !defined $_[0]; | |
3808 my @merged = split ("_X0X_",$_[1]); | |
3809 my $sequence = $_[2]; | |
3810 my $seqlen = length($sequence); | |
3811 my %microstart=(); | |
3812 my %microend=(); | |
3813 my @mergedSet = (); | |
3814 my @nonmerged = (); | |
3815 | |
3816 foreach my $line (@all){ | |
3817 my @fields = split(/\t/,$line); | |
3818 my $start = $seqlen - $fields[$startcord]+ 1; | |
3819 my $end = $seqlen - $fields[$endcord] + 1; | |
3820 push (@{$microstart{$start}},$line); | |
3821 push (@{$microend{$end}},$line); | |
3822 } | |
3823 my $firstflag = 'down'; | |
3824 my %forPopouting = (); | |
3825 | |
3826 while( my $line =shift(@merged)){ | |
3827 # print "\n MErgedline: $line .. startcord = $startcord ... endcord = $endcord\n" ; | |
3828 chomp $line; | |
3829 my @fields = split(/\t/,$line); | |
3830 my $start = $fields[$startcord]; | |
3831 my $end = $fields[$endcord]; | |
3832 my $startmicro = $line; | |
3833 my $endmicro = $line; | |
3834 | |
3835 | |
3836 delete ($microstart{$start}); | |
3837 delete ($microend{$end}); | |
3838 my $flag = 'down'; | |
3839 my $startflag = 'down'; | |
3840 my $endflag = 'down'; | |
3841 my $prestart = $start - $distance; | |
3842 my $postend = $end + $distance; | |
3843 my @compoundlines = (); | |
3844 my %compoundhash = (); | |
3845 push (@compoundlines, $line); | |
3846 my $startrank = 1; | |
3847 my $endrank = 1; | |
3848 | |
3849 # print "\nstart = $start, end = $end\n"; | |
3850 # <STDIN>; | |
3851 for my $i ($start ... $end){ | |
3852 if(exists $microend{$i}){ | |
3853 # print "\nmicrosat exists: $microend{$i}[0] microsat exists\n"; | |
3854 chomp $microend{$i}[0]; | |
3855 my @fields = split(/\t/,$microend{$i}[0]); | |
3856 delete $microstart{$seqlen - $fields[$startcord] + 1}; | |
3857 my $invertseq = $sequence; | |
3858 $invertseq =~ s/ //g; | |
3859 push(@compoundlines, invert_microsat($microend{$i}[0] , $invertseq )); | |
3860 delete $microend{$i}; | |
3861 | |
3862 } | |
3863 | |
3864 if(exists $microstart{$i} ){ | |
3865 # print "\nmicrosat exists: $microstart{$i}[0] microsat exists\n"; | |
3866 | |
3867 chomp $microstart{$i}[0]; | |
3868 my @fields = split(/\t/,$microstart{$i}[0]); | |
3869 delete $microend{$seqlen - $fields[$endcord] + 1}; | |
3870 my $invertseq = $sequence; | |
3871 $invertseq =~ s/ //g; | |
3872 push(@compoundlines, invert_microsat($microstart{$i}[0], $invertseq) ); | |
3873 delete $microstart{$i}; | |
3874 } | |
3875 } | |
3876 | |
3877 if (scalar (@compoundlines) == 1){ | |
3878 push(@mergedSet,join("\t",@compoundlines) ); | |
3879 } | |
3880 else { | |
3881 # print "FROM POPOUTER\n" if $printer == 1; | |
3882 push(@mergedSet, merge_microsats(@compoundlines, $sequence) ); | |
3883 } | |
3884 } | |
3885 | |
3886 foreach my $key (sort keys %microstart) { | |
3887 push(@nonmerged,$microstart{$key}[0]); | |
3888 } | |
3889 | |
3890 return (join("_X0X_",@nonmerged). "=".join("_X0X_",@mergedSet) ); | |
3891 } | |
3892 | |
3893 | |
3894 | |
3895 sub invert_justCoordinates{ | |
3896 my $microsat = $_[0]; | |
3897 # print "IN invert_justCoordinates ... @_\n" ; <STDIN>; | |
3898 chomp $microsat; | |
3899 my $seqLength = $_[1]; | |
3900 my @fields = split(/\t/,$microsat); | |
3901 my $start = $seqLength - $fields[$endcord] + 1; | |
3902 my $end = $seqLength - $fields[$startcord] + 1; | |
3903 $fields[$startcord] = $start; | |
3904 $fields[$endcord] = $end; | |
3905 $fields[$microsatcord] = reverse_micro($fields[$microsatcord]); | |
3906 # print "RETURNIG: ", join("\t",@fields), "\n" if $printer == 1; | |
3907 return join("\t",@fields); | |
3908 } | |
3909 | |
3910 sub largest_number{ | |
3911 my $counter = 0; | |
3912 my($max) = shift(@_); | |
3913 foreach my $temp (@_) { | |
3914 #print "finding largest array: $maxcounter \n"; | |
3915 if($temp > $max){ | |
3916 $max = $temp; | |
3917 } | |
3918 } | |
3919 return($max); | |
3920 } | |
3921 sub smallest_number{ | |
3922 my $counter = 0; | |
3923 my($min) = shift(@_); | |
3924 foreach my $temp (@_) { | |
3925 #print "finding largest array: $maxcounter \n"; | |
3926 if($temp < $min){ | |
3927 $min = $temp; | |
3928 } | |
3929 } | |
3930 return($min); | |
3931 } | |
3932 | |
3933 | |
3934 sub filterDuplicates_merged{ | |
3935 my @merged = @_; | |
3936 my %revmerged = (); | |
3937 my @fmerged = (); | |
3938 foreach my $micro (@merged) { | |
3939 my @fields = split(/\t/,$micro); | |
3940 if ($fields[3] =~ /chr[A-Z0-9a-z]+r/){ | |
3941 my $key = join("_K0K_",$fields[1], $fields[$startcord], $fields[$endcord]); | |
3942 # print "adding ... \n$key\n$micro\n"; | |
3943 push(@{$revmerged{$key}}, $micro); | |
3944 } | |
3945 else{ | |
3946 # print "pushing.. $micro\n"; | |
3947 push(@fmerged, $micro); | |
3948 } | |
3949 } | |
3950 # print "\n"; | |
3951 foreach my $micro (@fmerged) { | |
3952 my @fields = split(/\t/,$micro); | |
3953 my $key = join("_K0K_",$fields[1], $fields[$startcord], $fields[$endcord]); | |
3954 # print "searching for key $key\n"; | |
3955 if (exists $revmerged{$key}){ | |
3956 # print "deleting $revmerged{$key}[0]\n"; | |
3957 delete $revmerged{$key}; | |
3958 } | |
3959 } | |
3960 foreach my $key (sort keys %revmerged) { | |
3961 push(@fmerged,$revmerged{$key}[0]); | |
3962 } | |
3963 # print "returning ", join("\n", @fmerged),"\n" ; | |
3964 return join("__", @fmerged); | |
3965 } | |
3966 | |
3967 sub invert_microsat{ | |
3968 my $micro = $_[0]; | |
3969 chomp $micro; | |
3970 if ($micro =~ /chr[A-Z0-9a-z]+r/) { $micro =~ s/chr([0-9a-b]+)r/chr$1/g ;} | |
3971 else { $micro =~ s/chr([0-9a-b]+)/chr$1r/g ; } | |
3972 my $sequence = $_[1]; | |
3973 $sequence =~ s/ //g; | |
3974 my $seqlen = length($sequence); | |
3975 my @fields = split(/\t/,$micro); | |
3976 my $start = $seqlen - $fields[$endcord] +1; | |
3977 my $end = $seqlen - $fields[$startcord] +1; | |
3978 $fields[$startcord] = $start; | |
3979 $fields[$endcord] = $end; | |
3980 $fields[$motifcord] = reverse_micro($fields[$motifcord]); | |
3981 $fields[$microsatcord] = reverse_micro($fields[$microsatcord]); | |
3982 if ($fields[$typecord] ne "compound" && exists $fields[$no_of_interruptionscord] ){ | |
3983 my @intertypes = split(/,/,$fields[$interrtypecord]); | |
3984 my @inters = split(/,/,$fields[$interrcord]); | |
3985 my @interposes = split(/,/,$fields[$interr_poscord]); | |
3986 $fields[$interrtypecord] = join(",",reverse(@intertypes)); | |
3987 $fields[$no_of_interruptionscord] = scalar(@interposes); | |
3988 for my $i (0 ... $fields[$no_of_interruptionscord]-1){ | |
3989 if (exists $inters[$i] && $inters[$i] =~ /[a-zA-Z]/){ | |
3990 $inters[$i] = reverse($inters[$i]); | |
3991 $interposes[$i] = $interposes[$i] + length($inters[$i]) - 1; | |
3992 } | |
3993 else{ | |
3994 $inters[$i] = ""; | |
3995 $interposes[$i] = $interposes[$i] - 1; | |
3996 } | |
3997 $interposes[$i] = ($end - $start + 1) - $interposes[$i] + 1; | |
3998 } | |
3999 | |
4000 $fields[$interrcord] = join(",",reverse(@inters)); | |
4001 $fields[$interr_poscord] = join(",",reverse(@interposes)); | |
4002 } | |
4003 | |
4004 my $finalmicrosat = join("\t", @fields); | |
4005 return $finalmicrosat; | |
4006 | |
4007 } | |
4008 sub reverse_micro{ | |
4009 my $micro = reverse($_[0]); | |
4010 my @strand = split(/\s*/,$micro); | |
4011 for my $i (0 ... $#strand){ | |
4012 if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;} | |
4013 if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;} | |
4014 } | |
4015 return join("",@strand); | |
4016 } | |
4017 | |
4018 #xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx | |
4019 | |
4020 | |
4021 #xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx | |
4022 | |
4023 sub forward_reverse_sputoutput_comparer { | |
4024 # print "IN forward_reverse_sputoutput_comparer: @_\n"; | |
4025 my $input0 = $_[0]; ###### the *nogap_unrand_match file | |
4026 my $input1 = $_[1]; ###### the real file, *sput* data | |
4027 my $input2 = $_[2]; ###### the reverse file, *sput* data | |
4028 my $output1 = $_[3]; ###### microsats different in real file | |
4029 my $output2 = $_[4]; ###### microsats missing in real file | |
4030 my $output3 = $_[5]; ###### microsats common among real and reverse file | |
4031 my $no_of_species = $_[6]; | |
4032 | |
4033 $infocord = 2 + (4*$no_of_species) - 1; | |
4034 $typecord = 2 + (4*$no_of_species) + 1 - 1; | |
4035 $startcord = 2 + (4*$no_of_species) + 2 - 1; | |
4036 $strandcord = 2 + (4*$no_of_species) + 3 - 1; | |
4037 $endcord = 2 + (4*$no_of_species) + 4 - 1; | |
4038 $microsatcord = 2 + (4*$no_of_species) + 5 - 1; | |
4039 $motifcord = 2 + (4*$no_of_species) + 6 - 1; | |
4040 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; | |
4041 $interrtypecord = $motifcord + 1; | |
4042 $interrcord = $motifcord + 2; | |
4043 $interr_poscord = $motifcord + 3; | |
4044 $no_of_interruptionscord = $motifcord + 4; | |
4045 $mergestarts = $no_of_interruptionscord+ 1; | |
4046 $mergeends = $no_of_interruptionscord+ 2; | |
4047 $mergemicros = $no_of_interruptionscord+ 3; | |
4048 | |
4049 | |
4050 open(SEQ,"<$input0") or die "Cannot open file $input0 $!"; | |
4051 open(INF,"<$input1") or die "Cannot open file $input1 $!"; | |
4052 open(INR,"<$input2") or die "Cannot open file $input2 $!"; | |
4053 | |
4054 open(DIFF,">$output1") or die "Cannot open file $output1 $!"; | |
4055 #open(MISS,">$output2") or die "Cannot open file $output2 $!"; | |
4056 open(SAME,">$output3") or die "Cannot open file $output3 $!"; | |
4057 | |
4058 | |
4059 # print "opened files \n"; | |
4060 my $linecounter = 0; | |
4061 my $fcounter = 0; | |
4062 my $rcounter = 0; | |
4063 | |
4064 $printer = 0; | |
4065 #--------------------------------------------------------------------------------------------------- | |
4066 # NOW ADDING FORWARD MICROSATELLITES TO HASH | |
4067 my %fmicros = (); | |
4068 my $microcounter=0; | |
4069 while (my $line = <INF>){ | |
4070 $linecounter++; | |
4071 if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { | |
4072 my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12); | |
4073 # print $key, "#-#-#-#-#-#-#-#\n"; | |
4074 push (@{$fmicros{$key}},$line); | |
4075 $microcounter++; | |
4076 } | |
4077 else { | |
4078 #print $line; | |
4079 } | |
4080 } | |
4081 # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; | |
4082 close INF; | |
4083 my @deletedlines = (); | |
4084 # print "done forward hash \n"; | |
4085 $linecounter = 0; | |
4086 #--------------------------------------------------------------------------------------------------- | |
4087 # NOW ADDING REVERSE MICROSATELLITES TO HASH | |
4088 my %rmicros = (); | |
4089 $microcounter=0; | |
4090 while (my $line = <INR>){ | |
4091 $linecounter++; | |
4092 if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { | |
4093 my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12); | |
4094 # print $key, "#-#-#-#-#-#-#-#\n"; | |
4095 push (@{$rmicros{$key}},$line); | |
4096 $microcounter++; | |
4097 } | |
4098 else {} | |
4099 } | |
4100 # print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; | |
4101 close INR; | |
4102 # print "done reverse hash \n"; | |
4103 $linecounter = 0; | |
4104 #--------------------------------------------------------------------------------------------------- | |
4105 #--------------------------------------------------------------------------------------------------- | |
4106 # NOW READING THE SEQUENCE FILE | |
4107 while(my $sine = <SEQ>){ | |
4108 my %microstart=(); | |
4109 my %microend=(); | |
4110 my @sields = split(/\t/,$sine); | |
4111 my $key = (); | |
4112 if ($sine =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { | |
4113 $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12); | |
4114 } | |
4115 else { | |
4116 next; | |
4117 } | |
4118 $printer = 0; | |
4119 my $sequence = $sields[$sequencepos]; | |
4120 chomp $sequence; | |
4121 $sequence =~ s/ //g; | |
4122 my @localfs = (); | |
4123 my @localrs = (); | |
4124 | |
4125 if (exists $fmicros{$key}){ | |
4126 @localfs = @{$fmicros{$key}}; | |
4127 delete $fmicros{$key}; | |
4128 } | |
4129 | |
4130 my %forwardstarts = (); | |
4131 my %forwardends = (); | |
4132 | |
4133 foreach my $f (@localfs){ | |
4134 my @fields = split(/\t/,$f); | |
4135 push (@{$forwardstarts{$fields[$startcord]}},$f); | |
4136 push (@{$forwardends{$fields[$endcord]}},$fields[$startcord]); | |
4137 } | |
4138 | |
4139 if (exists $rmicros{$key}){ | |
4140 @localrs = @{$rmicros{$key}}; | |
4141 delete $rmicros{$key}; | |
4142 } | |
4143 else{ | |
4144 } | |
4145 | |
4146 foreach my $r (@localrs){ | |
4147 chomp $r; | |
4148 my @rields = split(/\t/,$r); | |
4149 # print "rields = @rields\n" if $printer == 1; | |
4150 my $reciprocalstart = length($sequence) - $rields[$endcord] + 1; | |
4151 my $reciprocalend = length($sequence) - $rields[$startcord] + 1; | |
4152 # print "reciprocal start = $reciprocalstart = ",length($sequence)," - $rields[$endcord] + 1\n" if $printer == 1; | |
4153 my $microsat = reverse_micro(all_caps($rields[$microsatcord])); | |
4154 my @localcollection=(); | |
4155 for my $i ($reciprocalstart+1 ... $reciprocalend-1){ | |
4156 if (exists $forwardstarts{$i}){ | |
4157 push(@localcollection, $forwardstarts{$i}[0] ); | |
4158 delete $forwardstarts{$i}; | |
4159 } | |
4160 if (exists $forwardends{$i}){ | |
4161 next if !exists $forwardstarts{$forwardends{$i}[0]}; | |
4162 push(@localcollection, $forwardstarts{$forwardends{$i}[0]}[0] ); | |
4163 } | |
4164 } | |
4165 if (exists $forwardstarts{$reciprocalstart} && exists $forwardends{$reciprocalend}) {push(@localcollection,$forwardstarts{$reciprocalstart}[0]);} | |
4166 | |
4167 if (scalar(@localcollection) == 0){ | |
4168 print SAME invert_microsat($r,($sequence) ), "\n"; | |
4169 } | |
4170 | |
4171 elsif (scalar(@localcollection) == 1){ | |
4172 # print "f microsat = $localcollection[0]\n" if $printer == 1; | |
4173 my @lields = split(/\t/,$localcollection[0]); | |
4174 $lields[$microsatcord]=all_caps($lields[$microsatcord]); | |
4175 # print "comparing: $microsat and $lields[$microsatcord]\n" if $printer == 1; | |
4176 # print "coordinates are: $lields[$startcord]-$lields[$endcord] and $reciprocalstart-$reciprocalend\n" if $printer == 1; | |
4177 if ($microsat eq $lields[$microsatcord]){ | |
4178 chomp $localcollection[0]; | |
4179 print SAME $localcollection[0], "\n"; | |
4180 } | |
4181 if ($microsat ne $lields[$microsatcord]){ | |
4182 chomp $localcollection[0]; | |
4183 my $newmicro = microsatChooser(join("\t",@lields), join("\t",@rields), $sequence); | |
4184 # print "newmicro = $newmicro\n" if $printer == 1; | |
4185 if ($newmicro =~ /[a-zA-Z]/){ | |
4186 print SAME $newmicro,"\n"; | |
4187 } | |
4188 else{ | |
4189 print DIFF join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n"; | |
4190 # print join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n" if $printer == 1; | |
4191 # print "@rields\n@lields\n" if $printer == 1; | |
4192 } | |
4193 } | |
4194 } | |
4195 else{ | |
4196 # print "multiple found for $r --> ", join("\t",@localcollection),"\n" if $printer == 1; | |
4197 } | |
4198 } | |
4199 } | |
4200 | |
4201 close(SEQ); | |
4202 close(INF); | |
4203 close(INR); | |
4204 close(DIFF); | |
4205 close(SAME); | |
4206 | |
4207 } | |
4208 sub all_caps{ | |
4209 my @strand = split(/\s*/,$_[0]); | |
4210 for my $i (0 ... $#strand){ | |
4211 if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;} | |
4212 if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;} | |
4213 if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;} | |
4214 if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;} | |
4215 } | |
4216 return join("",@strand); | |
4217 } | |
4218 sub microsatChooser{ | |
4219 my $forward = $_[0]; | |
4220 my $reverse = $_[1]; | |
4221 my $sequence = $_[2]; | |
4222 my $seqLength = length($sequence); | |
4223 $sequence =~ s/ //g; | |
4224 my @fields = split(/\t/,$forward); | |
4225 my @rields = split(/\t/,$reverse); | |
4226 my $r_start = $seqLength - $rields[$endcord] + 1; | |
4227 my $r_end = $seqLength - $rields[$startcord] + 1; | |
4228 | |
4229 | |
4230 my $f_microsat = $fields[$microsatcord]; | |
4231 my $r_microsat = $rields[$microsatcord]; | |
4232 | |
4233 if ($fields[$typecord] =~ /\./ && $rields[$typecord] =~ /\./) { | |
4234 return $forward if length($f_microsat) >= length($r_microsat); | |
4235 return invert_microsat($reverse, $sequence) if length($f_microsat) < length($r_microsat); | |
4236 } | |
4237 return $forward if all_caps($fields[$motifcord]) eq all_caps($rields[$motifcord]) && $fields[$startcord] == $rields[$startcord] && $fields[$endcord] == $rields[$endcord]; | |
4238 | |
4239 my $f_microsat_copy = $f_microsat; | |
4240 my $r_microsat_copy = $r_microsat; | |
4241 $f_microsat_copy =~ s/^\[|\]$//g; | |
4242 $r_microsat_copy =~ s/^\[|\]$//g; | |
4243 | |
4244 my @f_microields = split(/\][a-zA-Z]*\[/,$f_microsat_copy); | |
4245 my @r_microields = split(/\][a-zA-Z]*\[/,$r_microsat_copy); | |
4246 my @f_intields = split(/\][a-zA-Z]*\[/,$f_microsat_copy); | |
4247 my @r_intields = split(/\][a-zA-Z]*\[/,$r_microsat_copy); | |
4248 | |
4249 my $f_motif = $fields[$motifcord]; | |
4250 my $r_motif = $rields[$motifcord]; | |
4251 my $f_motif_copy = $f_motif; | |
4252 my $r_motif_copy = $r_motif; | |
4253 $f_motif_copy =~ s/^\[|\]$//g; | |
4254 $r_motif_copy =~ s/^\[|\]$//g; | |
4255 | |
4256 my @f_motields = split(/\]\[/,$f_motif_copy); | |
4257 my @r_motields = split(/\]\[/,$r_motif_copy); | |
4258 | |
4259 my $f_purestretch = join("",@f_microields); | |
4260 my $r_purestretch = join("",@r_microields); | |
4261 | |
4262 if ($fields[$typecord]=~/nucleotide/ && $rields[$typecord]=~/nucleotide/){ | |
4263 # print "now.. studying $forward\n$reverse\n" if $printer == 1; | |
4264 if ($fields[$typecord] eq $rields[$typecord]){ | |
4265 # print "comparing motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1; | |
4266 | |
4267 if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){ | |
4268 my $subset_answer = isSubset($forward, $reverse, $seqLength); | |
4269 # print "subset answer = $subset_answer\n" if $printer == 1; | |
4270 return $forward if $subset_answer == 1; | |
4271 return invert_microsat($reverse, $sequence) if $subset_answer == 2; | |
4272 return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch); | |
4273 return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch); | |
4274 return $forward if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) >= length($r_purestretch); | |
4275 return invert_microsat($reverse, $sequence) if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) < length($r_purestretch); | |
4276 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence) if $subset_answer == 3 ; | |
4277 } | |
4278 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){ | |
4279 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); | |
4280 } | |
4281 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){ | |
4282 return $forward; | |
4283 } | |
4284 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){ | |
4285 return invert_microsat($reverse, $sequence); | |
4286 } | |
4287 } | |
4288 else{ | |
4289 my $fmotlen = (); | |
4290 my $rmotlen = (); | |
4291 $fmotlen =1 if $fields[$typecord] eq "mononucleotide"; | |
4292 $fmotlen =2 if $fields[$typecord] eq "dinucleotide"; | |
4293 $fmotlen =3 if $fields[$typecord] eq "trinucleotide"; | |
4294 $fmotlen =4 if $fields[$typecord] eq "tetranucleotide"; | |
4295 $rmotlen =1 if $rields[$typecord] eq "mononucleotide"; | |
4296 $rmotlen =2 if $rields[$typecord] eq "dinucleotide"; | |
4297 $rmotlen =3 if $rields[$typecord] eq "trinucleotide"; | |
4298 $rmotlen =4 if $rields[$typecord] eq "tetranucleotide"; | |
4299 | |
4300 if ($fmotlen < $rmotlen){ | |
4301 if (abs($fields[$startcord] - $r_start) <= $fmotlen || abs($fields[$endcord] - $r_end) <= $fmotlen ){ | |
4302 return $forward; | |
4303 } | |
4304 else{ | |
4305 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); | |
4306 } | |
4307 } | |
4308 if ($fmotlen > $rmotlen){ | |
4309 if (abs($fields[$startcord] - $r_start) <= $rmotlen || abs($fields[$endcord] - $r_end) <= $rmotlen){ | |
4310 return invert_microsat($reverse, $sequence); | |
4311 } | |
4312 else{ | |
4313 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); | |
4314 } | |
4315 } | |
4316 } | |
4317 } | |
4318 if ($fields[$typecord] eq "compound" && $rields[$typecord] eq "compound"){ | |
4319 # print "comparing compound motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1; | |
4320 if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){ | |
4321 my $subset_answer = isSubset($forward, $reverse, $seqLength); | |
4322 # print "subset answer = $subset_answer\n" if $printer == 1; | |
4323 return $forward if $subset_answer == 1; | |
4324 return invert_microsat($reverse, $sequence) if $subset_answer == 2; | |
4325 # print length($f_purestretch) ,">", length($r_purestretch)," \n" if $printer == 1; | |
4326 return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch); | |
4327 return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch); | |
4328 if ($subset_answer == 3){ | |
4329 if ($fields[$startcord] < $r_start || $fields[$endcord] > $r_end){ | |
4330 if (abs($fields[$startcord] - $r_start) < length($f_motields[0]) || abs($fields[$endcord] - $r_end) < length($f_motields[$#f_motields]) ){ | |
4331 return $forward; | |
4332 } | |
4333 else{ | |
4334 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); | |
4335 } | |
4336 } | |
4337 if ($fields[$startcord] > $r_start || $fields[$endcord] < $r_end){ | |
4338 if (abs($fields[$startcord] - $r_start) < length($r_motields[0]) || abs($fields[$endcord] - $r_end) < length($r_motields[$#r_motields]) ){ | |
4339 return invert_microsat($reverse, $sequence); | |
4340 } | |
4341 else{ | |
4342 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); | |
4343 } | |
4344 } | |
4345 } | |
4346 } | |
4347 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){ | |
4348 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); | |
4349 } | |
4350 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){ | |
4351 return $forward; | |
4352 } | |
4353 elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){ | |
4354 return invert_microsat($reverse, $sequence); | |
4355 } | |
4356 | |
4357 } | |
4358 | |
4359 if ($fields[$typecord] eq "compound" && $rields[$typecord] =~ /nucleotide/){ | |
4360 # print "one compound, one nucleotide\n" if $printer == 1; | |
4361 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); | |
4362 } | |
4363 if ($fields[$typecord] =~ /nucleotide/ && $rields[$typecord]eq "compound"){ | |
4364 # print "one compound, one nucleotide\n" if $printer == 1; | |
4365 return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); | |
4366 } | |
4367 } | |
4368 | |
4369 sub isSubset{ | |
4370 my $forward = $_[0]; my @fields = split(/\t/,$forward); | |
4371 my $reverse = $_[1]; my @rields = split(/\t/,$reverse); | |
4372 my $seqLength = $_[2]; | |
4373 my $r_start = $seqLength - $rields[$endcord] + 1; | |
4374 my $r_end = $seqLength - $rields[$startcord] + 1; | |
4375 # print "we have $fields[$startcord] -> $fields[$endcord] && $r_start -> $r_end\n" if $printer == 1; | |
4376 return "0" if $fields[$startcord] == $r_start && $fields[$endcord] == $r_end; | |
4377 return "1" if $fields[$startcord] <= $r_start && $fields[$endcord] >= $r_end; | |
4378 return "2" if $r_start <= $fields[$startcord] && $r_end >= $fields[$endcord]; | |
4379 return "3"; | |
4380 } | |
4381 | |
4382 sub motifBYmotif_match{ | |
4383 my $forward = $_[0]; | |
4384 my $reverse = $_[1]; | |
4385 $forward =~ s/^\[|\]$//g; | |
4386 $reverse =~ s/^\[|\]$//g; | |
4387 my @f_motields=split(/\]\[/, $forward); | |
4388 my @r_motields=split(/\]\[/, $reverse); | |
4389 my $finalresult = 0; | |
4390 | |
4391 if (scalar(@f_motields) != scalar(@r_motields)){ | |
4392 my $subresult = 0; | |
4393 my @mega = (); my @sub = (); | |
4394 @mega = @f_motields if scalar(@f_motields) > scalar(@r_motields); | |
4395 @sub = @f_motields if scalar(@f_motields) > scalar(@r_motields); | |
4396 @mega = @r_motields if scalar(@f_motields) < scalar(@r_motields); | |
4397 @sub = @r_motields if scalar(@f_motields) < scalar(@r_motields); | |
4398 | |
4399 for my $i (0 ... $#sub){ | |
4400 my $probe = $sub[$i].$sub[$i]; | |
4401 # print "probing $probe and $mega[$i]\n" if $printer == 1; | |
4402 if ($probe =~ /$mega[$i]/) {$subresult = 1; } | |
4403 else {$subresult = 0; last; } | |
4404 } | |
4405 | |
4406 return 0 if $subresult == 0; | |
4407 return 2 if $subresult == 1 && scalar(@f_motields) > scalar(@r_motields); # r is subset of f | |
4408 return 3 if $subresult == 1 && scalar(@f_motields) < scalar(@r_motields); # ^reverse | |
4409 | |
4410 } | |
4411 else{ | |
4412 for my $i (0 ... $#f_motields){ | |
4413 my $probe = $f_motields[$i].$f_motields[$i]; | |
4414 if ($probe =~ /$r_motields[$i]/) {$finalresult = 1 ;} | |
4415 else {$finalresult = 0 ;last;} | |
4416 } | |
4417 } | |
4418 # print "finalresult = $finalresult\n" if $printer == 1; | |
4419 return $finalresult; | |
4420 } | |
4421 | |
4422 sub merge_microsats{ | |
4423 my @input = @_; | |
4424 my $sequence = pop(@input); | |
4425 $sequence =~ s/ //g; | |
4426 my @seq_string = @input; | |
4427 # print "IN: merge_microsats\n"; | |
4428 # print "recieved for merging: ", join("\n", @seq_string), "\nsequence = $sequence\n"; | |
4429 my $start; | |
4430 my $end; | |
4431 my @micros = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @seq_string; | |
4432 # print "\nrearranged into @micros \n"; | |
4433 my (@motifs, @microsats, @interruptiontypes, @interruptions, @interrposes, @no_of_interruptions, @types, @starts, @ends, @mergestart, @mergeend, @mergemicro) = (); | |
4434 my @fields = (); | |
4435 for my $i (0 ... $#micros){ | |
4436 chomp $micros[$i]; | |
4437 @fields = split(/\t/,$micros[$i]); | |
4438 push(@types, $fields[$typecord]); | |
4439 push(@motifs, $fields[$motifcord]); | |
4440 | |
4441 if (exists $fields[$interrtypecord]){ push(@interruptiontypes, $fields[$interrtypecord]);} | |
4442 else { push(@interruptiontypes, "NA"); } | |
4443 if (exists $fields[$interrcord]) {push(@interruptions, $fields[$interrcord]);} | |
4444 else { push(@interruptions, "NA"); } | |
4445 if (exists $fields[$interr_poscord]) { push(@interrposes, $fields[$interr_poscord]);} | |
4446 else { push(@interrposes, "NA"); } | |
4447 if (exists $fields[$no_of_interruptionscord]) {push(@no_of_interruptions, $fields[$no_of_interruptionscord]);} | |
4448 else { push(@no_of_interruptions, "NA"); } | |
4449 if(exists $fields[$mergestarts]) { @mergestart = (@mergestart, split(/\./,$fields[$mergestarts]));} | |
4450 else { push(@mergestart, $fields[$startcord]); } | |
4451 if(exists $fields[$mergeends]) { @mergeend = (@mergeend, split(/\./,$fields[$mergeends]));} | |
4452 else { push(@mergeend, $fields[$endcord]); } | |
4453 if(exists $fields[$mergemicros]) { push(@mergemicro, $fields[$mergemicros]);} | |
4454 else { push(@mergemicro, $fields[$microsatcord]); } | |
4455 | |
4456 | |
4457 } | |
4458 $start = smallest_number(@mergestart); | |
4459 $end = largest_number(@mergeend); | |
4460 my $microsat_entry = "[".substr( $sequence, $start-1, ($end - $start + 1) )."]"; | |
4461 my $microsat = join("\t", @fields[0 ... $infocord], join(".", @types), $start, $fields[$strandcord], $end, $microsat_entry , join(".", @motifs), join(".", @interruptiontypes),join(".", @interruptions),join(".", @interrposes),join(".", @no_of_interruptions), join(".", @mergestart), join(".", @mergeend) , join(".", @mergemicro)); | |
4462 return $microsat; | |
4463 } | |
4464 | |
4465 sub slided_microsat{ | |
4466 my $forward = $_[0]; my @fields = split(/\t/,$forward); | |
4467 my $reverse = $_[1]; my @rields = split(/\t/,$reverse); | |
4468 my $seqLength = $_[2]; | |
4469 my $r_start = $seqLength - $rields[$endcord] + 1; | |
4470 my $r_end = $seqLength - $rields[$startcord] + 1; | |
4471 my $motlen =(); | |
4472 $motlen =1 if $fields[$typecord] eq "mononucleotide"; | |
4473 $motlen =2 if $fields[$typecord] eq "dinucleotide"; | |
4474 $motlen =3 if $fields[$typecord] eq "trinucleotide"; | |
4475 $motlen =4 if $fields[$typecord] eq "tetranucleotide"; | |
4476 | |
4477 if (abs($fields[$startcord] - $r_start) < $motlen || abs($fields[$endcord] - $r_end) < $motlen ) { | |
4478 return 0; | |
4479 } | |
4480 else{ | |
4481 return 1; | |
4482 } | |
4483 | |
4484 } | |
4485 | |
4486 #xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx | |
4487 | |
4488 | |
4489 | |
4490 #xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx | |
4491 sub new_multispecies_t10{ | |
4492 my $input1 = $_[0]; #gap_op_unrand_match | |
4493 my $input2 = $_[1]; #sput | |
4494 my $output = $_[2]; #output | |
4495 my $bin = $output."_bin"; | |
4496 my $orgs = join("|",split(/\./,$_[3])); | |
4497 my @organisms = split(/\./,$_[3]); | |
4498 my $no_of_species = scalar(@organisms); #3 | |
4499 my $t10info = $output."_info"; | |
4500 $prinkter = 0; | |
4501 | |
4502 open (MATCH, "<$input1"); | |
4503 open (SPUT, "<$input2"); | |
4504 open (OUT, ">$output"); | |
4505 open (INFO, ">$t10info"); | |
4506 | |
4507 | |
4508 sub microsat_bracketer; | |
4509 sub custom; | |
4510 my %seen = (); | |
4511 $infocord = 2 + (4*$no_of_species) - 1; | |
4512 $typecord = 2 + (4*$no_of_species) + 1 - 1; | |
4513 $startcord = 2 + (4*$no_of_species) + 2 - 1; | |
4514 $strandcord = 2 + (4*$no_of_species) + 3 - 1; | |
4515 $endcord = 2 + (4*$no_of_species) + 4 - 1; | |
4516 $microsatcord = 2 + (4*$no_of_species) + 5 - 1; | |
4517 $motifcord = 2 + (4*$no_of_species) + 6 - 1; | |
4518 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; | |
4519 #---------------------------------------------------------------------------------------------------------------# | |
4520 # MAKING A HASH FROM SPUT, WITH HASH KEYS GENERATED BELOW AND SEQUENCES STORED AS VALUES # | |
4521 #---------------------------------------------------------------------------------------------------------------# | |
4522 my $linecounter = 0; | |
4523 my $microcounter = 0; | |
4524 while (my $line = <SPUT>){ | |
4525 chomp $line; | |
4526 # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n"; | |
4527 next if $line !~ /[0-9a-z]+/; | |
4528 $linecounter++; | |
4529 # my $key = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13); | |
4530 # print $key, "#-#-#-#-#-#-#-#\n"; | |
4531 if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { | |
4532 my $key = join("\t",$1, $2, $3, $4, $5); | |
4533 # print "key = $key\n" if $prinkter == 1; | |
4534 push (@{$seen{$key}},$line); | |
4535 $microcounter++; | |
4536 } | |
4537 else { | |
4538 #print "could not make ker in SPUT : \n$line \n"; | |
4539 } | |
4540 } | |
4541 # print "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n"; | |
4542 # print INFO "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n"; | |
4543 close SPUT; | |
4544 | |
4545 #---------------------------------------------------------------------------------------------------------------- | |
4546 | |
4547 #-------------------------------------------------------------------------------------------------------# | |
4548 # THE ENTIRE CODE BELOW IS DEVOTED TO GENERATING HASH KEYS FROM MATCH FOLLOWED BY # | |
4549 # USING THESE HASH KEYS TO CORRESPOND EACH SEQUENCE IN FIRST FILE TO ITS MICROSAT REPEATS IN # | |
4550 # SECOND FILE FOLLOWED BY # | |
4551 # FINDING THE EXACT LOCATION OF EACH MICROSAT REPEAT WITHIN EACH SEQUENCE USING THE 'index' FUNCTION # | |
4552 #-------------------------------------------------------------------------------------------------------# | |
4553 my $ref = 0; | |
4554 my $ref2 = 0; | |
4555 my $ref3 = 0; | |
4556 my $ref4 = 0; | |
4557 my $deletes= 0; | |
4558 my $duplicates = 0; | |
4559 my $neighbors = 0; | |
4560 my $tooshort = 0; | |
4561 my $prevmicrol=(); | |
4562 my $startnotfound = 0; | |
4563 my $matchkeysformed = 0; | |
4564 my $keysused = 0; | |
4565 | |
4566 while (my $line = <MATCH>) { | |
4567 # print colored ['magenta'], $line if $prinkter == 1; | |
4568 next if $line !~ /[a-zA-Z0-9]/; | |
4569 chomp $line; | |
4570 my @fields2 = split(/\t/,$line); | |
4571 my $key2 = (); | |
4572 # $key2 = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13); | |
4573 if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { | |
4574 $matchkeysformed++; | |
4575 $key2 = join("\t",$1, $2, $3, $4, $5); | |
4576 # print "key = $key2 \n" if $prinkter == 1; | |
4577 } | |
4578 else{ | |
4579 # print "could not make ker in SEQ : $line\n"; | |
4580 next; | |
4581 } | |
4582 my $sequence = $fields2[$sequencepos]; | |
4583 $sequence =~ s/\*/-/g; | |
4584 my $count = 0; | |
4585 if (exists $seen{$key2}){ | |
4586 $keysused++; | |
4587 my @unsorted_raw = @{$seen{$key2}}; | |
4588 delete $seen{$key2}; | |
4589 my @sequencearr = split(/\s*/, $sequence); | |
4590 | |
4591 # print "sequencearr = @sequencearr\n" if $prinkter == 1; | |
4592 | |
4593 my $counter; | |
4594 | |
4595 my %start_database = (); | |
4596 my %end_database = (); | |
4597 foreach my $uns (@unsorted_raw){ | |
4598 my @uields = split(/\t/,$uns); | |
4599 $start_database{$uields[$startcord]} = $uns; | |
4600 $end_database{$uields[$endcord]} = $uns; | |
4601 } | |
4602 | |
4603 my @unsorted = (); | |
4604 my %starts = (); my %ends = (); | |
4605 # print colored ['yellow'],"unsorted, unfiltered microats = \n" if $prinkter == 1; foreach (@unsorted_raw) {print colored ['blue'],$_,"\n" if $prinkter == 1;} | |
4606 for my $u (0 ... $#unsorted_raw){ | |
4607 my @tields = split(/\t/,$unsorted_raw[$u]); | |
4608 next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]}; | |
4609 push(@unsorted, $unsorted_raw[$u]); | |
4610 $starts{$tields[$startcord]} = $unsorted_raw[$u]; | |
4611 # print "in starts : $tields[$startcord] -> $unsorted_raw[$u]\n" if $prinkter == 1; | |
4612 } | |
4613 | |
4614 my $basecounter= 0; | |
4615 my $gapcounter = 0; | |
4616 my $poscounter = 0; | |
4617 | |
4618 for my $s (@sequencearr){ | |
4619 | |
4620 $poscounter++; | |
4621 if ($s eq "-"){ | |
4622 $gapcounter++; next; | |
4623 } | |
4624 else{ | |
4625 $basecounter++; | |
4626 } | |
4627 | |
4628 | |
4629 #print "s = $s, poscounter = $poscounter, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1; | |
4630 #print "s = $s, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1; | |
4631 #print "s = $s, gapcpunter = $gapcounter\n" if $prinkter == 1; | |
4632 | |
4633 if (exists $starts{$basecounter}){ | |
4634 my $locus = $starts{$basecounter}; | |
4635 # print "locus identified = $locus\n" if $prinkter == 1; | |
4636 my @fields3 = split(/\t/,$locus); | |
4637 my $start = $fields3[$startcord]; | |
4638 my $end = $fields3[$endcord]; | |
4639 my $motif = $fields3[$motifcord]; | |
4640 my $microsat = $fields3[$microsatcord]; | |
4641 my @leftbracketpos = (); | |
4642 my @rightbracketpos = (); | |
4643 my $bracket_picker = 'no'; | |
4644 my $leftbrackets=(); | |
4645 my $rightbrackets = (); | |
4646 my $micro_cpy = $microsat; | |
4647 # print "microsat = $microsat\n" if $prinkter == 1; | |
4648 while($microsat =~ m/\[/g) {push(@leftbracketpos, (pos($microsat))); $leftbrackets = join("__",@leftbracketpos);$bracket_picker='yes';} | |
4649 while($microsat =~ m/\]/g) {push(@rightbracketpos, (pos($microsat))); $rightbrackets = join("__",@rightbracketpos);} | |
4650 $microsat =~ s/[\[\]\-\*]//g; | |
4651 # print "microsat = $microsat\n" if $prinkter == 1; | |
4652 my $human_search = join '-*', split //, $microsat; | |
4653 my $temp = substr($sequence, $poscounter-1); | |
4654 # print "with poscounter = $poscounter\n" if $prinkter == 1; | |
4655 my $search_result = (); | |
4656 my $posnow = (); | |
4657 | |
4658 # print "for $line, temp $temp or human_search $human_search not defined\n" if !defined $temp || !defined $human_search; | |
4659 # <STDIN> if !defined $temp || !defined $human_search; | |
4660 | |
4661 while ($temp =~ /($human_search)/gi){ | |
4662 $search_result = $1; | |
4663 # $posnow = pos($temp); | |
4664 last; | |
4665 } | |
4666 | |
4667 my @gapspos = (); | |
4668 next if !defined $search_result; | |
4669 | |
4670 while($search_result =~ m/-/g) {push(@gapspos, (pos($search_result))); } | |
4671 my $gaps = join("__",@gapspos); | |
4672 | |
4673 my $final_microsat = $search_result; | |
4674 if ($bracket_picker eq "yes"){ | |
4675 $final_microsat = microsat_bracketer($search_result, $gaps,$leftbrackets,$rightbrackets); | |
4676 } | |
4677 | |
4678 my $outsentence = join("\t",join ("\t",@fields3[0 ... $infocord]),$fields3[$typecord],$fields3[$motifcord],$gapcounter,$poscounter,$fields3[$strandcord],$poscounter + length($search_result) -1 ,$final_microsat); | |
4679 | |
4680 if ($bracket_picker eq "yes") { | |
4681 $outsentence = $outsentence."\t".join("\t",@fields3[($motifcord+1) ... $#fields3]); | |
4682 } | |
4683 print OUT $outsentence,"\n"; | |
4684 } | |
4685 } | |
4686 } | |
4687 } | |
4688 my $unusedkeys = scalar(keys %seen); | |
4689 # print INFO "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n"; | |
4690 # print INFO "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n"; | |
4691 # print INFO "microsatellites too short = $tooshort\n"; | |
4692 # print INFO "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n"; | |
4693 | |
4694 #print "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n"; | |
4695 #print "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n"; | |
4696 #print "microsatellites too short = $tooshort\n"; | |
4697 #print "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n"; | |
4698 #print "unused keys = \n",join("\n", (keys %seen)),"\n"; | |
4699 close (MATCH); | |
4700 close (SPUT); | |
4701 close (OUT); | |
4702 close (INFO); | |
4703 } | |
4704 | |
4705 sub microsat_bracketer{ | |
4706 # print "in bracketer: @_\n"; | |
4707 my ($microsat, $gapspos, $leftbracketpos, $rightbracketpos) = @_; | |
4708 my @gaps = split(/__/,$gapspos); | |
4709 my @lefts = split(/__/,$leftbracketpos); | |
4710 my @rights = split(/__/,$rightbracketpos); | |
4711 my @new=(); | |
4712 my $pure = $microsat; | |
4713 $pure =~ s/-//g; | |
4714 my $off = 0; | |
4715 my $finallength = length($microsat) + scalar(@lefts)+scalar(@rights); | |
4716 push(@gaps, 0); | |
4717 push(@lefts,0); | |
4718 push(@rights,0); | |
4719 | |
4720 for my $i (1 ... $finallength){ | |
4721 # print "1 current i = >$i<>, right = >$rights[0]< gap = $gaps[0] left = >$lefts[0]< and $rights[0] == $i\n"; | |
4722 if($rights[0] == $i){ | |
4723 # print "pushed a ]\n"; | |
4724 push(@new, "]"); | |
4725 shift(@rights); | |
4726 push(@rights,0); | |
4727 for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;} | |
4728 next; | |
4729 } | |
4730 if($gaps[0] == $i){ | |
4731 # print "pushed a -\n"; | |
4732 push(@new, "-"); | |
4733 shift(@gaps); | |
4734 push(@gaps, 0); | |
4735 for my $j (0 ... scalar(@rights)-1) {$rights[$j]++;} | |
4736 for my $j (0 ... scalar(@lefts)-1) {$lefts[$j]++;} | |
4737 | |
4738 next; | |
4739 } | |
4740 if($lefts[0] == $i){ | |
4741 # print "pushed a [\n"; | |
4742 push(@new, "["); | |
4743 shift(@lefts); | |
4744 push(@lefts,0); | |
4745 for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;} | |
4746 next; | |
4747 } | |
4748 else{ | |
4749 my $pushed = substr($pure,$off,1); | |
4750 $off++; | |
4751 push(@new,$pushed ); | |
4752 # print "pushed an alphabet, now new = @new, pushed = $pushed\n"; | |
4753 next; | |
4754 } | |
4755 } | |
4756 my $returnmicrosat = join("",@new); | |
4757 # print "final microsat = $returnmicrosat \n"; | |
4758 return($returnmicrosat); | |
4759 } | |
4760 | |
4761 #xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx | |
4762 | |
4763 | |
4764 #xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx | |
4765 sub multiSpecies_orthFinder4{ | |
4766 #print "IN multiSpecies_orthFinder4: @_\n"; | |
4767 my @handles = (); | |
4768 #1 SEPT 30TH 2008 | |
4769 #2 THIS CODE (multiSpecies_orthFinder4.pl) IS BEING MADE SO THAT IN THE REMOVAL OF MICROSATELLITES THAT ARE CLOSER TO EACH OTHER | |
4770 #3 THAN 50 BP (HE 50BP RADIUS OF EXCLUSION), WE ARE LOOKING ACCROSS ALIGNMENT BLOCKS.. AND NOT JUST LOOKING WITHIN THE ALIGNMENT BLOCKS. THIS WILL | |
4771 #4 POTENTIALLY REMOVE EVEN MORE MICROSATELLITES THAN BEFORE, BUT THIS WILL RESCUE THOSE MICROSATELLITES THAT WERE LOST | |
4772 #5 DUE TO OUR PREVIOUS REQUIREMENT FROM VERSION 3, THAT MICROSATELLITES THAT ARE CLOSER TO THE BOUNDARY THAN 25 BP NEED TO BE REMOVED | |
4773 #6 SUCH A REQUIREMENT WAS A CRUDE WAY TO IMPOSE THE ABOVE 50 BP RADIUS OF EXCLUSION ACCROSS THE ALIGNMENT BLOCKS WITHOUT ACTUALLY | |
4774 #7 CHECKING COORDINATES OF THE EXCLUDED MICROSATELLITES. | |
4775 #8 IN ORDER TO TAKE CARE OF THE CASES WHERE MICROSATELLITES ARE PRELIOUSLY CLOSE TO ENDS OF THE ALIGNMENT BLOCKS, WE IMPOSE HERE | |
4776 #9 A NEW REQUIREMENT THAT FOR A MICROSATELLITE TO BE CONSIDERED, ALL THE SPECIES NEED TO HAVE AT LEAST 10 BP OF NON-MICROSATELLITE SEQUENCE | |
4777 #10 ON EITHER SIDE OF IT.. GAPLESS. THIS INFORMATION IS STORED IN THE VARIABLE: $FLANK_SUPPORT. THIS PART, INSTEAD OF BEING INCLUDED IN | |
4778 #11 THIS CODE, WILL BE INCLUDED IN A NEW CODE THAT WE WILL BE WRITING AS PART OF THE PIPELINE: multiSpecies_microsatSetSelector.pl | |
4779 | |
4780 #1 trial run: | |
4781 #2 perl ../../../codes/multiSpecies_orthFinder4.pl /gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr22.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt H.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:C.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:O.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:R.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:M.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2 orth22 hg18:panTro2:ponAbe2:rheMac2:calJac1 50 | |
4782 | |
4783 $prinkter=0; | |
4784 | |
4785 ############# | |
4786 my $CLUSTER_DIST = $_[4]; | |
4787 ############# | |
4788 | |
4789 | |
4790 my $aligns = $_[0]; | |
4791 my @micros = split(/:/, $_[1]); | |
4792 my $orth = $_[2]; | |
4793 #my $not_orth = "notorth"; | |
4794 @tags = split(/:/, $_[3]); | |
4795 | |
4796 $no_of_species=scalar(@tags); | |
4797 my $junkfile = $orth."_junk"; | |
4798 #open(JUNK,">$junkfile"); | |
4799 | |
4800 #my $info = $output1."_info"; | |
4801 #print "inputs are : \n"; foreach(@micros){print $_,"\n";} | |
4802 #print "info = @_\n"; | |
4803 | |
4804 | |
4805 open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!"; | |
4806 open (ORTH, ">$orth"); | |
4807 my $output=$orth."_out"; | |
4808 open (OUTP, ">$output"); | |
4809 | |
4810 | |
4811 #open (NORTH, ">$not_orth"); | |
4812 #open (INF, ">$info"); | |
4813 my $i = 0; | |
4814 foreach my $path (@micros){ | |
4815 $handles[$i] = IO::Handle->new(); | |
4816 open ($handles[$i], "<$path") or die "Can't open microsat file $path : $!"; | |
4817 $i++; | |
4818 } | |
4819 | |
4820 #print "Opened files\n"; | |
4821 | |
4822 | |
4823 $infocord = 2 + (4*$no_of_species) - 1; | |
4824 $typecord = 2 + (4*$no_of_species) + 1 - 1; | |
4825 $motifcord = $typecord + 1; | |
4826 $gapcord = $motifcord+1; | |
4827 $startcord = $gapcord + 1; | |
4828 $strandcord = $startcord + 1; | |
4829 $endcord = $strandcord + 1; | |
4830 $microsatcord = $endcord + 1; | |
4831 $sequencepos = 2 + (4*$no_of_species) + 1 -1 ; | |
4832 #$sequencepos = 17; | |
4833 # GENERATING HASHES CONTAINING CHIMP AND HUMAN DATA FROM ABOVE FILES | |
4834 #---------------------------------------------------------------------------------------------------------------- | |
4835 my @hasharr = (); | |
4836 foreach my $path (@micros){ | |
4837 open(READ, "<$path") or die "Cannot open file $path :$!"; | |
4838 my %single_hash = (); | |
4839 my $key = (); | |
4840 my $counter = 0; | |
4841 while (my $line = <READ>){ | |
4842 $counter++; | |
4843 # print $line; | |
4844 chomp $line; | |
4845 my @fields1 = split(/\t/,$line); | |
4846 if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { | |
4847 $key = join("\t",$1, $2, $4, $5); | |
4848 | |
4849 # print "key = : $key\n" if $prinkter == 1; | |
4850 | |
4851 # print $line if $prinkter == 1; | |
4852 push (@{$single_hash{$key}},$line); | |
4853 } | |
4854 else{ | |
4855 # print "microsat line incompatible\n"; | |
4856 } | |
4857 } | |
4858 push @hasharr, {%single_hash}; | |
4859 # print "@{$single_hash{$key}} \n"; | |
4860 # print "done $path: counter = $counter\n" if $prinkter == 1; | |
4861 close READ; | |
4862 } | |
4863 # print "Done hashes\n"; | |
4864 #---------------------------------------------------------------------------------------------------------------- | |
4865 my $question=(); | |
4866 #---------------------------------------------------------------------------------------------------------------- | |
4867 my @contigstarts = (); | |
4868 my @contigends = (); | |
4869 | |
4870 my %contigclusters = (); | |
4871 my %contigclustersFirstStartOnly=(); | |
4872 my %contigclustersLastEndOnly=(); | |
4873 my %contigclustersLastEndLengthOnly=(); | |
4874 my %contigclustersFirstStartLengthOnly=(); | |
4875 my %contigpath=(); | |
4876 my $dotcounter = 0; | |
4877 while (my $line = <BO>){ | |
4878 # print "x" x 60, "\n" if $prinkter == 1; | |
4879 $dotcounter++; | |
4880 # print "." if $dotcounter % 100 ==0; | |
4881 # print "\n" if $dotcounter % 5000 ==0; | |
4882 next if $line !~ /^[0-9]+/; | |
4883 # print $line if $prinkter == 1; | |
4884 chomp $line; | |
4885 my @fields2 = split(/\t/,$line); | |
4886 my $key2 = (); | |
4887 if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { | |
4888 $key2 = join("\t",$1, $2, $4, $5); | |
4889 } | |
4890 else { | |
4891 # print "seq line $line incompatible\n" if $prinkter == 1; | |
4892 next;} | |
4893 | |
4894 | |
4895 | |
4896 | |
4897 | |
4898 | |
4899 my @sequences = (); | |
4900 for (0 ... $#tags){ | |
4901 my $seq = <BO>; | |
4902 # print $seq; | |
4903 chomp $seq; | |
4904 push(@sequences , " ".$seq); | |
4905 } | |
4906 my @origsequences = @sequences; | |
4907 my $seqcopy = $sequences[0]; | |
4908 my @strings = (); | |
4909 $seqcopy =~ s/[a-zA-Z]|-/x/g; | |
4910 my @string = split(/\s*/,$seqcopy); | |
4911 | |
4912 for my $s (0 ... $#tags){ | |
4913 $sequences[$s] =~ s/-//g; | |
4914 $sequences[$s] =~ s/[a-zA-Z]/x/g; | |
4915 # print "length of sequence = ",length($sequences[$s]),"\n"; | |
4916 my @tempstring = split(/\s*/,$sequences[$s]); | |
4917 push(@strings, [@tempstring]) | |
4918 | |
4919 } | |
4920 | |
4921 my @species_list = (); | |
4922 my @micro_count = 0; | |
4923 my @starthash = (); | |
4924 my $stopper = 1; | |
4925 my @endhash = (); | |
4926 | |
4927 my @currentcontigstarts=(); | |
4928 my @currentcontigends=(); | |
4929 my @currentcontigchrs=(); | |
4930 | |
4931 for my $i (0 ... $#tags){ | |
4932 # print "searching for : if exists hasharr: $i : $tags[$i] : $key2 \n" if $prinkter == 1; | |
4933 my @temparr = (); | |
4934 | |
4935 if (exists $hasharr[$i]{$key2}){ | |
4936 @temparr = @{$hasharr[$i]{$key2}}; | |
4937 | |
4938 $line =~ /$tags[$i]\s([a-zA-Z0-9_]+)\s([0-9]+)\s([0-9]+)/; | |
4939 ## print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9_])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1; | |
4940 # print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1; | |
4941 my $startkey = $1."_SK0SK_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1; | |
4942 my $endkey = $1."_EK0EK_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1; | |
4943 $contigstarts[$i]{$startkey}= $key2; | |
4944 $contigends[$i]{$endkey}= $key2; | |
4945 # print "confirming existance: \n" if $prinkter == 1; | |
4946 # print "present \n" if exists $contigends[$i]{$endkey} && $prinkter == 1; | |
4947 # print "absent \n" if !exists $contigends[$i]{$endkey} && $prinkter == 1; | |
4948 $currentcontigchrs[$i]=$1; | |
4949 $currentcontigstarts[$i]=$2; | |
4950 $currentcontigends[$i]=$3; | |
4951 | |
4952 } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"} | |
4953 else { | |
4954 push (@starthash, {0 => "0"}); | |
4955 push (@endhash, {0 => "0"}); | |
4956 $currentcontigchrs[$i] = 0; | |
4957 next; | |
4958 } | |
4959 $stopper = 0; | |
4960 # print "exists: @temparr\n" if $prinkter == 1; | |
4961 push(@micro_count, scalar(@temparr)); | |
4962 push(@species_list, [@temparr]); | |
4963 my @tempstart = (); my @tempend = (); | |
4964 my %localends = (); | |
4965 my %localhash = (); | |
4966 # print "---------------------------\n"; | |
4967 | |
4968 foreach my $templine (@temparr){ | |
4969 # print "templine = $templine\n" if $prinkter == 1; | |
4970 my @tields = split(/\t/,$templine); | |
4971 my $start = $tields[$startcord]; # - $tields[$gapcord]; | |
4972 my $end = $tields[$endcord]; #- $tields[$gapcord]; | |
4973 my $realstart = $tields[$startcord]- $tields[$gapcord]; | |
4974 my $gapsinmicrosat = ($tields[$microsatcord] =~ s/-/-/g); | |
4975 $gapsinmicrosat = 0 if $gapsinmicrosat !~ /[0-9]+/; | |
4976 # print "infocord = $infocord typecord = $typecord motifcord = $motifcord gapcord = $gapcord startcord = $startcord strandcord = $strandcord endcord = $endcord microsatcord = $microsatcord sequencepos = $sequencepos\n"; | |
4977 my $realend = $tields[$endcord]- $tields[$gapcord]- $gapsinmicrosat; | |
4978 # print "real start = $realstart, realend = $realend \n"; | |
4979 for my $pos ($realstart ... $realend){ $strings[$i][$pos] = $strings[$i][$pos].",".$i.":".$start."-".$end;} | |
4980 push(@tempstart, $start); | |
4981 push(@tempend, $end); | |
4982 $localhash{$start."-".$end} = $templine; | |
4983 } | |
4984 push @starthash, {%localhash}; | |
4985 my $foundclusters =findClusters(join("!",@{$strings[$i]}), $CLUSTER_DIST); | |
4986 | |
4987 # print "foundclusters = $foundclusters\n"; | |
4988 | |
4989 my @clusters = split(/_/,$foundclusters); | |
4990 | |
4991 my $clustno = 0; | |
4992 | |
4993 foreach my $cluster (@clusters) { | |
4994 my @constituenst = split(/,/,$cluster); | |
4995 # print "clusters returned: @constituenst\n" if $prinkter == 1; | |
4996 } | |
4997 | |
4998 @string = split("_S0S_",stringPainter(join("_C0C_",@string),$foundclusters)); | |
4999 | |
5000 | |
5001 } | |
5002 next if $stopper == 1; | |
5003 | |
5004 # print colored ['blue'],"FINAL:\n" if $prinkter == 1; | |
5005 my $finalclusters =findClusters(join("!",@string), 1); | |
5006 # print "finalclusters = $finalclusters\n"; | |
5007 # print colored ['blue'],"----------------------\n" if $prinkter == 1; | |
5008 my @clusters = split(/,/,$finalclusters); | |
5009 # print "@string\n" if $prinkter == 1; | |
5010 # print "@clusters\n" if $prinkter == 1; | |
5011 # print "------------------------------------------------------------------\n" if $prinkter == 1; | |
5012 | |
5013 my $clustno = 0; | |
5014 | |
5015 # foreach my $cluster (@clusters) { | |
5016 # my @constituenst = split(/,/,$cluster); | |
5017 # print "clusters returned: @constituenst\n"; | |
5018 # } | |
5019 | |
5020 next if (scalar @clusters == 0); | |
5021 | |
5022 my @contigcluster=(); | |
5023 my $clusterno=0; | |
5024 my @contigClusterstarts=(); | |
5025 my @contigClusterends = (); | |
5026 | |
5027 foreach my $clust (@clusters){ | |
5028 # print "cluster: $clust\n"; | |
5029 $clusterno++; | |
5030 my @localclust = split(/\./, $clust); | |
5031 my @result = (); | |
5032 my @starts = (); | |
5033 my @ends = (); | |
5034 | |
5035 for my $i (0 ... $#localclust){ | |
5036 # print "localclust[$i]: $localclust[$i]\n"; | |
5037 my @pattern = split(/:/, $localclust[$i]); | |
5038 my @cords = split(/-/, $pattern[1]); | |
5039 push (@starts, $cords[0]); | |
5040 push (@ends, $cords[1]); | |
5041 } | |
5042 | |
5043 my $extremestart = smallest_number(@starts); | |
5044 my $extremeend = largest_number(@ends); | |
5045 push(@contigClusterstarts, $extremestart); | |
5046 push(@contigClusterends, $extremeend); | |
5047 # print "cluster starts from $extremestart and ends at $extremeend \n" if $prinkter == 1 ; | |
5048 | |
5049 foreach my $clustparts (@localclust){ | |
5050 my @pattern = split(/:/, $clustparts); | |
5051 # print "printing from pattern: $pattern[1]: $starthash[$pattern[0]]{$pattern[1]}\n"; | |
5052 push (@result, $starthash[$pattern[0]]{$pattern[1]}); | |
5053 } | |
5054 push(@contigcluster, join("\t", @result)); | |
5055 # print join("\t", @result),"<-result \n" if $prinkter == 1 ; | |
5056 } | |
5057 | |
5058 | |
5059 my $firstclusterstart = smallest_number(@contigClusterstarts); | |
5060 my $lastclusterend = largest_number(@contigClusterends); | |
5061 | |
5062 | |
5063 $contigclustersFirstStartOnly{$key2}=$firstclusterstart; | |
5064 $contigclustersLastEndOnly{$key2} = $lastclusterend; | |
5065 $contigclusters{$key2}=[ @contigcluster ]; | |
5066 # print "currentcontigchr are @currentcontigchrs , firstclusterstart = $firstclusterstart, lastclusterend = $lastclusterend\n " if $prinkter == 1; | |
5067 for my $i (0 ... $#tags){ | |
5068 #1 check if there exists adjacent alignment block wrt coordinates of this species. | |
5069 next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block.. | |
5070 #2 no need to worry about proximity of anything in adjacent block! | |
5071 | |
5072 #1 BELOW, the following is really to calclate the distance between the end coordinate of the | |
5073 #2 cluster and the end of the gap-free sequence of each species. this is so that if an | |
5074 #3 adjacent alignment block is found lateron, the exact distance between the potentially | |
5075 #4 adjacent microsat clusters can be found here. the exact start coordinate will be used | |
5076 #5 immediately below. | |
5077 # print "full sequence = $origsequences[$i] and its length = ",length($origsequences[$i])," \n" if $prinkter == 1; | |
5078 | |
5079 my $species_startsubstring = substr($origsequences[$i], 0, $firstclusterstart); | |
5080 my $species_endsubstring = (); | |
5081 | |
5082 if (length ($origsequences[$i]) <= $lastclusterend+1){ $species_endsubstring = "";} | |
5083 else{ $species_endsubstring = substr($origsequences[$i], $lastclusterend+1);} | |
5084 | |
5085 # print "\nnot defined species_endsubstring...\n" if !defined $species_endsubstring && $prinkter == 1; | |
5086 # print "for species: $tags[$i]: \n" if $prinkter == 1; | |
5087 | |
5088 $species_startsubstring =~ s/-| //g; | |
5089 $species_endsubstring =~ s/-| //g; | |
5090 $contigclustersLastEndLengthOnly{$key2}[$i]=length($species_endsubstring); | |
5091 $contigclustersFirstStartLengthOnly{$key2}[$i]=length($species_startsubstring); | |
5092 | |
5093 | |
5094 | |
5095 # print "species_startsubstring = $species_startsubstring, and its length =",length($species_startsubstring)," \n" if $prinkter == 1; | |
5096 # print "species_endsubstring = $species_endsubstring, and its length =",length($species_endsubstring)," \n" if $prinkter == 1; | |
5097 # print "attaching to contigclustersLastEndOnly: $key2: $i\n" if $prinkter == 1; | |
5098 | |
5099 # print "just confirming: $contigclustersLastEndLengthOnly{$key2}[$i] \n" if $prinkter == 1; | |
5100 | |
5101 } | |
5102 | |
5103 | |
5104 } | |
5105 # print "\ndone the job of filling... \n"; | |
5106 #/////////////////////////////////////////////////////////////////////////////////////// | |
5107 #/////////////////////////////////////////////////////////////////////////////////////// | |
5108 #/////////////////////////////////////////////////////////////////////////////////////// | |
5109 #/////////////////////////////////////////////////////////////////////////////////////// | |
5110 $prinkter=0; | |
5111 open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!"; | |
5112 | |
5113 my %clusteringpaths=(); | |
5114 my %clustersholder=(); | |
5115 my %foundkeys=(); | |
5116 my %clusteringpathsRev=(); | |
5117 | |
5118 | |
5119 my $totalcount=(); | |
5120 my $founkeys_enteredcount=(); | |
5121 my $transfered=0; | |
5122 my $complete_transfered=0; | |
5123 my $plain_transfered=0; | |
5124 my $existing_removed=0; | |
5125 | |
5126 while (my $line = <BO>){ | |
5127 # print "x" x 60, "\n" if $prinkter == 1; | |
5128 next if $line !~ /^[0-9]+/; | |
5129 #print $line; | |
5130 chomp $line; | |
5131 my @fields2 = split(/\t/,$line); | |
5132 my $key2 = (); | |
5133 if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)/ ) { | |
5134 $key2 = join("\t",$1, $2, $4, $5); | |
5135 } | |
5136 | |
5137 else { | |
5138 # print "seq line $line incompatible\n"; | |
5139 next; | |
5140 } | |
5141 # print "KEY = : $key2\n" if $prinkter == 1; | |
5142 | |
5143 | |
5144 my @currentcontigstarts=(); | |
5145 my @currentcontigends=(); | |
5146 my @currentcontigchrs=(); | |
5147 my @clusters = (); | |
5148 my @clusterscopy=(); | |
5149 if (exists $contigclusters{$key2}){ | |
5150 @clusters = @{$contigclusters{$key2}}; | |
5151 @clusterscopy=@clusters; | |
5152 for my $i (0 ... $#tags){ | |
5153 # print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1; | |
5154 if ($line =~ /$tags[$i]\s([a-zA-Z0-9_]+)\s([0-9]+)\s([0-9]+)/){ | |
5155 # print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1; | |
5156 my $startkey = $1."_S0E_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1; | |
5157 my $endkey = $1."_S0E_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1; | |
5158 $currentcontigchrs[$i]=$1; | |
5159 $currentcontigstarts[$i]=$2; | |
5160 $currentcontigends[$i]=$3; | |
5161 } | |
5162 else { | |
5163 $currentcontigchrs[$i] = 0; | |
5164 # print "no microsat clusters for $key2\n" if $prinkter == 1; next; | |
5165 } | |
5166 } | |
5167 } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"} | |
5168 | |
5169 my @sequences = (); | |
5170 for (0 ... $#tags){ | |
5171 my $seq = <BO>; | |
5172 # print $seq; | |
5173 chomp $seq; | |
5174 push(@sequences , " ".$seq); | |
5175 } | |
5176 | |
5177 next if scalar @currentcontigchrs == 0; | |
5178 | |
5179 # print "contigchrs= @currentcontigchrs \n" if $prinkter == 1; | |
5180 my %visitedcontigs=(); | |
5181 | |
5182 for my $i (0 ... $#tags){ | |
5183 #1 check if there exists adjacent alignment block wrt coordinates of this species. | |
5184 next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block.. | |
5185 #2 no need to worry about proximity of anything in adjacent block! | |
5186 @clusters=@clusterscopy; | |
5187 #1 BELOW, the following is really to calclate the distance between the end coordinate of the | |
5188 #2 cluster and the end of the gap-free sequence of each species. this is so that if an | |
5189 #3 adjacent alignment block is found lateron, the exact distance between the potentially | |
5190 #4 adjacent microsat clusters can be found here. the exact start coordinate will be used | |
5191 #5 immediately below. | |
5192 my $firstclusterstart = $contigclustersFirstStartOnly{$key2}; | |
5193 my $lastclusterend = $contigclustersLastEndOnly{$key2}; | |
5194 | |
5195 my $key3 = $currentcontigchrs[$i]."_S0E_".($currentcontigstarts[$i]); | |
5196 # print "check if exists $key3 in contigends for $i\n" if $prinkter == 1; | |
5197 | |
5198 if (exists($contigends[$i]{$key3}) && !exists $visitedcontigs{$contigends[$i]{$key3}}){ | |
5199 $visitedcontigs{$contigends[$i]{$key3}} = $contigends[$i]{$key3}; #1 this array keeps track of adjacent contigs that we have already visited, thus saving computational time and potential redundancies# | |
5200 # print "just checking the hash visitedcontigs: ",$visitedcontigs{$contigends[$i]{$key3}} ,"\n" if $prinkter == 1; | |
5201 | |
5202 #1 extract coordinates of the last cluster of this found alignment block | |
5203 # print "key of the found alignment block = ", $contigends[$i]{$key3},"\n" if $prinkter == 1; | |
5204 # print "we are trying to mine: contigclustersAllLastEndLengthOnly_raw: $contigends[$i]{$key3}: $i \n" if $prinkter == 1; | |
5205 # print "EXISTS\n" if exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1; | |
5206 # print "does NOT EXIST\n" if !exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1; | |
5207 my @contigclustersAllFirstStartLengthOnly_raw=@{$contigclustersFirstStartLengthOnly{$key2}}; | |
5208 my @contigclustersAllLastEndLengthOnly_raw=@{$contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}}; | |
5209 my @contigclustersAllFirstStartLengthOnly=(); my @contigclustersAllLastEndLengthOnly=(); | |
5210 | |
5211 for my $val (0 ... $#contigclustersAllFirstStartLengthOnly_raw){ | |
5212 # print "val = $val\n" if $prinkter == 1; | |
5213 if (defined $contigclustersAllFirstStartLengthOnly_raw[$val]){ | |
5214 push(@contigclustersAllFirstStartLengthOnly, $contigclustersAllFirstStartLengthOnly_raw[$val]) if $contigclustersAllFirstStartLengthOnly_raw[$val] =~ /[0-9]+/; | |
5215 } | |
5216 } | |
5217 # print "-----\n" if $prinkter == 1; | |
5218 for my $val (0 ... $#contigclustersAllLastEndLengthOnly_raw){ | |
5219 # print "val = $val\n" if $prinkter == 1; | |
5220 if (defined $contigclustersAllLastEndLengthOnly_raw[$val]){ | |
5221 push(@contigclustersAllLastEndLengthOnly, $contigclustersAllLastEndLengthOnly_raw[$val]) if $contigclustersAllLastEndLengthOnly_raw[$val] =~ /[0-9]+/; | |
5222 } | |
5223 } | |
5224 | |
5225 | |
5226 # print "our two arrays are: starts = <@contigclustersAllFirstStartLengthOnly> ......... and ends = <@contigclustersAllLastEndLengthOnly>\n" if $prinkter == 1; | |
5227 # print "the last cluster's end in that one is: ",smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly)," = ", smallest_number(@contigclustersAllFirstStartLengthOnly)," + ",smallest_number(@contigclustersAllLastEndLengthOnly),"\n" if $prinkter == 1; | |
5228 | |
5229 # if ($contigclustersFirstStartLengthOnly{$key2}[$i] + $contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}[$i] < 50){ | |
5230 if (smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly) < $CLUSTER_DIST){ | |
5231 my @regurgitate = @{$contigclusters{$contigends[$i]{$key3}}}; | |
5232 $regurgitate[$#regurgitate]=~s/\n//g; | |
5233 $regurgitate[$#regurgitate] = $regurgitate[$#regurgitate]."\t".shift(@clusters); | |
5234 delete $contigclusters{$contigends[$i]{$key3}}; | |
5235 $contigclusters{$contigends[$i]{$key3}}=[ @regurgitate ]; | |
5236 delete $contigclusters{$key2}; | |
5237 $contigclusters{$key2}= [ @clusters ] if scalar(@clusters) >0; | |
5238 $contigclusters{$key2}= [ "" ] if scalar(@clusters) ==0; | |
5239 | |
5240 if (scalar(@clusters) < 1){ | |
5241 # print "$key2-> $clusteringpaths{$key2} in the loners\n" if exists $foundkeys{$key2}; | |
5242 $clusteringpaths{$key2}=$contigends[$i]{$key3}; | |
5243 $clusteringpathsRev{$contigends[$i]{$key3}}=$key2; | |
5244 print OUTP "$contigends[$i]{$key3} -> $clusteringpathsRev{$contigends[$i]{$key3}}\n"; | |
5245 # print " clusteringpaths $key2 -> $contigends[$i]{$key3}\n"; | |
5246 $founkeys_enteredcount-- if exists $foundkeys{$key2}; | |
5247 $existing_removed++ if exists $foundkeys{$key2}; | |
5248 # print "$key2->",@{$contigclusters{$key2}},"->>$foundkeys{$key2}\n" if exists $foundkeys{$key2} && $prinkter == 1; | |
5249 delete $foundkeys{$key2} if exists $foundkeys{$key2}; | |
5250 $complete_transfered++; | |
5251 } | |
5252 else{ | |
5253 print OUTP "$key2-> 0 not so lonely\n" if !exists $clusteringpathsRev{$key2}; | |
5254 $clusteringpaths{$key2}=$key2 if !exists $clusteringpaths{$key2}; | |
5255 $clusteringpathsRev{$key2}=0 if !exists $clusteringpathsRev{$key2}; | |
5256 | |
5257 $founkeys_enteredcount++ if !exists $foundkeys{$key2}; | |
5258 $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2}; | |
5259 # print "adding foundkeys entry $foundkeys{$key2}\n"; | |
5260 $transfered++; | |
5261 } | |
5262 #$contigclusters{$key2}=[ @contigcluster ]; | |
5263 } | |
5264 } | |
5265 else{ | |
5266 # print "adjacent block with species $tags[$i] does not exist\n" if $prinkter == 1; | |
5267 $plain_transfered++; | |
5268 print OUTP "$key2-> 0 , going straight\n" if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2}; | |
5269 $clusteringpaths{$key2}=$key2 if exists $contigclusters{$key2} && !exists $clusteringpaths{$key2}; | |
5270 $clusteringpathsRev{$key2}=0 if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2}; | |
5271 $founkeys_enteredcount++ if !exists $foundkeys{$key2} && exists $contigclusters{$key2}; | |
5272 $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2} && exists $contigclusters{$key2}; | |
5273 # print "adding foundkeys entry $foundkeys{$key2}\n"; | |
5274 | |
5275 } | |
5276 $totalcount++; | |
5277 | |
5278 } | |
5279 | |
5280 | |
5281 } | |
5282 close BO; | |
5283 #close (NORTH); | |
5284 #/////////////////////////////////////////////////////////////////////////////////////// | |
5285 #/////////////////////////////////////////////////////////////////////////////////////// | |
5286 #/////////////////////////////////////////////////////////////////////////////////////// | |
5287 #/////////////////////////////////////////////////////////////////////////////////////// | |
5288 | |
5289 my $founkeys_count=(); | |
5290 my $nopath_count=(); | |
5291 my $pathed_count=0; | |
5292 foreach my $key2 (keys %foundkeys){ | |
5293 #print "x" x 60, "\n"; | |
5294 # print "x" if $dotcounter % 100 ==0; | |
5295 # print "\n" if $dotcounter % 5000 ==0; | |
5296 $founkeys_count++; | |
5297 my $key = $key2; | |
5298 # print "$key2 -> $clusteringpaths{$key2}\n" if $prinkter == 1; | |
5299 if ($clusteringpaths{$key} eq $key){ | |
5300 # print "printing hit the alignment block immediately... no path needed\n" if $prinkter == 1; | |
5301 $nopath_count++; | |
5302 delete $foundkeys{$key2}; | |
5303 print ORTH join ("\n",@{$contigclusters{$key2}}),"\n"; | |
5304 } | |
5305 else{ | |
5306 my @pool=(); | |
5307 my $key3=(); | |
5308 $pathed_count++; | |
5309 # print "going reverse... clusteringpathsRev, $key = $clusteringpathsRev{$key}\n" if exists $clusteringpathsRev{$key} && $prinkter == 1; | |
5310 # print "going reverse... clusteringpathsRev $key does not exist\n" if !exists $clusteringpathsRev{$key} && $prinkter == 1; | |
5311 if ($clusteringpathsRev{$key} eq "0") { | |
5312 next; | |
5313 } | |
5314 else{ | |
5315 my $yek3 = $clusteringpathsRev{$key}; | |
5316 my $yek = $key; | |
5317 # print "caught in the middle of a path, now goin down from $yek to $yek3, which is $clusteringpathsRev{$key} \n" if $prinkter == 1; | |
5318 while ($yek3 ne "0"){ | |
5319 # print "$yek->$yek3," if $prinkter == 1; | |
5320 $yek = $yek3; | |
5321 $yek3 = $clusteringpathsRev{$yek}; | |
5322 } | |
5323 # print "\nfinally reached the end of path: $yek3, and the next in line is $yek, and its up-route is $clusteringpaths{$yek}\n" if $prinkter == 1; | |
5324 $key3 = $clusteringpaths{$yek}; | |
5325 $key = $yek; | |
5326 } | |
5327 | |
5328 # print "now that we are at bottom of the path, lets start climbing up again\n" if $prinkter == 1; | |
5329 | |
5330 while($key ne $key3){ | |
5331 # print "KEEY $key->$key3\n" if $prinkter == 1; | |
5332 # print "our contigcluster = @{$contigclusters{$key}}\n----------\n" if $prinkter == 1; | |
5333 | |
5334 if (scalar(@{$contigclusters{$key}}) > 0) {push @pool, @{$contigclusters{$key}}; | |
5335 # print "now pool = @pool\n" if $prinkter == 1; | |
5336 } | |
5337 delete $foundkeys{$key3}; | |
5338 $key=$key3; | |
5339 $key3=$clusteringpaths{$key}; | |
5340 } | |
5341 # print "\nfinally, adding the first element of path: @{$contigclusters{$key}}\n AND printing the contents:\n" if $prinkter == 1; | |
5342 my @firstcontig= @{$contigclusters{$key}}; | |
5343 delete $foundkeys{$key2} if exists $foundkeys{$key2} ; | |
5344 delete $foundkeys{$key} if exists $foundkeys{$key}; | |
5345 | |
5346 unshift @pool, pop @firstcontig; | |
5347 # print join("\t",@pool),"\n" if $prinkter == 1; | |
5348 print ORTH join ("\n",@firstcontig),"\n" if scalar(@firstcontig) > 0; | |
5349 print ORTH join ("\t",@pool),"\n"; | |
5350 # join(); | |
5351 } | |
5352 | |
5353 } | |
5354 #close (NORTH); | |
5355 # print "founkeys_entered =$founkeys_enteredcount, plain_transfered=$plain_transfered,existing_removed=$existing_removed,founkeys_count =$founkeys_count, nopath_count =$nopath_count, transfered = $transfered, complete_transfered = $complete_transfered, totalcount = $totalcount, pathed=$pathed_count\n" if $prinkter == 1; | |
5356 close (BO); | |
5357 close (ORTH); | |
5358 close (OUTP); | |
5359 return 1; | |
5360 | |
5361 } | |
5362 sub stringPainter{ | |
5363 my @string = split(/_C0C_/,$_[0]); | |
5364 # print $_[0], " <- in stringPainter\n"; | |
5365 # print $_[1], " <- in clusters\n"; | |
5366 | |
5367 my @clusters = split(/,/, $_[1]); | |
5368 for my $i (0 ... $#clusters){ | |
5369 my $cluster = $clusters[$i]; | |
5370 # print "cluster = $cluster\n"; | |
5371 my @parts = split(/\./,$cluster); | |
5372 my @cord = split(/:|-/,shift(@parts)); | |
5373 my $minstart = $cord[1]; | |
5374 my $maxend = $cord[2]; | |
5375 # print "minstart = $minstart , maxend = $maxend\n"; | |
5376 | |
5377 for my $j (0 ... $#parts){ | |
5378 # print "oing thri $parts[$j]\n"; | |
5379 my @cord = split(/:|-/,$parts[$j]); | |
5380 $minstart = $cord[1] if $cord[1] < $minstart; | |
5381 $maxend = $cord[2] if $cord[2] > $maxend; | |
5382 } | |
5383 # print "minstart = $minstart , maxend = $maxend\n"; | |
5384 for my $pos ($minstart ... $maxend){ $string[$pos] = $string[$pos].",".$cluster;} | |
5385 | |
5386 | |
5387 } | |
5388 # print "@string <-done from function stringPainter\n"; | |
5389 return join("_S0S_",@string); | |
5390 } | |
5391 | |
5392 sub findClusters{ | |
5393 my $continue = 0; | |
5394 my @mapped_clusters = (); | |
5395 my $clusterdist = $_[1]; | |
5396 my $previous = 'x'; | |
5397 my @localcluster = (); | |
5398 my $cluster_starts = (); | |
5399 my $cluster_ends = (); | |
5400 my $localcluster_start = (); | |
5401 my $localcluster_end = (); | |
5402 my @record_cluster = (); | |
5403 my @string = split(/\!/, $_[0]); | |
5404 my $zerolength=0; | |
5405 | |
5406 for my $pos_pos (1 ... $#string){ | |
5407 my $pos = $string[$pos_pos]; | |
5408 # print $pos, "\n"; | |
5409 if ($continue == 0 && $pos eq "x") {next;} | |
5410 | |
5411 if ($continue == 1 && $pos eq "x" && $zerolength <= $clusterdist){ | |
5412 if ($zerolength == 0) {$localcluster_end = $pos_pos-1}; | |
5413 $zerolength++; | |
5414 $continue = 1; | |
5415 } | |
5416 | |
5417 if ($continue == 1 && $pos eq "x" && $zerolength > $clusterdist) { | |
5418 $zerolength = 0; | |
5419 $continue = 0; | |
5420 my %seen; | |
5421 my @uniqed = grep !$seen{$_}++, @localcluster; | |
5422 # print "caught cluster : @uniqed \n"; | |
5423 push(@mapped_clusters, [@uniqed]); | |
5424 # print "clustered:\n@uniqed\n"; | |
5425 @localcluster = (); | |
5426 @record_cluster = (); | |
5427 | |
5428 } | |
5429 | |
5430 if ($pos ne "x"){ | |
5431 $zerolength = 0; | |
5432 $continue = 1; | |
5433 $pos =~ s/x,//g; | |
5434 my @entries = split(/,/,$pos); | |
5435 $localcluster_end = 0; | |
5436 $localcluster_start = 0; | |
5437 push(@record_cluster,$pos); | |
5438 | |
5439 if ($continue == 0){ | |
5440 @localcluster = (); | |
5441 @localcluster = (@localcluster, @entries); | |
5442 $localcluster_start = $pos_pos; | |
5443 } | |
5444 | |
5445 if ($continue == 1 ) { | |
5446 @localcluster = (@localcluster, @entries); | |
5447 } | |
5448 } | |
5449 } | |
5450 | |
5451 if (scalar(@localcluster) > 0){ | |
5452 my %seen; | |
5453 my @uniqed = grep !$seen{$_}++, @localcluster; | |
5454 # print "caught cluster : @uniqed \n"; | |
5455 push(@mapped_clusters, [@uniqed]); | |
5456 # print "clustered:\n@uniqed\n"; | |
5457 @localcluster = (); | |
5458 @record_cluster = (); | |
5459 } | |
5460 | |
5461 my @returner = (); | |
5462 | |
5463 foreach my $clust (@mapped_clusters){ | |
5464 my @localclust = @$clust; | |
5465 my @result = (); | |
5466 foreach my $clustparts (@localclust){ | |
5467 push(@result,$clustparts); | |
5468 } | |
5469 push(@returner , join(".",@result)); | |
5470 } | |
5471 # print "returnig: ", join(",",@returner), "\n"; | |
5472 return join(",",@returner); | |
5473 } | |
5474 #xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx | |
5475 | |
5476 #xxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx | |
5477 | |
5478 sub MakeTrees{ | |
5479 my $tree = $_[0]; | |
5480 my @parts=($tree); | |
5481 # my @parts=(); | |
5482 | |
5483 while (1){ | |
5484 $tree =~ s/^\(//g; | |
5485 $tree =~ s/\)$//g; | |
5486 my @arr = (); | |
5487 | |
5488 if ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){ | |
5489 @arr = $tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/; | |
5490 $tree = $2; | |
5491 push @parts, $tree; | |
5492 } | |
5493 elsif ($tree =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){ | |
5494 @arr = $tree =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/; | |
5495 $tree = $1; | |
5496 push @parts, $tree; | |
5497 } | |
5498 elsif ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){ | |
5499 last; | |
5500 } | |
5501 } | |
5502 return @parts; | |
5503 } | |
5504 | |
5505 #xxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx | |
5506 | |
5507 sub qualityFilter{ | |
5508 my $unmaskedorthfile = $_[0]; | |
5509 my $seqfile = $_[1]; | |
5510 my $maskedorthfile = $_[2]; | |
5511 my $filteredout = $maskedorthfile."_residue"; | |
5512 open (PMORTH, "<$unmaskedorthfile") or die "Cannot open unmaskedorthfile file: $unmaskedorthfile: $!"; | |
5513 | |
5514 my %keyhash = (); | |
5515 | |
5516 while (my $line = <PMORTH>){ | |
5517 my $key = join("\t", $1,$2,$3,$4) if $line =~ /($focalspec)\s+([a-zA-Z0-9\-_]+)\s+([0-9]+)\s+([0-9]+)/; | |
5518 push @{$keyhash{$key}}, $line; | |
5519 } | |
5520 | |
5521 open (SEQ, "<$seqfile") or die "Cannot open seqfile file: $seqfile: $!"; | |
5522 open (MORTH, ">$maskedorthfile") or die "Cannot open maskedorthfile file: $maskedorthfile: $!"; | |
5523 open (RES, ">$filteredout") or die "Cannot open filteredout file: $filteredout: $!"; | |
5524 | |
5525 | |
5526 | |
5527 while (my $line = <SEQ>){ | |
5528 chomp $line; | |
5529 if ($line =~ /($focalspec)\s+([a-zA-Z0-9\-_]+)\s+([0-9]+)\s+([0-9]+)/){ | |
5530 my $key = join("\t", $1,$2,$3,$4); | |
5531 next if !exists $keyhash{$key}; | |
5532 my @orths = @{$keyhash{$key}} if exists $keyhash{$key}; | |
5533 delete $keyhash{$key}; | |
5534 | |
5535 my $sine = <SEQ>; | |
5536 | |
5537 foreach my $orth (@orths){ | |
5538 #print "-----------------------------------------------------------------\n"; | |
5539 #print $orth; | |
5540 my $orthcopy = $orth; | |
5541 $orth =~ s/^>//; | |
5542 my @parts = split(/>/,$orth); | |
5543 | |
5544 my @starts = (); | |
5545 my @ends = (); | |
5546 | |
5547 foreach my $part (@parts){ | |
5548 my $no_of_species = adjustCoordinates($part); | |
5549 my @pields = split(/\t/,$part); | |
5550 | |
5551 # print "pields = @pields .. no_of_species = $no_of_species .. startcord = $pields[$startcord]\n"; | |
5552 | |
5553 push @starts, $pields[$startcord]; | |
5554 push @ends, $pields[$endcord]; | |
5555 } | |
5556 | |
5557 #print "starts = @starts ... ends = @ends\n"; | |
5558 | |
5559 my $leftend = smallest_number(@starts)-10; | |
5560 my $rightend = largest_number(@ends)+10; | |
5561 | |
5562 my $maskarea = substr($sine, $leftend, $rightend-$leftend+1); | |
5563 print RES $orth if $maskarea =~ /#/; | |
5564 | |
5565 | |
5566 next if $maskarea =~ /#/; | |
5567 | |
5568 print MORTH $orthcopy; | |
5569 } | |
5570 } | |
5571 else{ | |
5572 next; | |
5573 } | |
5574 | |
5575 | |
5576 } | |
5577 | |
5578 # print "UNDONE: ", scalar(keys %keyhash),"\n"; | |
5579 # print MORTH "UNDONE: ", scalar(keys %keyhash),"\n"; | |
5580 | |
5581 } | |
5582 | |
5583 sub adjustCoordinates{ | |
5584 my $line = $_[0]; | |
5585 my $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)|(scaffold[0-9a-zA-Z\._\-]+)|(supercontig[0-9a-zA-Z\._\-]+)/x/ig; | |
5586 my @got = ($line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)/x/g); | |
5587 # print "line = $line\n"; | |
5588 $infocord = 2 + (4*$no_of_species) - 1; | |
5589 $typecord = 2 + (4*$no_of_species) + 1 - 1; | |
5590 $motifcord = 2 + (4*$no_of_species) + 2 - 1; | |
5591 $gapcord = $motifcord+1; | |
5592 $startcord = $gapcord+1; | |
5593 $strandcord = $startcord+1; | |
5594 $endcord = $strandcord + 1; | |
5595 $microsatcord = $endcord + 1; | |
5596 $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; | |
5597 $interr_poscord = $microsatcord + 3; | |
5598 $no_of_interruptionscord = $microsatcord + 4; | |
5599 $interrcord = $microsatcord + 2; | |
5600 # print "$line\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\n" if $line !~ /calJac/i; | |
5601 return $no_of_species; | |
5602 } | |
5603 | |
5604 | |
5605 | |
5606 |