Mercurial > repos > devteam > indels_3way
comparison parseMAF_smallIndels.pl @ 0:5ad24b81dd10 draft default tip
Uploaded tool tarball.
author | devteam |
---|---|
date | Wed, 25 Sep 2013 11:24:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5ad24b81dd10 |
---|---|
1 #!/usr/bin/perl -w | |
2 # a program to get indels | |
3 # input is a MAF format 3-way alignment file | |
4 # from 3-way blocks only at this time | |
5 # translate seq2, seq3, etc coordinates to + if align orient is reverse complement | |
6 | |
7 use strict; | |
8 use warnings; | |
9 | |
10 # declare and initialize variables | |
11 my $fh; # variable to store filehandle | |
12 my $record; | |
13 my $offset; | |
14 my $library = $ARGV[0]; | |
15 my $count = 0; | |
16 my $count2 = 0; | |
17 my $count3 = 0; | |
18 my $count4 = 0; | |
19 my $start1 = my $start2 = my $start3 = my $start4 = my $start5 = my $start6 = 0; | |
20 my $orient = ""; | |
21 my $outgroup = $ARGV[2]; | |
22 my $ingroup1 = my $ingroup2 = ""; | |
23 my $count_seq1insert = my $count_seq1delete = 0; | |
24 my $count_seq2insert = my $count_seq2delete = 0; | |
25 my $count_seq3insert = my $count_seq3delete = 0; | |
26 my @seq1_insert_lengths = my @seq1_delete_lengths = (); | |
27 my @seq2_insert_lengths = my @seq2_delete_lengths = (); | |
28 my @seq3_insert_lengths = my @seq3_delete_lengths = (); | |
29 my @seq1_insert = my @seq1_delete = my @seq2_insert = my @seq2_delete = my @seq3_insert = my @seq3_delete = (); | |
30 my @seq1_insert_startOnly = my @seq1_delete_startOnly = my @seq2_insert_startOnly = my @seq2_delete_startOnly = (); | |
31 my @seq3_insert_startOnly = my @seq3_delete_startOnly = (); | |
32 my @indels = (); | |
33 | |
34 # check to make sure correct files | |
35 my $usage = "usage: parseMAF_smallIndels.pl [MAF.in] [small_Indels_summary.out] [outgroup]\n"; | |
36 die $usage unless @ARGV == 3; | |
37 | |
38 # perform some standard subroutines | |
39 $fh = open_file($library); | |
40 | |
41 $offset = tell($fh); | |
42 | |
43 #my $ofile = $ARGV[2]; | |
44 #unless (open(OFILE, ">$ofile")){ | |
45 # print "Cannot open output file \"$ofile\"\n\n"; | |
46 # exit; | |
47 #} | |
48 | |
49 my $ofile2 = $ARGV[1]; | |
50 unless (open(OFILE2, ">$ofile2")){ | |
51 print "Cannot open output file \"$ofile2\"\n\n"; | |
52 exit; | |
53 } | |
54 | |
55 | |
56 # header line for output files | |
57 #print OFILE "# small indel events, parsed from MAF 3-way alignment file, coords are translated from (-) to (+) if necessary\n"; | |
58 #print OFILE "#align\tingroup1\tingroup1_coord\tingroup1_orient\tingroup2\tingroup2_coord\tingroup2_orient\toutgroup\toutgroup_coord\toutgroup_orient\tindel_type\n"; | |
59 | |
60 #print OFILE2 "# small indels summary, parsed from MAF 3-way alignment file, coords are translated from (-) to (+) if necessary\n"; | |
61 print OFILE2 "#block\tindel_type\tindel_length\tingroup1\tingroup1_start\tingroup1_end\tingroup1_alignSize\tingroup1_orient\tingroup2\tingroup2_start\tingroup2_end\tingroup2_alignSize\tingroup2_orient\toutgroup\toutgroup_start\toutgroup_end\toutgroup_alignSize\toutgroup_orient\n"; | |
62 | |
63 # main body of program | |
64 while ($record = get_next_record($fh) ){ | |
65 if ($record=~ m/\s*##maf(.*)\s*# maf/s){ | |
66 next; | |
67 } | |
68 | |
69 my @sequences = get_sequences_within_block($record); | |
70 my @seq_info = get_indels_within_block(@sequences); | |
71 get_indels_lengths(@seq_info); | |
72 | |
73 $offset = tell($fh); | |
74 $count++; | |
75 | |
76 } | |
77 | |
78 get_starts_only(@seq1_insert); | |
79 get_starts_only(@seq1_delete); | |
80 get_starts_only(@seq2_insert); | |
81 get_starts_only(@seq2_delete); | |
82 get_starts_only(@seq3_insert); | |
83 get_starts_only(@seq3_delete); | |
84 | |
85 # print some things to keep track of progress | |
86 #print "# $library\n"; | |
87 #print "# number of records = $count\n"; | |
88 #print "# number of sequence \"s\" lines = $count2\n"; | |
89 if ($count3 != 0){ | |
90 print "Skipped $count3 blocks with only 2 seqs;\n"; | |
91 } | |
92 #print "# number of records with only h-m = $count4\n\n"; | |
93 | |
94 print "Ingroup1 = $ingroup1; Ingroup2 = $ingroup2; Outgroup = $outgroup;\n"; | |
95 print "# of ingroup1 inserts = $count_seq1insert;\n"; | |
96 print "# of ingroup1 deletes = $count_seq1delete;\n"; | |
97 print "# of ingroup2 inserts = $count_seq2insert;\n"; | |
98 print "# of ingroup2 deletes = $count_seq2delete;\n"; | |
99 print "# of outgroup3 inserts = $count_seq3insert;\n"; | |
100 print "# of outgroup3 deletes = $count_seq3delete\n"; | |
101 | |
102 | |
103 #close OFILE; | |
104 | |
105 if ($count == $count3){ | |
106 print STDERR "Skipped all blocks since none of them contain 3-way alignments.\n"; | |
107 exit -1; | |
108 } | |
109 | |
110 ###################SUBROUTINES##################################### | |
111 | |
112 # subroutine to open file | |
113 sub open_file { | |
114 my($filename) = @_; | |
115 my $fh; | |
116 | |
117 unless (open($fh, $filename)){ | |
118 print "Cannot open file $filename\n"; | |
119 exit; | |
120 } | |
121 return $fh; | |
122 } | |
123 | |
124 # get next record | |
125 sub get_next_record { | |
126 my ($fh) = @_; | |
127 my ($offset); | |
128 my ($record) = ""; | |
129 my ($save_input_separator) = $/; | |
130 | |
131 $/ = "a score"; | |
132 | |
133 $record = <$fh>; | |
134 | |
135 $/ = $save_input_separator; | |
136 return $record; | |
137 } | |
138 | |
139 # get header info | |
140 sub get_sequences_within_block{ | |
141 my (@alignment) = @_; | |
142 my @lines = (); | |
143 | |
144 my @sequences = (); | |
145 | |
146 @lines = split ("\n", $record); | |
147 foreach (@lines){ | |
148 chomp($_); | |
149 if (m/^\s*$/){ | |
150 next; | |
151 } | |
152 elsif (m/^\s*=(\d+\.*\d*)/){ | |
153 | |
154 }elsif (m/^\s*s(.*)$/){ | |
155 $count2++; | |
156 | |
157 push (@sequences,$_); | |
158 } | |
159 } | |
160 return @sequences; | |
161 } | |
162 | |
163 sub get_indels_within_block{ | |
164 my (@sequences) = @_; | |
165 my $line1 = my $line2 = my $line3 = ""; | |
166 my @line1 = my @line2 = my @line3 = (); | |
167 my $score = 0; | |
168 my $start1 = my $align_length1 = my $end1 = my $seq_length1 = 0; | |
169 my $start2 = my $align_length2 = my $end2 = my $seq_length2 = 0; | |
170 my $start3 = my $align_length3 = my $end3 = my $seq_length3 = 0; | |
171 my $seq1 = my $orient1 = ""; | |
172 my $seq2 = my $orient2 = ""; | |
173 my $seq3 = my $orient3 = ""; | |
174 my $start1_plus = my $end1_plus = 0; | |
175 my $start2_plus = my $end2_plus = 0; | |
176 my $start3_plus = my $end3_plus = 0; | |
177 my @test = (); | |
178 my $test = ""; | |
179 my $header = ""; | |
180 my @header = (); | |
181 my $sequence1 = my $sequence2 = my $sequence3 =""; | |
182 my @array_return = (); | |
183 my $test1 = 0; | |
184 my $line1_stat = my $line2_stat = my $line3_stat = ""; | |
185 | |
186 # process 3-way blocks only | |
187 if (scalar(@sequences) == 3){ | |
188 $line1 = $sequences[0]; | |
189 chomp ($line1); | |
190 $line2 = $sequences[1]; | |
191 chomp ($line2); | |
192 $line3 = $sequences[2]; | |
193 chomp ($line3); | |
194 # check order of sequences and assign uniformly seq1= human, seq2 = chimp, seq3 = macaque | |
195 if ($line1 =~ m/$outgroup/){ | |
196 $line1_stat = "out"; | |
197 $line2=~ s/^\s*//; | |
198 $line2 =~ s/\s+/\t/g; | |
199 @line2 = split(/\t/, $line2); | |
200 if (($ingroup1 eq "") || ($line2[1] =~ m/$ingroup1/)){ | |
201 $line2_stat = "in1"; | |
202 $line3_stat = "in2"; | |
203 } | |
204 else{ | |
205 $line3_stat = "in1"; | |
206 $line2_stat = "in2"; } | |
207 } | |
208 elsif ($line2 =~ m/$outgroup/){ | |
209 $line2_stat = "out"; | |
210 $line1=~ s/^\s*//; | |
211 $line1 =~ s/\s+/\t/g; | |
212 @line1 = split(/\t/, $line1); | |
213 if (($ingroup1 eq "") || ($line1[1] =~ m/$ingroup1/)){ | |
214 $line1_stat = "in1"; | |
215 $line3_stat = "in2"; | |
216 } | |
217 else{ | |
218 $line3_stat = "in1"; | |
219 $line1_stat = "in2"; } | |
220 } | |
221 elsif ($line3 =~ m/$outgroup/){ | |
222 $line3_stat = "out"; | |
223 $line1=~ s/^\s*//; | |
224 $line1 =~ s/\s+/\t/g; | |
225 @line1 = split(/\t/, $line1); | |
226 if (($ingroup1 eq "") || ($line1[1] =~ m/$ingroup1/)){ | |
227 $line1_stat = "in1"; | |
228 $line2_stat = "in2"; | |
229 } | |
230 else{ | |
231 $line2_stat = "in1"; | |
232 $line1_stat = "in2"; } | |
233 } | |
234 | |
235 #print "# l1 = $line1_stat\n"; | |
236 #print "# l2 = $line2_stat\n"; | |
237 #print "# l3 = $line3_stat\n"; | |
238 my $linei1 = my $linei2 = my $lineo = ""; | |
239 my @linei1 = my @linei2 = my @lineo = (); | |
240 | |
241 if ($line1_stat eq "out"){ | |
242 $lineo = $line1; | |
243 } | |
244 elsif ($line1_stat eq "in1"){ | |
245 $linei1 = $line1; | |
246 } | |
247 else{ | |
248 $linei2 = $line1; | |
249 } | |
250 | |
251 if ($line2_stat eq "out"){ | |
252 $lineo = $line2; | |
253 } | |
254 elsif ($line2_stat eq "in1"){ | |
255 $linei1 = $line2; | |
256 } | |
257 else{ | |
258 $linei2 = $line2; | |
259 } | |
260 | |
261 if ($line3_stat eq "out"){ | |
262 $lineo = $line3; | |
263 } | |
264 elsif ($line3_stat eq "in1"){ | |
265 $linei1 = $line3; | |
266 } | |
267 else{ | |
268 $linei2 = $line3; | |
269 } | |
270 | |
271 $linei1=~ s/^\s*//; | |
272 $linei1 =~ s/\s+/\t/g; | |
273 @linei1 = split(/\t/, $linei1); | |
274 $end1 =($linei1[2]+$linei1[3]-1); | |
275 $seq1 = $linei1[1].":".$linei1[3]; | |
276 $ingroup1 = (split(/\./, $seq1))[0]; | |
277 $start1 = $linei1[2]; | |
278 $align_length1 = $linei1[3]; | |
279 $orient1 = $linei1[4]; | |
280 $seq_length1 = $linei1[5]; | |
281 $sequence1 = $linei1[6]; | |
282 $test1 = length($sequence1); | |
283 my $total_length1 = $test1+$start1; | |
284 my @array1 = ($start1,$end1,$orient1,$seq_length1); | |
285 ($start1_plus, $end1_plus) = convert_coords(@array1); | |
286 | |
287 $linei2=~ s/^\s*//; | |
288 $linei2 =~ s/\s+/\t/g; | |
289 @linei2 = split(/\t/, $linei2); | |
290 $end2 =($linei2[2]+$linei2[3]-1); | |
291 $seq2 = $linei2[1].":".$linei2[3]; | |
292 $ingroup2 = (split(/\./, $seq2))[0]; | |
293 $start2 = $linei2[2]; | |
294 $align_length2 = $linei2[3]; | |
295 $orient2 = $linei2[4]; | |
296 $seq_length2 = $linei2[5]; | |
297 $sequence2 = $linei2[6]; | |
298 my $test2 = length($sequence2); | |
299 my $total_length2 = $test2+$start2; | |
300 my @array2 = ($start2,$end2,$orient2,$seq_length2); | |
301 ($start2_plus, $end2_plus) = convert_coords(@array2); | |
302 | |
303 $lineo=~ s/^\s*//; | |
304 $lineo =~ s/\s+/\t/g; | |
305 @lineo = split(/\t/, $lineo); | |
306 $end3 =($lineo[2]+$lineo[3]-1); | |
307 $seq3 = $lineo[1].":".$lineo[3]; | |
308 $start3 = $lineo[2]; | |
309 $align_length3 = $lineo[3]; | |
310 $orient3 = $lineo[4]; | |
311 $seq_length3 = $lineo[5]; | |
312 $sequence3 = $lineo[6]; | |
313 my $test3 = length($sequence3); | |
314 my $total_length3 = $test3+$start3; | |
315 my @array3 = ($start3,$end3,$orient3,$seq_length3); | |
316 ($start3_plus, $end3_plus) = convert_coords(@array3); | |
317 | |
318 #print "# l1 = $ingroup1\n"; | |
319 #print "# l2 = $ingroup2\n"; | |
320 #print "# l3 = $outgroup\n"; | |
321 | |
322 my $ABC = ""; | |
323 my $coord1 = my $coord2 = my $coord3 = 0; | |
324 $coord1 = $start1_plus; | |
325 $coord2 = $start2_plus; | |
326 $coord3 = $start3_plus; | |
327 | |
328 for (my $position = 0; $position < $test1; $position++) { | |
329 my $indelType = ""; | |
330 my $indel_line = ""; | |
331 # seq1 deletes | |
332 if ((substr($sequence1,$position,1) eq "-") | |
333 && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/) | |
334 && (substr($sequence3,$position,1) !~ m/[-*\#$?^@]/)){ | |
335 $ABC = join("",($ABC,"X")); | |
336 my @s = split(/:/, $seq1); | |
337 $indelType = $s[0]."_delete"; | |
338 | |
339 #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; | |
340 $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); | |
341 push (@indels,$indel_line); | |
342 push (@seq1_delete,$indel_line); | |
343 $coord2++; $coord3++; | |
344 } | |
345 # seq2 deletes | |
346 elsif ((substr($sequence1,$position,1) !~ m/[-*\#$?^@]/) | |
347 && (substr($sequence2,$position,1) eq "-") | |
348 && (substr($sequence3,$position,1) !~ m/[-*\$?^]/)){ | |
349 $ABC = join("",($ABC,"Y")); | |
350 my @s = split(/:/, $seq2); | |
351 $indelType = $s[0]."_delete"; | |
352 #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; | |
353 $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); | |
354 push (@indels,$indel_line); | |
355 push (@seq2_delete,$indel_line); | |
356 $coord1++; | |
357 $coord3++; | |
358 | |
359 } | |
360 # seq1 inserts | |
361 elsif ((substr($sequence1,$position,1) !~ m/[-*\#$?^@]/) | |
362 && (substr($sequence2,$position,1) eq "-") | |
363 && (substr($sequence3,$position,1) eq "-")){ | |
364 $ABC = join("",($ABC,"Z")); | |
365 my @s = split(/:/, $seq1); | |
366 $indelType = $s[0]."_insert"; | |
367 #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; | |
368 $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); | |
369 push (@indels,$indel_line); | |
370 push (@seq1_insert,$indel_line); | |
371 $coord1++; | |
372 } | |
373 # seq2 inserts | |
374 elsif ((substr($sequence1,$position,1) eq "-") | |
375 && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/) | |
376 && (substr($sequence3,$position,1) eq "-")){ | |
377 $ABC = join("",($ABC,"W")); | |
378 my @s = split(/:/, $seq2); | |
379 $indelType = $s[0]."_insert"; | |
380 #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; | |
381 $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); | |
382 push (@indels,$indel_line); | |
383 push (@seq2_insert,$indel_line); | |
384 $coord2++; | |
385 } | |
386 # seq3 deletes | |
387 elsif ((substr($sequence1,$position,1) !~ m/[-*\#$?^@]/) | |
388 && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/) | |
389 && (substr($sequence3,$position,1) eq "-")){ | |
390 $ABC = join("",($ABC,"S")); | |
391 my @s = split(/:/, $seq3); | |
392 $indelType = $s[0]."_delete"; | |
393 #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; | |
394 $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); | |
395 push (@indels,$indel_line); | |
396 push (@seq3_delete,$indel_line); | |
397 $coord1++; $coord2++; | |
398 } | |
399 # seq3 inserts | |
400 elsif ((substr($sequence1,$position,1) eq "-") | |
401 && (substr($sequence2,$position,1) eq "-") | |
402 && (substr($sequence3,$position,1) !~ m/[-*\#$?^@]/)){ | |
403 $ABC = join("",($ABC,"T")); | |
404 my @s = split(/:/, $seq3); | |
405 $indelType = $s[0]."_insert"; | |
406 #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; | |
407 $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); | |
408 push (@indels,$indel_line); | |
409 push (@seq3_insert,$indel_line); | |
410 $coord3++; | |
411 }else{ | |
412 $ABC = join("",($ABC,"N")); | |
413 $coord1++; $coord2++; $coord3++; | |
414 } | |
415 | |
416 } | |
417 @array_return=($seq1,$seq2,$seq3,$ABC); | |
418 return (@array_return); | |
419 | |
420 } | |
421 # ignore pairwise cases for now, just count the number of blocks | |
422 elsif (scalar(@sequences) == 2){ | |
423 my $ABC = ""; | |
424 my $coord1 = my $coord2 = my $coord3 = 0; | |
425 $count3++; | |
426 | |
427 $line1 = $sequences[0]; | |
428 $line2 = $sequences[1]; | |
429 chomp ($line1); | |
430 chomp ($line2); | |
431 | |
432 if ($line2 !~ m/$ingroup2/){ | |
433 $count4++; | |
434 } | |
435 } | |
436 } | |
437 | |
438 | |
439 sub get_indels_lengths{ | |
440 my (@array) = @_; | |
441 if (scalar(@array) == 4){ | |
442 my $seq1 = $array[0]; my $seq2 = $array[1]; my $seq3 = $array[2]; my $ABC = $array[3]; | |
443 | |
444 while ($ABC =~ m/(X+)/g) { | |
445 push (@seq1_delete_lengths,length($1)); | |
446 $count_seq1delete++; | |
447 } | |
448 | |
449 while ($ABC =~ m/(Y+)/g) { | |
450 push (@seq2_delete_lengths,length($1)); | |
451 $count_seq2delete++; | |
452 } | |
453 while ($ABC =~ m/(S+)/g) { | |
454 push (@seq3_delete_lengths,length($1)); | |
455 $count_seq3delete++; | |
456 } | |
457 while ($ABC =~ m/(Z+)/g) { | |
458 push (@seq1_insert_lengths,length($1)); | |
459 $count_seq1insert++; | |
460 } | |
461 while ($ABC =~ m/(W+)/g) { | |
462 push(@seq2_insert_lengths,length($1)); | |
463 $count_seq2insert++; | |
464 } | |
465 while ($ABC =~ m/(T+)/g) { | |
466 push (@seq3_insert_lengths,length($1)); | |
467 $count_seq3insert++; | |
468 } | |
469 } | |
470 elsif (scalar(@array) == 0){ | |
471 next; | |
472 } | |
473 | |
474 } | |
475 # convert to coordinates to + strand if align orient = - | |
476 sub convert_coords{ | |
477 my (@array) = @_; | |
478 my $s = $array[0]; | |
479 my $e = $array[1]; | |
480 my $o = $array[2]; | |
481 my $l = $array[3]; | |
482 my $start_plus = my $end_plus = 0; | |
483 | |
484 if ($o eq "-"){ | |
485 $start_plus = ($l - $e); | |
486 $end_plus = ($l - $s); | |
487 }elsif ($o eq "+"){ | |
488 $start_plus = $s; | |
489 $end_plus = $e; | |
490 } | |
491 | |
492 return ($start_plus, $end_plus); | |
493 } | |
494 | |
495 # get first line only for each event | |
496 sub get_starts_only{ | |
497 my (@test) = @_; | |
498 my $seq1 = my $seq2 = my $seq3 = my $indelType = my $old_seq1 = my $old_seq2 = my $old_seq3 = my $old_indelType = my $old_line = ""; | |
499 my $coord1 = my $coord2 = my $coord3 = my $old_coord1 = my $old_coord2 = my $old_coord3 = 0; | |
500 | |
501 my @matches = (); | |
502 my @seq1_insert = my @seq1_delete = my @seq2_insert = my @seq2_delete = my @seq3_insert = my @seq3_delete = (); | |
503 | |
504 | |
505 foreach my $line (@test){ | |
506 chomp($line); | |
507 $line =~ s/^\s*//; | |
508 $line =~ s/\s+/\t/g; | |
509 my @line1 = split(/\t/, $line); | |
510 $seq1 = $line1[1]; | |
511 $coord1 = $line1[2]; | |
512 $seq2 = $line1[4]; | |
513 $coord2 = $line1[5]; | |
514 $seq3 = $line1[7]; | |
515 $coord3 = $line1[8]; | |
516 $indelType = $line1[10]; | |
517 if ($indelType =~ m/$ingroup1/ && $indelType =~ m/insert/){ | |
518 if ($coord1 != $old_coord1+1 || ($coord2 != $old_coord2 || $coord3 != $old_coord3)){ | |
519 $start1++; | |
520 push (@seq1_insert_startOnly,$line); | |
521 } | |
522 } | |
523 elsif ($indelType =~ m/$ingroup1/ && $indelType =~ m/delete/){ | |
524 if ($coord1 != $old_coord1 || ($coord2 != $old_coord2+1 || $coord3 != $old_coord3+1)){ | |
525 $start2++; | |
526 push(@seq1_delete_startOnly,$line); | |
527 } | |
528 } | |
529 elsif ($indelType =~ m/$ingroup2/ && $indelType =~ m/insert/){ | |
530 if ($coord2 != $old_coord2+1 || ($coord1 != $old_coord1 || $coord3 != $old_coord3)){ | |
531 $start3++; | |
532 push(@seq2_insert_startOnly,$line); | |
533 } | |
534 } | |
535 elsif ($indelType =~ m/$ingroup2/ && $indelType =~ m/delete/){ | |
536 if ($coord2 != $old_coord2 || ($coord1 != $old_coord1+1 || $coord3 != $old_coord3+1)){ | |
537 $start4++; | |
538 push(@seq2_delete_startOnly,$line); | |
539 } | |
540 } | |
541 elsif ($indelType =~ m/$outgroup/ && $indelType =~ m/insert/){ | |
542 if ($coord3 != $old_coord3+1 || ($coord1 != $old_coord1 || $coord2 != $old_coord2)){ | |
543 $start5++; | |
544 push(@seq3_insert_startOnly,$line); | |
545 } | |
546 } | |
547 elsif ($indelType =~ m/$outgroup/ && $indelType =~ m/delete/){ | |
548 if ($coord3 != $old_coord3 || ($coord1 != $old_coord1+1 ||$coord2 != $old_coord2+1)){ | |
549 $start6++; | |
550 push(@seq3_delete_startOnly,$line); | |
551 } | |
552 } | |
553 $old_indelType = $indelType; | |
554 $old_seq1 = $seq1; | |
555 $old_coord1 = $coord1; | |
556 $old_seq2 = $seq2; | |
557 $old_coord2 = $coord2; | |
558 $old_seq3 = $seq3; | |
559 $old_coord3 = $coord3; | |
560 $old_line = $line; | |
561 } | |
562 } | |
563 # append lengths to each event start line | |
564 my $counter1; my $counter2; my $counter3; my $counter4; my $counter5; my $counter6; | |
565 my @final1 = my @final2 = my @final3 = my @final4 = my @final5 = my @final6 = (); | |
566 my $final_line1 = my $final_line2 = my $final_line3 = my $final_line4 = my $final_line5 = my $final_line6 = ""; | |
567 | |
568 | |
569 for ($counter1 = 0; $counter1 < @seq3_insert_startOnly; $counter1++){ | |
570 $final_line1 = join("\t",($seq3_insert_startOnly[$counter1],$seq3_insert_lengths[$counter1])); | |
571 push (@final1,$final_line1); | |
572 } | |
573 | |
574 for ($counter2 = 0; $counter2 < @seq3_delete_startOnly; $counter2++){ | |
575 $final_line2 = join("\t",($seq3_delete_startOnly[$counter2],$seq3_delete_lengths[$counter2])); | |
576 push(@final2,$final_line2); | |
577 } | |
578 | |
579 for ($counter3 = 0; $counter3 < @seq2_insert_startOnly; $counter3++){ | |
580 $final_line3 = join("\t",($seq2_insert_startOnly[$counter3],$seq2_insert_lengths[$counter3])); | |
581 push(@final3,$final_line3); | |
582 } | |
583 | |
584 for ($counter4 = 0; $counter4 < @seq2_delete_startOnly; $counter4++){ | |
585 $final_line4 = join("\t",($seq2_delete_startOnly[$counter4],$seq2_delete_lengths[$counter4])); | |
586 push(@final4,$final_line4); | |
587 } | |
588 | |
589 for ($counter5 = 0; $counter5 < @seq1_insert_startOnly; $counter5++){ | |
590 $final_line5 = join("\t",($seq1_insert_startOnly[$counter5],$seq1_insert_lengths[$counter5])); | |
591 push(@final5,$final_line5); | |
592 } | |
593 | |
594 for ($counter6 = 0; $counter6 < @seq1_delete_startOnly; $counter6++){ | |
595 $final_line6 = join("\t",($seq1_delete_startOnly[$counter6],$seq1_delete_lengths[$counter6])); | |
596 push(@final6,$final_line6); | |
597 } | |
598 | |
599 # format final output | |
600 # # if inserts, increase coords for the sequence inserted, other sequences give coords for 5' and 3' bases flanking the gap | |
601 # # for deletes, increase coords for other 2 sequences and the one deleted give coords for 5' and 3' bases flanking the gap | |
602 | |
603 get_final_format(@final5); | |
604 get_final_format(@final6); | |
605 get_final_format(@final3); | |
606 get_final_format(@final4); | |
607 get_final_format(@final1); | |
608 get_final_format(@final2); | |
609 | |
610 sub get_final_format{ | |
611 my (@final) = @_; | |
612 foreach (@final){ | |
613 my $event_line = $_; | |
614 my @events = split(/\t/, $event_line); | |
615 my $event_type = $events[10]; | |
616 my @name_align1 = split(/:/, $events[1]); | |
617 my @name_align2 = split(/:/, $events[4]); | |
618 my @name_align3 = split(/:/, $events[7]); | |
619 my $seq1_event_start = my $seq1_event_end = my $seq2_event_start = my $seq2_event_end = my $seq3_event_start = my $seq3_event_end = 0; | |
620 my $final_event_line = ""; | |
621 # seq1_insert | |
622 if ($event_type =~ m/$ingroup1/ && $event_type =~ m/insert/){ | |
623 # only increase coord for human | |
624 # remember that other two sequnences, the gap spans (coord - 1) --> coord | |
625 $seq1_event_start = ($events[2]); | |
626 $seq1_event_end = ($events[2]+$events[11]-1); | |
627 $seq2_event_start = ($events[5]-1); | |
628 $seq2_event_end = ($events[5]); | |
629 $seq3_event_start = ($events[8]-1); | |
630 $seq3_event_end = ($events[8]); | |
631 $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); | |
632 } | |
633 # seq1_delete | |
634 elsif ($event_type =~ m/$ingroup1/ && $event_type =~ m/delete/){ | |
635 # only increase coords for seq2 and seq3 | |
636 # remember for seq1, the gap spans (coord - 1) --> coord | |
637 $seq1_event_start = ($events[2]-1); | |
638 $seq1_event_end = ($events[2]); | |
639 $seq2_event_start = ($events[5]); | |
640 $seq2_event_end = ($events[5]+$events[11]-1); | |
641 $seq3_event_start = ($events[8]); | |
642 $seq3_event_end = ($events[8]+$events[11]-1); | |
643 $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); | |
644 } | |
645 # seq2_insert | |
646 elsif ($event_type =~ m/$ingroup2/ && $event_type =~ m/insert/){ | |
647 # only increase coords for seq2 | |
648 # remember that other two sequnences, the gap spans (coord - 1) --> coord | |
649 $seq1_event_start = ($events[2]-1); | |
650 $seq1_event_end = ($events[2]); | |
651 $seq2_event_start = ($events[5]); | |
652 $seq2_event_end = ($events[5]+$events[11]-1); | |
653 $seq3_event_start = ($events[8]-1); | |
654 $seq3_event_end = ($events[8]); | |
655 $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); | |
656 } | |
657 # seq2_delete | |
658 elsif ($event_type =~ m/$ingroup2/ && $event_type =~ m/delete/){ | |
659 # only increase coords for seq1 and seq3 | |
660 # remember for seq2, the gap spans (coord - 1) --> coord | |
661 $seq1_event_start = ($events[2]); | |
662 $seq1_event_end = ($events[2]+$events[11]-1); | |
663 $seq2_event_start = ($events[5]-1); | |
664 $seq2_event_end = ($events[5]); | |
665 $seq3_event_start = ($events[8]); | |
666 $seq3_event_end = ($events[8]+$events[11]-1); | |
667 $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); | |
668 } | |
669 # start testing w/seq3_insert | |
670 elsif ($event_type =~ m/$outgroup/ && $event_type =~ m/insert/){ | |
671 # only increase coord for rheMac | |
672 # remember that other two sequnences, the gap spans (coord - 1) --> coord | |
673 $seq1_event_start = ($events[2]-1); | |
674 $seq1_event_end = ($events[2]); | |
675 $seq2_event_start = ($events[5]-1); | |
676 $seq2_event_end = ($events[5]); | |
677 $seq3_event_start = ($events[8]); | |
678 $seq3_event_end = ($events[8]+$events[11]-1); | |
679 $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); | |
680 } | |
681 # seq3_delete | |
682 elsif ($event_type =~ m/$outgroup/ && $event_type =~ m/delete/){ | |
683 # only increase coords for seq1 and seq2 | |
684 # remember for seq3, the gap spans (coord - 1) --> coord | |
685 $seq1_event_start = ($events[2]); | |
686 $seq1_event_end = ($events[2]+$events[11]-1); | |
687 $seq2_event_start = ($events[5]); | |
688 $seq2_event_end = ($events[5]+$events[11]-1); | |
689 $seq3_event_start = ($events[8]-1); | |
690 $seq3_event_end = ($events[8]); | |
691 $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); | |
692 | |
693 } | |
694 | |
695 print OFILE2 "$final_event_line\n"; | |
696 } | |
697 } | |
698 close OFILE2; |