Mercurial > repos > lxue > ageseq
comparison AGEseq_web.pl @ 0:a9c5e846dd76 draft
Perl code
author | lxue |
---|---|
date | Fri, 15 May 2015 16:37:02 -0400 |
parents | |
children | 449c8cf8fa3f |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a9c5e846dd76 |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 my $file_design = $ARGV[0]; | |
4 my $read_file = $ARGV[1]; | |
5 my $mismatch_cutoff = $ARGV[2]; # mismatch rate to filter low quality alignment, default 0.1 (10 %) | |
6 my $min_cutoff = $ARGV[3]; # cutoff to filter reads with low abundance, default 0 | |
7 my $wt_like_report = $ARGV[4]; # report top xx WT like records, default 20 | |
8 my $indel_report = $ARGV[5]; # report top xx records with indel, default 50 | |
9 my $final_out = $ARGV[6]; | |
10 | |
11 #my $blat = ''; # working directory or PATH | |
12 my $blat = '/usr/local/bin/blat'; # Your prefered full location | |
13 | |
14 my $rand = rand 1000000; | |
15 ## setting for reports | |
16 if(not defined ($mismatch_cutoff )){ | |
17 $mismatch_cutoff = 0.1 ; # mismatch rate to filter low quality alignment, default 0.1 (10 %) | |
18 } | |
19 | |
20 if(not defined ($min_cutoff )){ | |
21 $min_cutoff = 0 ; # cutoff to filter reads with low abundance, default 0 | |
22 } | |
23 | |
24 if(not defined ($wt_like_report )){ | |
25 $wt_like_report = 20 ; # report top xx WT like records, default 20 | |
26 } | |
27 | |
28 if(not defined ($indel_report )){ | |
29 $indel_report = 50 ; # report top xx records with indel, default 50 | |
30 } | |
31 | |
32 | |
33 my $remove_files = 1 ; # keep (0) or delete (1) intermediate files, default = 1 | |
34 | |
35 ############################################################### | |
36 # default setting | |
37 | |
38 my $remove = 'rm'; | |
39 my $osname = $^O; | |
40 | |
41 my $design_fas = '/tmp/fasta'.$rand.'_DESIGN.fa'; | |
42 my @psl_file_data = (); | |
43 | |
44 if(not defined ($file_design )){ | |
45 die "Design file is needed\n"; | |
46 } | |
47 | |
48 if(not defined ($final_out )){ | |
49 $final_out = 'AGE_output.txt'; | |
50 } | |
51 ########################################################## | |
52 ### step 1 load design file | |
53 | |
54 open DESIGN,"<$file_design" or die "File $file_design not found error here\n"; | |
55 open DESIGNFAS,">$design_fas" or die "can not wirte $design_fas not found error here\n"; | |
56 | |
57 my $num = 0; | |
58 my %design_hash = (); | |
59 | |
60 | |
61 while (my $line = <DESIGN>) { | |
62 $num ++; | |
63 if($num == 1){ | |
64 next; | |
65 } | |
66 if($line !~ m/\w/){ | |
67 next; | |
68 } | |
69 | |
70 chomp $line; | |
71 my @temp = split(/\t/,$line); | |
72 my $id = $temp[0]; | |
73 my $seq = $temp[1]; | |
74 $seq =~ s/\s+//g; | |
75 $seq =~ m/([^\r\n]+)/; | |
76 $seq = $1; | |
77 $id =~ s/\s+/\_/g; | |
78 | |
79 $design_hash{$id} = $seq; | |
80 print DESIGNFAS ">$id\n$seq\n"; | |
81 } | |
82 close(DESIGNFAS); | |
83 | |
84 | |
85 ####################################################### | |
86 ############## step 2 - load read files ############# | |
87 | |
88 my %total_read_num = (); | |
89 my $num_c = 0; | |
90 my @fasta_files = (); | |
91 | |
92 my $file_in = $read_file; | |
93 my $fasta_out = '/tmp/fasta'.$rand.'_Read.fa'; | |
94 | |
95 | |
96 my $type = check_file_type($file_in ) ; | |
97 | |
98 if($type eq 'fq'){ | |
99 # fastq | |
100 # print "this is fastq file\n"; | |
101 my $total_num = &fastq2fasta($file_in,$fasta_out); | |
102 $total_read_num{$fasta_out} = $total_num; | |
103 } | |
104 elsif($type eq 'fa'){ | |
105 # fasta | |
106 # print "this is fasta file\n"; | |
107 my $total_num = &fasta2fasta($file_in,$fasta_out); | |
108 # load number | |
109 $total_read_num{$fasta_out} = $total_num; | |
110 } | |
111 else{ | |
112 die "Please check the format of read file\n"; | |
113 } | |
114 | |
115 | |
116 ######################################################## | |
117 ## Here will put step3 to step 5 into one loop | |
118 | |
119 | |
120 open (REPORT,">$final_out") || die "cannot write\n"; | |
121 | |
122 | |
123 my $file_blat_in = $fasta_out; | |
124 | |
125 ############ step 3 - blat ########################### | |
126 my $blat_out = '/tmp/fasta'.$rand.'_blat_crispr.psl'; | |
127 | |
128 my $command_line = $blat.' '.' -tileSize=7 -oneOff=1 -maxGap=20 -minIdentity=70 -minScore=20 '.$file_blat_in.' '. $design_fas.' '.$blat_out .' >/tmp/blat.txt'; | |
129 | |
130 system("$command_line"); | |
131 | |
132 #print "blat job $file_blat_in is done\n"; | |
133 ########## step 4 - convert psl -> bed ############## | |
134 | |
135 my $bed_hash_address = &psl2bed ($blat_out ); ## address of one hash | |
136 | |
137 ########## step 5 - get sequences number from bed ############## | |
138 | |
139 my $read_num_address = &MAIN ($file_blat_in,$bed_hash_address);## address of one hash | |
140 | |
141 | |
142 ## remove files | |
143 | |
144 if($remove_files == 1){ | |
145 $command_line = $remove.' '.$file_blat_in; | |
146 system("$command_line"); | |
147 | |
148 $command_line = $remove.' '.$blat_out ; | |
149 system("$command_line"); | |
150 | |
151 my $command_line2 = $remove.' '.$design_fas ; | |
152 system("$command_line2"); | |
153 } | |
154 | |
155 | |
156 ######################################################################################### | |
157 ## functions | |
158 | |
159 | |
160 sub MAIN{ | |
161 my $fas_file_in = shift @_; | |
162 my $bed_file_address_in = shift @_; | |
163 | |
164 my %bed_hash_in = %{$bed_file_address_in}; | |
165 | |
166 | |
167 ######################################################### | |
168 # read convereted reads fasta file | |
169 | |
170 open (FAS,"$fas_file_in ")||die "can not open fasta file: $fas_file_in "; | |
171 | |
172 my %query_with_seq2num = (); | |
173 my %seq2alignment = (); | |
174 my %seq_part2assignment = (); | |
175 | |
176 while (my $lineIn =<FAS>){ | |
177 chomp $lineIn; | |
178 if($lineIn =~ m/^\>/){ | |
179 my $id = substr $lineIn,1; | |
180 | |
181 my @numbers = split(/\_/, $id); | |
182 my $total_num = $numbers[-1]; | |
183 | |
184 $lineIn = <FAS>; | |
185 chomp $lineIn; | |
186 my $seq = $lineIn; | |
187 | |
188 if(exists $bed_hash_in{$id}){ | |
189 # $psl_hash{$read_target}{$query} # format of hash | |
190 ## asign read to target reference | |
191 | |
192 ########################################################## | |
193 my @read_asignment = (); | |
194 ## loop for best | |
195 for my $read_key (keys %{$bed_hash_in{$id}}){ | |
196 ### get annotation from array | |
197 ## Q: design T: read | |
198 my @get_array = @{$bed_hash_in{$id}{$read_key}}; | |
199 my ($strand, $q_name,$q_size,$q_start,$q_end, | |
200 $t_name,$t_size,$t_start,$t_end, | |
201 $block_num,$block_size,$q_start_s,$t_start_s) = @get_array ; | |
202 | |
203 my $seq_out = substr $seq,$t_start,($t_end-$t_start); | |
204 | |
205 if($strand eq '-'){ | |
206 $seq_out = &rev_com ($seq_out); | |
207 } | |
208 ## get orignal | |
209 my $ref_ori_seq = ''; | |
210 if(exists $design_hash{$q_name}){ | |
211 $ref_ori_seq = $design_hash{$q_name}; | |
212 } | |
213 | |
214 | |
215 my $edit_note = &get_editing_note(\@get_array,$ref_ori_seq, $seq_out,$seq); | |
216 # target # part of read | |
217 my ($align_ref,$align_read,$note) = split(/\t/, $edit_note); | |
218 | |
219 my $len_ori = length($ref_ori_seq); | |
220 my @align_1 = split(//, $align_ref); | |
221 my @align_2 = split(//, $align_read); | |
222 my $site_snp = 0; | |
223 my @site_snp = (); | |
224 my $iden_num = 0; | |
225 | |
226 for my $a1(@align_1){ | |
227 my $a2 = shift @align_2; | |
228 if($a1 ne '-' and $a2 ne '-'){ | |
229 if($a1 ne $a2){ | |
230 push @site_snp, $site_snp; | |
231 } | |
232 } | |
233 | |
234 if($a1 eq $a2){ | |
235 $iden_num++; | |
236 } | |
237 | |
238 $site_snp++; | |
239 } | |
240 $iden_num = $iden_num-2; | |
241 | |
242 ## filter on SNP number | |
243 my $snp_num_for_filter = @site_snp+0; | |
244 | |
245 my $dis_max = 0; | |
246 my $dis_current = 0; | |
247 my @continuous_site_temp = (); | |
248 my @continuous_site = (); | |
249 my @site_snp_checking = @site_snp; | |
250 my $start = shift @site_snp_checking ; | |
251 for my $site_checking (@site_snp_checking){ | |
252 my $dis_in = $site_checking -$start; | |
253 if($dis_in <4){ | |
254 $dis_current++; | |
255 push @continuous_site_temp,$site_checking ; | |
256 if($dis_current > $dis_max){ | |
257 $dis_max = $dis_current; | |
258 @continuous_site = @continuous_site_temp; | |
259 } | |
260 } | |
261 else{ | |
262 $dis_current = 0; | |
263 @continuous_site_temp = (); | |
264 | |
265 } | |
266 | |
267 $start = $site_checking; | |
268 } | |
269 | |
270 | |
271 if( ($snp_num_for_filter/$len_ori) > 0.5){ # first filtering | |
272 if($dis_max <5){ | |
273 next; | |
274 } | |
275 } | |
276 | |
277 | |
278 if($dis_max>=5){ | |
279 @site_snp = (); | |
280 my @align_x = split(//, $align_read); | |
281 for my $iii(@continuous_site ){ | |
282 $align_x[$iii] = 'N'; | |
283 } | |
284 $align_read = join('',@align_x); | |
285 $note = 'strange editing'; | |
286 my @ttt = ($align_ref,$align_read,$note) ; | |
287 | |
288 $edit_note = join("\t", @ttt ); | |
289 } | |
290 | |
291 | |
292 | |
293 | |
294 my @ooo = ($iden_num,$ref_ori_seq,$seq_out,$edit_note,\@site_snp,\@get_array); | |
295 push @read_asignment,[@ooo]; | |
296 }## for loop for best hit of each fasta reads | |
297 | |
298 ## get the best hits | |
299 | |
300 if(@read_asignment<1){ | |
301 next; | |
302 } | |
303 | |
304 ## assign the first hit | |
305 @read_asignment = sort {$b->[0] <=> $a->[0]}@read_asignment ; | |
306 my ($iden_num,$ref_ori_seq,$seq_out,$edit_note,$array_snp_addr,$bed_array_addr) = @{$read_asignment[0]}; | |
307 | |
308 my @snp_filtering = @{$array_snp_addr}; | |
309 | |
310 if((@snp_filtering+0)/length($ref_ori_seq) >$mismatch_cutoff){ | |
311 next; | |
312 } | |
313 | |
314 | |
315 ## after asignment | |
316 | |
317 ## check whether seq_out is assigned earlier | |
318 ## $seq_out has been asigned to one record | |
319 ## just assign reads to that one althogh it may be get one new assignment | |
320 if( exists $seq_part2assignment{$seq_out}){ | |
321 my $first_assignment = $seq_part2assignment{$seq_out}; | |
322 $query_with_seq2num{$first_assignment}{$seq_out} = $query_with_seq2num{$first_assignment}{$seq_out}+$total_num; | |
323 next; | |
324 } | |
325 | |
326 | |
327 # number for part of reads | |
328 my ($strand, $q_name,$q_size,$q_start,$q_end, | |
329 $t_name,$t_size,$t_start,$t_end, | |
330 $block_num,$block_size,$q_start_s,$t_start_s) = @{$bed_array_addr}; | |
331 | |
332 # this the first asignment | |
333 if(not exists $seq2alignment{$seq_out}){ | |
334 my @out = ($ref_ori_seq,$edit_note,$array_snp_addr); | |
335 $seq2alignment{$seq_out} = \@out; | |
336 | |
337 } | |
338 # assigned | |
339 if(not exists $seq_part2assignment{$seq_out}) { | |
340 $seq_part2assignment{$seq_out} = $q_name; | |
341 $query_with_seq2num{$q_name}{$seq_out} = $total_num; | |
342 } | |
343 | |
344 | |
345 | |
346 }# exists bed file | |
347 }# lineIN | |
348 }# while file | |
349 close(FAS); | |
350 | |
351 ### summary data for output | |
352 my ($hash_addr1, $hash_addr2) = &get_out_buffer($fas_file_in,\%query_with_seq2num, \%seq2alignment); | |
353 | |
354 ### write output | |
355 | |
356 &write_buffer_out($hash_addr1, $hash_addr2); | |
357 | |
358 } # Main function | |
359 | |
360 | |
361 | |
362 | |
363 sub get_out_buffer{ | |
364 my $fas_file_in = $_[0]; | |
365 my %query_with_seq2num_in = %{$_[1]}; | |
366 my %seq2alignment_in = %{$_[2]}; | |
367 | |
368 my %buffer_out = (); | |
369 my %buffer_num = (); | |
370 | |
371 my $total_non_redun = 0; | |
372 ## push output in buffer before printing | |
373 my $test_case_num = $total_read_num{$fas_file_in}; | |
374 | |
375 for my $ref_name (sort keys %query_with_seq2num_in){ | |
376 my %in_hash = %{$query_with_seq2num_in{$ref_name}}; | |
377 | |
378 ## each group | |
379 my $report_wt_count = 0; | |
380 my $report_indel_count = 0; | |
381 my $other_num = 0; | |
382 my $sub_hit_num = 0; | |
383 my $indel_hit_num = 0; | |
384 | |
385 | |
386 my @other_record = (); | |
387 ### loop for records | |
388 for my $part_of_read (sort {$in_hash{$b} <=> $in_hash{$a}} keys %in_hash){ | |
389 | |
390 my $hitnum = $in_hash{$part_of_read}; | |
391 | |
392 if($hitnum < $min_cutoff) { | |
393 next; | |
394 } | |
395 | |
396 if($hitnum/$test_case_num < 0.00005) { | |
397 next; | |
398 } | |
399 | |
400 ## get Editing Note for output only | |
401 | |
402 if(exists $seq2alignment_in{$part_of_read}){ | |
403 my ($ref_ori_seq,$edit_note,$array_snp_addr)= @{$seq2alignment_in{$part_of_read}}; | |
404 my ($align_ref,$align_read,$note) = split(/\t/,$edit_note); | |
405 my @snp_out = @{$array_snp_addr}; | |
406 my $snp_num = @snp_out +0; | |
407 # deep sequencing SNP | |
408 if($test_case_num >500 and $hitnum <=2 and $snp_num/length($ref_ori_seq)>0.05){ | |
409 next; | |
410 } | |
411 | |
412 my $snp_note = ''; | |
413 if($snp_num>0 ){ | |
414 $snp_note = $snp_num.' SNP('.join(',',@snp_out).')'; | |
415 } | |
416 | |
417 $sub_hit_num = $sub_hit_num+$hitnum; | |
418 $total_non_redun = $total_non_redun + $hitnum; | |
419 | |
420 | |
421 # output here | |
422 if($ref_ori_seq ne $design_hash{$ref_name} ){ | |
423 print "error in read assingment \n"; | |
424 } | |
425 | |
426 | |
427 my @out_line = ($fas_file_in,$ref_name,$ref_ori_seq, $part_of_read,$hitnum, $align_ref,$align_read,$note,$snp_note); | |
428 ## indel | |
429 if($edit_note !~ m/no.+indel/ ){ | |
430 | |
431 if((($test_case_num >500 and $hitnum <=3) or ($hitnum /$test_case_num <0.001 ) ) and $edit_note =~ m/strange/){ | |
432 next; # filter strange case in deep sequencing with few reads | |
433 } | |
434 | |
435 | |
436 $report_indel_count ++; | |
437 # total indel | |
438 $indel_hit_num = $indel_hit_num + $hitnum; | |
439 | |
440 if($report_indel_count <= $indel_report ){ | |
441 #print REPORT join("\t",@out_line),"\n"; | |
442 ## in buffer | |
443 if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){ | |
444 my @bbb = (); | |
445 push @bbb, [@out_line]; | |
446 $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb; | |
447 } | |
448 else{ | |
449 push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@out_line]; | |
450 } | |
451 } | |
452 else{ | |
453 $other_num = $other_num+$hitnum; | |
454 @other_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'others',$hitnum, '-', '-', '-', '-'); | |
455 } | |
456 | |
457 } | |
458 else{ | |
459 $report_wt_count ++; | |
460 if($report_wt_count <= $wt_like_report ){ | |
461 #print REPORT join("\t",@out_line),"\n"; | |
462 # in buffer | |
463 if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){ | |
464 my @bbb = (); | |
465 push @bbb, [@out_line]; | |
466 $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb; | |
467 } | |
468 else{ | |
469 push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@out_line]; | |
470 } | |
471 } | |
472 else{ | |
473 $other_num = $other_num+$hitnum; | |
474 @other_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'others',$hitnum, '-', '-', '-', '-'); | |
475 } | |
476 | |
477 }# wt like report | |
478 | |
479 } # if exists | |
480 | |
481 }# for part of reads | |
482 | |
483 ## other report | |
484 if($other_num>0){ | |
485 $other_record[-5] = $other_num; | |
486 #print REPORT join("\t",@other_record),"\n"; | |
487 | |
488 if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){ | |
489 my @bbb = (); | |
490 push @bbb, [@other_record]; | |
491 $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb; | |
492 } | |
493 else{ | |
494 push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@other_record]; | |
495 } | |
496 } | |
497 | |
498 #my @summary_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'total_hit:'.$total_non_redun,'sub_hit:'.$sub_hit_num, 'indel_hit:'.$indel_hit_num); | |
499 ## total read number | |
500 my $total_num = $total_read_num{$fas_file_in}; | |
501 | |
502 # 0 1 2 | |
503 # my @summary_record = ($fas_file_in,$ref_name,'Total Reads: '., 'Total Hits: '.$total_non_redun,'Sub Hits: '.$sub_hit_num, 'Indel Hits: '.$indel_hit_num); | |
504 my @summary_record = ('Sub Hits: '.$sub_hit_num, 'Indel Hits: '.$indel_hit_num); | |
505 | |
506 $buffer_out{$fas_file_in}{$ref_name}{'sum'} = \@summary_record ; | |
507 $buffer_num{$fas_file_in}{'total'} = $total_num ; | |
508 $buffer_num{$fas_file_in}{'sub'} = $total_non_redun ; | |
509 #print REPORT join("\t",@summary_record),"\n"; | |
510 | |
511 }# for ref_seq | |
512 | |
513 | |
514 return (\%buffer_out,\%buffer_num); | |
515 } # sub | |
516 | |
517 | |
518 ### write the output | |
519 | |
520 sub write_buffer_out{ | |
521 my %hash_out = %{$_[0]}; | |
522 my %hash_out_num = %{$_[1]}; | |
523 my $print_tracking = 0; | |
524 for my $fas_file_in (sort keys %hash_out){ | |
525 | |
526 my $total_num = $hash_out_num{$fas_file_in}{'total'} ; | |
527 my $total_non_redun = $hash_out_num{$fas_file_in}{'sub'} ; | |
528 | |
529 for my $ref_name (sort keys %{$hash_out{$fas_file_in }}){ | |
530 my @data = @{$hash_out{$fas_file_in}{$ref_name}{"data"}}; | |
531 | |
532 for (@data){ | |
533 $print_tracking++; | |
534 if($print_tracking==1){ | |
535 print REPORT "INPUT","\t","Target\t","TargetSequence\t","ReadSequence\t","Read#","\t", | |
536 "AlignedTarget\t","AlignedRead\t", "Indels\t","SNPs\n"; | |
537 } | |
538 my @ooo = @{$_}; | |
539 $ooo[0] = 'Input1'; | |
540 print REPORT join("\t", @ooo),"\n"; | |
541 } | |
542 | |
543 # sum | |
544 my @sum_p2 = @{$hash_out{$fas_file_in}{$ref_name}{'sum'}} ; | |
545 | |
546 | |
547 my @sum_p1 = ('Input1',$ref_name,'Total Reads: '.$total_num, 'Total Hits: '.$total_non_redun); | |
548 | |
549 print REPORT join("\t", @sum_p1),"\t", join("\t", @sum_p2),"\n"; | |
550 | |
551 } | |
552 | |
553 | |
554 ##### print summary region | |
555 print REPORT "Summary \t -----------------------------------------------------\n"; | |
556 print REPORT "Sum:INPUT\t","Target\t","AlignedTarget\t","AlignedRead\t","Total Hits\t","Sub Hits\t","Indel or WT Hits\t","Indel or WT rate %\t","Pattern\n"; | |
557 | |
558 | |
559 | |
560 for my $ref_name (sort keys %{$hash_out{$fas_file_in }}){ | |
561 my @data = @{$hash_out{$fas_file_in}{$ref_name}{"data"}}; | |
562 | |
563 my $wt_pair = ''."\t".''; | |
564 my $indel_pair = ''."\t".''; | |
565 my $indel_out = ''; | |
566 my $i = 0; | |
567 | |
568 for (@data){ | |
569 $i++; | |
570 my @in= @{$_}; | |
571 | |
572 | |
573 if($in[7] !~ m/no\_indel/ and $in[7] ne '-'){ | |
574 if($indel_pair eq ''."\t".''){ | |
575 $indel_pair = $in[5]."\t".$in[6]; | |
576 my @indel_temp = split(/\s+/,$in[7]); | |
577 | |
578 my @ooo_indel = (); | |
579 for my $ooo_in (@indel_temp ){ | |
580 if($ooo_in =~ m/I(\d+)/){ | |
581 push @ooo_indel, '+'.$1; | |
582 } | |
583 if($ooo_in =~ m/D(\d+)/){ | |
584 push @ooo_indel, '-'.$1; | |
585 } | |
586 if($ooo_in =~ m/strange/){ | |
587 push @ooo_indel, 'strange editing'; | |
588 } | |
589 } | |
590 $indel_out = 'Indel:'.join(',', @ooo_indel); | |
591 } | |
592 } | |
593 else{ | |
594 # no indel | |
595 if($wt_pair eq ''."\t".''){ | |
596 $wt_pair = $in[5]."\t".$in[6]; | |
597 } | |
598 } | |
599 } | |
600 | |
601 # first record | |
602 | |
603 | |
604 | |
605 # sum | |
606 my ($sub_string, $indel_string) = @{$hash_out{$fas_file_in}{$ref_name}{'sum'}} ; | |
607 my $sub_num ; | |
608 my $indel_num; | |
609 #print $sub_string. "sub summary\n"; | |
610 #print $indel_string ."sub summary\n"; | |
611 | |
612 | |
613 if($sub_string =~ m/\:[^\d]+(\d+)/){ | |
614 $sub_num = $1; | |
615 } | |
616 if($indel_string =~ m/\:[^\d]+(\d+)/){ | |
617 $indel_num = $1; | |
618 } | |
619 | |
620 my $rate = int(($indel_num/$sub_num)*10000)/100; | |
621 | |
622 ## indel | |
623 # two columns | |
624 my @out_sum = ("Input1",$ref_name, $indel_pair,$total_non_redun,$sub_num,$indel_num,$rate,$indel_out); | |
625 print REPORT 'Sum:',join("\t", @out_sum),"\n"; | |
626 ## wt like | |
627 my $wt_num = $sub_num - $indel_num; | |
628 my $wt_rate = 100-$rate; | |
629 my @out_sum = ("Input1",$ref_name, $wt_pair ,$total_non_redun,$sub_num,$wt_num,$wt_rate,"WT_like"); | |
630 print REPORT 'Sum:',join("\t", @out_sum),"\n"; | |
631 | |
632 } # records | |
633 | |
634 print REPORT "\n"; | |
635 | |
636 } # file | |
637 } # sub | |
638 ################################################################### | |
639 | |
640 | |
641 | |
642 sub get_editing_note{ | |
643 my ($address_in,$query_seq, $target_seq ,$full_read) = @_; | |
644 # target: part of read | |
645 # query : original sequence in design file | |
646 | |
647 my ($strand, $q_name,$q_size,$q_start,$q_end, | |
648 $t_name,$t_size,$t_start,$t_end, | |
649 $block_num,$block_size,$q_start_s,$t_start_s) = @{$address_in}; | |
650 | |
651 my @out = (); | |
652 #print $strand ,"strand\n"; | |
653 if($strand eq '+'){ | |
654 @out = get_alignment($block_num,$block_size,$q_start_s,$t_start_s,$query_seq, $full_read); | |
655 } | |
656 else{ | |
657 @out = get_alignment($block_num,$block_size,$q_start_s,$t_start_s,rev_com($query_seq), $full_read); | |
658 @out = get_reverse(@out); | |
659 } | |
660 | |
661 $out[0]='B'.$out[0].'E'; | |
662 $out[1]='B'.$out[1].'E'; | |
663 if($out[2] !~ m/\w/){ | |
664 $out[2] = 'no_indel'; | |
665 } | |
666 return join("\t",@out); | |
667 } | |
668 | |
669 | |
670 | |
671 sub get_alignment{ | |
672 my ($blocks , $blockLengths, $qStarts , $tStarts,$query_seq, $target_seq) = @_; | |
673 my @blockSizes = split(/\,/,$blockLengths); | |
674 my @qArray = split(/\,/,$qStarts); | |
675 my @tArray = split(/\,/,$tStarts); | |
676 | |
677 # target: part of read | |
678 # query : original sequence in design file | |
679 | |
680 # print join("\t",@_),"\n"; | |
681 | |
682 ### before blocks | |
683 my $before_q = substr $query_seq, 0,$qArray[0]; | |
684 my $before_t = substr $target_seq,0,$tArray[0]; | |
685 | |
686 | |
687 ($before_q,$before_t) = treat_sequence($before_q,$before_t,"before"); | |
688 | |
689 ### after blocks | |
690 my $after_q = substr $query_seq, $qArray[-1]+$blockSizes[-1]; | |
691 my $after_t = substr $target_seq,$tArray[-1]+$blockSizes[-1]; | |
692 | |
693 | |
694 ($after_q,$after_t) = treat_sequence($after_q,$after_t,"after") ; | |
695 | |
696 ### blocks | |
697 my @med_q = (); | |
698 my @med_t = (); | |
699 | |
700 my @indel_out = (); | |
701 my $out = ''; | |
702 ## first block | |
703 my $med_q_seq = substr $query_seq,$qArray[0],$blockSizes[0]; | |
704 my $med_t_seq = substr $target_seq,$tArray[0],$blockSizes[0]; | |
705 push @med_q,$med_q_seq; | |
706 push @med_t,$med_t_seq; | |
707 | |
708 if($blocks>1){ | |
709 for (my $i= 0;$i<($blocks-1);$i++){ | |
710 ####### interval | |
711 my $inter_q_seq = substr $query_seq,($qArray[$i]+$blockSizes[$i]), ($qArray[$i+1]-($qArray[$i]+$blockSizes[$i])); | |
712 my $inter_t_seq = substr $target_seq,($tArray[$i]+$blockSizes[$i]),($tArray[$i+1]-($tArray[$i]+$blockSizes[$i])); | |
713 | |
714 ### count deletion insertion | |
715 if(length($inter_t_seq) - length($inter_q_seq)>0){ | |
716 # to fix the mismatch near deletion length($inter_q_seq) | |
717 $out = ($qArray[$i]+$blockSizes[$i]+length($inter_q_seq)).'I'.(length($inter_t_seq) - length($inter_q_seq)); | |
718 } | |
719 if(length($inter_q_seq)-length($inter_t_seq)>0){ | |
720 $out = ($qArray[$i]+$blockSizes[$i]+length($inter_t_seq)).'D'.(length($inter_q_seq)-length($inter_t_seq)); | |
721 } | |
722 push @indel_out,$out; | |
723 | |
724 ($inter_q_seq,$inter_t_seq) = treat_inter ($inter_q_seq,$inter_t_seq) ; | |
725 push @med_q, $inter_q_seq; | |
726 push @med_t, $inter_t_seq; | |
727 | |
728 ###### block after interval | |
729 $med_q_seq = substr $query_seq,$qArray[$i+1],$blockSizes[$i+1]; | |
730 $med_t_seq = substr $target_seq,$tArray[$i+1],$blockSizes[$i+1]; | |
731 push @med_q,$med_q_seq; | |
732 push @med_t,$med_t_seq; | |
733 } | |
734 } | |
735 | |
736 push @med_q,$after_q; | |
737 push @med_t,$after_t; | |
738 | |
739 unshift @med_q,$before_q; | |
740 unshift @med_t,$before_t; | |
741 my $q_string = join('',@med_q); | |
742 my $t_string = join('',@med_t); | |
743 | |
744 if($q_string=~m/^(\-+)/){ | |
745 my $len = length($1); | |
746 $q_string = substr $q_string,$len; | |
747 $t_string = substr $t_string, $len; | |
748 } | |
749 if($q_string=~m/(\-+)$/){ | |
750 my $len = length($1); | |
751 $q_string = substr $q_string,0,(-1)*$len; | |
752 $t_string = substr $t_string,0, (-1)*$len; | |
753 } | |
754 | |
755 | |
756 return ($q_string,$t_string,join(' ',@indel_out)); | |
757 | |
758 } | |
759 | |
760 | |
761 sub get_reverse{ | |
762 my ($q_string,$t_string,$indel_string )= @_; | |
763 $q_string = rev_com($q_string); | |
764 $t_string = rev_com($t_string); | |
765 | |
766 my $string_test = $q_string; | |
767 $string_test =~ s/\-//g; | |
768 my $len = length($string_test); | |
769 | |
770 my @indel_in = split(/\s/, $indel_string); | |
771 my @indel_out = (); | |
772 for (@indel_in){ | |
773 | |
774 if(m/^(\d+)(.)(\d+)/){ | |
775 my $ooo = ($len-$1-$3).$2.$3; | |
776 if($2 eq 'I'){ | |
777 $ooo = ($len-$1).$2.$3; | |
778 } | |
779 | |
780 unshift @indel_out,$ooo; | |
781 } | |
782 } | |
783 $indel_string = join(' ',@indel_out); | |
784 return($q_string,$t_string, $indel_string); | |
785 } | |
786 | |
787 | |
788 | |
789 | |
790 | |
791 sub treat_inter{ | |
792 my ($q,$t) = @_; | |
793 my $q_len = length($q); | |
794 my $t_len = length($t); | |
795 my $dis = abs($q_len-$t_len); | |
796 my @oooo = (); | |
797 | |
798 for(1..$dis){ | |
799 push @oooo,'-'; | |
800 } | |
801 | |
802 if($q_len-$t_len >0){ | |
803 $t = $t.join('',@oooo); | |
804 } | |
805 else{ | |
806 $q = $q.join('',@oooo); | |
807 } | |
808 return ($q,$t); | |
809 } | |
810 | |
811 | |
812 | |
813 sub treat_sequence{ | |
814 my ($q,$t,$position) = @_; | |
815 my $q_len= length($q); | |
816 my $t_len = length($t); | |
817 my $dis = abs($q_len-$t_len); | |
818 | |
819 if($dis > 0){ | |
820 my @oooo = (); | |
821 for(1..$dis){ | |
822 push @oooo,'-'; | |
823 } | |
824 if($q_len>$t_len){ | |
825 if($position eq 'before'){ | |
826 $t = join('',@oooo).$t ; | |
827 } | |
828 else{ | |
829 $t = $t.join('',@oooo) ; | |
830 } | |
831 } | |
832 else{ # q small | |
833 if($position eq 'before'){ | |
834 $q = join('',@oooo).$q ; | |
835 } | |
836 else{ | |
837 $q = $q.join('',@oooo) ; | |
838 } | |
839 } | |
840 } | |
841 return ($q,$t); | |
842 } | |
843 | |
844 | |
845 sub psl2bed { | |
846 my $psl_file = shift @_; # file in @psl_file_data | |
847 my @psl_array = (); | |
848 | |
849 | |
850 open (FILEINBLAT,"$psl_file") || die "Can not open PSL file, Please check BLAT software is working\n"; | |
851 | |
852 ## should be correct | |
853 | |
854 for(1..5){ | |
855 my $line=<FILEINBLAT>; | |
856 ## remove head | |
857 } | |
858 ## read psl | |
859 while (<FILEINBLAT>) | |
860 { | |
861 my $lineIn = $_; | |
862 chomp $lineIn; | |
863 my @lineArr = split ("\t", $lineIn); | |
864 | |
865 my $q_size = $lineArr[10]; | |
866 my $m_size = $lineArr[0]; | |
867 | |
868 if( $m_size/$q_size > 0.20){ | |
869 push @psl_array,[@lineArr]; | |
870 } | |
871 | |
872 }# while file | |
873 close(FILEINBLAT); | |
874 ## score large to small | |
875 | |
876 #@psl_array = sort {$b->[0] <=>$a->[0]}@psl_array ; | |
877 | |
878 my @out_array = (); | |
879 my $psl_line_num = 0; | |
880 my %psl_hash = (); | |
881 for (@psl_array) | |
882 { | |
883 my @lineArr = @{$_}; | |
884 $psl_line_num++; | |
885 my $read_target = $lineArr[13]; | |
886 my $query = $lineArr[9]; | |
887 for(1..8){ | |
888 shift @lineArr; | |
889 } | |
890 | |
891 if(not exists $psl_hash{$read_target}{$query}){ ## all alignment | |
892 #if(not exists $psl_hash{$read_target}){ ## only keep the best hit | |
893 $psl_hash{$read_target}{$query} = \@lineArr; | |
894 } | |
895 }# while array | |
896 #print $psl_line_num." total psl lines\n"; | |
897 return (\%psl_hash); | |
898 } | |
899 | |
900 | |
901 | |
902 | |
903 sub fasta2fasta { | |
904 my $input_file = shift @_; | |
905 my $out_file = shift @_; | |
906 | |
907 open IN,"<$input_file " or die "File $input_file not found error here\n"; | |
908 open TGT,">$out_file" or die "File $out_file not found error here\n"; | |
909 my $c = 0; | |
910 my @line = (); | |
911 my $id =''; | |
912 my $seq = ''; | |
913 my $processed = 0; | |
914 | |
915 while(<IN>){ | |
916 chomp; | |
917 if(m/^\>/){ | |
918 $processed ++; | |
919 if($processed == 1){ | |
920 my @seq_id = split(/\s+/, $_); | |
921 my $seq_id = $seq_id[0]; | |
922 $seq_id =~ s/\_//g; | |
923 $seq_id = $seq_id.'_1'; | |
924 print TGT '>'.$seq_id,"\n"; | |
925 } | |
926 else{ | |
927 $seq =~ s/\s|\n|\r//g; | |
928 print TGT $seq,"\n"; | |
929 $seq = ''; | |
930 my @seq_id = split(/\s+/, $_); | |
931 my $seq_id = $seq_id[0]; | |
932 $seq_id =~ s/\_//g; | |
933 $seq_id = $seq_id.'_1'; | |
934 print TGT '>'.$seq_id,"\n"; | |
935 } | |
936 | |
937 } | |
938 else{ | |
939 $seq = $seq.$_; | |
940 } | |
941 } | |
942 # last records | |
943 $seq =~ s/\s|\n|\r//g; | |
944 print TGT $seq,"\n"; | |
945 | |
946 close IN; | |
947 close TGT; | |
948 return $processed; | |
949 } | |
950 | |
951 | |
952 | |
953 sub fastq2fasta { | |
954 my $input_file = shift @_; | |
955 my $out_file = shift @_; | |
956 | |
957 open IN,"<$input_file " or die "File $input_file not found error here\n"; | |
958 open TGT,">$out_file" or die "File $out_file not found error here\n"; | |
959 my $c = 0; | |
960 my @line = (); | |
961 my $id =''; | |
962 my $seq = ''; | |
963 my $processed = 0; | |
964 my %read_hash = (); | |
965 | |
966 ############################################################################################## | |
967 | |
968 while(<IN>){ | |
969 chomp; | |
970 $c++; | |
971 if($c == 1){ | |
972 $processed++; | |
973 # print STDERR "$processed reads processed\r"; | |
974 @line = split(); | |
975 $id = $line[0]; | |
976 $id =~ s/\@//; | |
977 }elsif($c == 2){ | |
978 $seq = $_; | |
979 }elsif($c == 4){ | |
980 $c = 0; | |
981 #print TGT ">$id\n$seq\n"; | |
982 # print TGT ">seq_$processed\n$seq\n"; | |
983 $read_hash{$seq}{$id} = 1; | |
984 $id =''; | |
985 @line =(); | |
986 $seq =''; | |
987 }else{} | |
988 } | |
989 | |
990 ## write TGT | |
991 my $count = 0; | |
992 for my $seq_out (sort keys %read_hash){ | |
993 $count++; | |
994 my @hits = keys %{$read_hash{$seq_out}}; | |
995 my $num = @hits+0; | |
996 my $id_out = "R".$count.'_'.$num; | |
997 print TGT ">$id_out\n", $seq_out ,"\n"; | |
998 } | |
999 | |
1000 close IN; | |
1001 close TGT; | |
1002 return $processed; | |
1003 } | |
1004 | |
1005 | |
1006 sub check_read_num{ | |
1007 my $file_in = shift @_; | |
1008 # check read number | |
1009 | |
1010 open FAS_in,"$file_in" || die "can not read fasta file\n"; | |
1011 my $seq_num = 0; | |
1012 while (<FAS_in>){ | |
1013 if( m/^\>/){ | |
1014 $seq_num ++; | |
1015 } | |
1016 if( $seq_num>1){ | |
1017 last; | |
1018 } | |
1019 } | |
1020 | |
1021 return $seq_num; | |
1022 | |
1023 } | |
1024 | |
1025 | |
1026 | |
1027 sub check_file_type { | |
1028 my $file_in = shift @_; | |
1029 my $out = 'no'; | |
1030 | |
1031 open FAS_in, "$file_in" || die "can not read fasta file\n"; | |
1032 | |
1033 while (my $line = <FAS_in>){ | |
1034 if($line =~ m/\w/){ | |
1035 if($line =~ m/^\>/){ | |
1036 $out = 'fa'; | |
1037 last; | |
1038 } | |
1039 if($line =~ m/^\@/){ | |
1040 $out = 'fq'; | |
1041 last; | |
1042 } | |
1043 } | |
1044 } | |
1045 | |
1046 return $out; | |
1047 } | |
1048 | |
1049 | |
1050 sub rev_com { | |
1051 my $seq = shift @_; | |
1052 $seq = reverse($seq); | |
1053 $seq =~ tr/ACGT/TGCA/; | |
1054 return($seq); | |
1055 } | |
1056 | |
1057 |