0
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 my $file_design = $ARGV[0];
|
|
4 my $read_file = $ARGV[1];
|
|
5 my $mismatch_cutoff = $ARGV[2]; # mismatch rate to filter low quality alignment, default 0.1 (10 %)
|
|
6 my $min_cutoff = $ARGV[3]; # cutoff to filter reads with low abundance, default 0
|
|
7 my $wt_like_report = $ARGV[4]; # report top xx WT like records, default 20
|
|
8 my $indel_report = $ARGV[5]; # report top xx records with indel, default 50
|
|
9 my $final_out = $ARGV[6];
|
|
10
|
|
11 #my $blat = ''; # working directory or PATH
|
|
12 my $blat = '/usr/local/bin/blat'; # Your prefered full location
|
|
13
|
|
14 my $rand = rand 1000000;
|
|
15 ## setting for reports
|
|
16 if(not defined ($mismatch_cutoff )){
|
|
17 $mismatch_cutoff = 0.1 ; # mismatch rate to filter low quality alignment, default 0.1 (10 %)
|
|
18 }
|
|
19
|
|
20 if(not defined ($min_cutoff )){
|
|
21 $min_cutoff = 0 ; # cutoff to filter reads with low abundance, default 0
|
|
22 }
|
|
23
|
|
24 if(not defined ($wt_like_report )){
|
|
25 $wt_like_report = 20 ; # report top xx WT like records, default 20
|
|
26 }
|
|
27
|
|
28 if(not defined ($indel_report )){
|
|
29 $indel_report = 50 ; # report top xx records with indel, default 50
|
|
30 }
|
|
31
|
|
32
|
|
33 my $remove_files = 1 ; # keep (0) or delete (1) intermediate files, default = 1
|
|
34
|
|
35 ###############################################################
|
|
36 # default setting
|
|
37
|
|
38 my $remove = 'rm';
|
|
39 my $osname = $^O;
|
|
40
|
|
41 my $design_fas = '/tmp/fasta'.$rand.'_DESIGN.fa';
|
|
42 my @psl_file_data = ();
|
|
43
|
|
44 if(not defined ($file_design )){
|
|
45 die "Design file is needed\n";
|
|
46 }
|
|
47
|
|
48 if(not defined ($final_out )){
|
|
49 $final_out = 'AGE_output.txt';
|
|
50 }
|
|
51 ##########################################################
|
|
52 ### step 1 load design file
|
|
53
|
|
54 open DESIGN,"<$file_design" or die "File $file_design not found error here\n";
|
|
55 open DESIGNFAS,">$design_fas" or die "can not wirte $design_fas not found error here\n";
|
|
56
|
|
57 my $num = 0;
|
|
58 my %design_hash = ();
|
|
59
|
|
60
|
|
61 while (my $line = <DESIGN>) {
|
|
62 $num ++;
|
|
63 if($num == 1){
|
|
64 next;
|
|
65 }
|
|
66 if($line !~ m/\w/){
|
|
67 next;
|
|
68 }
|
|
69
|
|
70 chomp $line;
|
|
71 my @temp = split(/\t/,$line);
|
|
72 my $id = $temp[0];
|
|
73 my $seq = $temp[1];
|
|
74 $seq =~ s/\s+//g;
|
|
75 $seq =~ m/([^\r\n]+)/;
|
|
76 $seq = $1;
|
|
77 $id =~ s/\s+/\_/g;
|
|
78
|
|
79 $design_hash{$id} = $seq;
|
|
80 print DESIGNFAS ">$id\n$seq\n";
|
|
81 }
|
|
82 close(DESIGNFAS);
|
|
83
|
|
84
|
|
85 #######################################################
|
|
86 ############## step 2 - load read files #############
|
|
87
|
|
88 my %total_read_num = ();
|
|
89 my $num_c = 0;
|
|
90 my @fasta_files = ();
|
|
91
|
|
92 my $file_in = $read_file;
|
|
93 my $fasta_out = '/tmp/fasta'.$rand.'_Read.fa';
|
|
94
|
|
95
|
|
96 my $type = check_file_type($file_in ) ;
|
|
97
|
|
98 if($type eq 'fq'){
|
|
99 # fastq
|
|
100 # print "this is fastq file\n";
|
|
101 my $total_num = &fastq2fasta($file_in,$fasta_out);
|
|
102 $total_read_num{$fasta_out} = $total_num;
|
|
103 }
|
|
104 elsif($type eq 'fa'){
|
|
105 # fasta
|
|
106 # print "this is fasta file\n";
|
|
107 my $total_num = &fasta2fasta($file_in,$fasta_out);
|
|
108 # load number
|
|
109 $total_read_num{$fasta_out} = $total_num;
|
|
110 }
|
|
111 else{
|
|
112 die "Please check the format of read file\n";
|
|
113 }
|
|
114
|
|
115
|
|
116 ########################################################
|
|
117 ## Here will put step3 to step 5 into one loop
|
|
118
|
|
119
|
|
120 open (REPORT,">$final_out") || die "cannot write\n";
|
|
121
|
|
122
|
|
123 my $file_blat_in = $fasta_out;
|
|
124
|
|
125 ############ step 3 - blat ###########################
|
|
126 my $blat_out = '/tmp/fasta'.$rand.'_blat_crispr.psl';
|
|
127
|
|
128 my $command_line = $blat.' '.' -tileSize=7 -oneOff=1 -maxGap=20 -minIdentity=70 -minScore=20 '.$file_blat_in.' '. $design_fas.' '.$blat_out .' >/tmp/blat.txt';
|
|
129
|
|
130 system("$command_line");
|
|
131
|
|
132 #print "blat job $file_blat_in is done\n";
|
|
133 ########## step 4 - convert psl -> bed ##############
|
|
134
|
|
135 my $bed_hash_address = &psl2bed ($blat_out ); ## address of one hash
|
|
136
|
|
137 ########## step 5 - get sequences number from bed ##############
|
|
138
|
|
139 my $read_num_address = &MAIN ($file_blat_in,$bed_hash_address);## address of one hash
|
|
140
|
|
141
|
|
142 ## remove files
|
|
143
|
|
144 if($remove_files == 1){
|
|
145 $command_line = $remove.' '.$file_blat_in;
|
|
146 system("$command_line");
|
|
147
|
|
148 $command_line = $remove.' '.$blat_out ;
|
|
149 system("$command_line");
|
|
150
|
|
151 my $command_line2 = $remove.' '.$design_fas ;
|
|
152 system("$command_line2");
|
|
153 }
|
|
154
|
|
155
|
|
156 #########################################################################################
|
|
157 ## functions
|
|
158
|
|
159
|
|
160 sub MAIN{
|
|
161 my $fas_file_in = shift @_;
|
|
162 my $bed_file_address_in = shift @_;
|
|
163
|
|
164 my %bed_hash_in = %{$bed_file_address_in};
|
|
165
|
|
166
|
|
167 #########################################################
|
|
168 # read convereted reads fasta file
|
|
169
|
|
170 open (FAS,"$fas_file_in ")||die "can not open fasta file: $fas_file_in ";
|
|
171
|
|
172 my %query_with_seq2num = ();
|
|
173 my %seq2alignment = ();
|
|
174 my %seq_part2assignment = ();
|
|
175
|
|
176 while (my $lineIn =<FAS>){
|
|
177 chomp $lineIn;
|
|
178 if($lineIn =~ m/^\>/){
|
|
179 my $id = substr $lineIn,1;
|
|
180
|
|
181 my @numbers = split(/\_/, $id);
|
|
182 my $total_num = $numbers[-1];
|
|
183
|
|
184 $lineIn = <FAS>;
|
|
185 chomp $lineIn;
|
|
186 my $seq = $lineIn;
|
|
187
|
|
188 if(exists $bed_hash_in{$id}){
|
|
189 # $psl_hash{$read_target}{$query} # format of hash
|
|
190 ## asign read to target reference
|
|
191
|
|
192 ##########################################################
|
|
193 my @read_asignment = ();
|
|
194 ## loop for best
|
|
195 for my $read_key (keys %{$bed_hash_in{$id}}){
|
|
196 ### get annotation from array
|
|
197 ## Q: design T: read
|
|
198 my @get_array = @{$bed_hash_in{$id}{$read_key}};
|
|
199 my ($strand, $q_name,$q_size,$q_start,$q_end,
|
|
200 $t_name,$t_size,$t_start,$t_end,
|
|
201 $block_num,$block_size,$q_start_s,$t_start_s) = @get_array ;
|
|
202
|
|
203 my $seq_out = substr $seq,$t_start,($t_end-$t_start);
|
|
204
|
|
205 if($strand eq '-'){
|
|
206 $seq_out = &rev_com ($seq_out);
|
|
207 }
|
|
208 ## get orignal
|
|
209 my $ref_ori_seq = '';
|
|
210 if(exists $design_hash{$q_name}){
|
|
211 $ref_ori_seq = $design_hash{$q_name};
|
|
212 }
|
|
213
|
|
214
|
|
215 my $edit_note = &get_editing_note(\@get_array,$ref_ori_seq, $seq_out,$seq);
|
|
216 # target # part of read
|
|
217 my ($align_ref,$align_read,$note) = split(/\t/, $edit_note);
|
|
218
|
|
219 my $len_ori = length($ref_ori_seq);
|
|
220 my @align_1 = split(//, $align_ref);
|
|
221 my @align_2 = split(//, $align_read);
|
|
222 my $site_snp = 0;
|
|
223 my @site_snp = ();
|
|
224 my $iden_num = 0;
|
|
225
|
|
226 for my $a1(@align_1){
|
|
227 my $a2 = shift @align_2;
|
|
228 if($a1 ne '-' and $a2 ne '-'){
|
|
229 if($a1 ne $a2){
|
|
230 push @site_snp, $site_snp;
|
|
231 }
|
|
232 }
|
|
233
|
|
234 if($a1 eq $a2){
|
|
235 $iden_num++;
|
|
236 }
|
|
237
|
|
238 $site_snp++;
|
|
239 }
|
|
240 $iden_num = $iden_num-2;
|
|
241
|
|
242 ## filter on SNP number
|
|
243 my $snp_num_for_filter = @site_snp+0;
|
|
244
|
|
245 my $dis_max = 0;
|
|
246 my $dis_current = 0;
|
|
247 my @continuous_site_temp = ();
|
|
248 my @continuous_site = ();
|
|
249 my @site_snp_checking = @site_snp;
|
|
250 my $start = shift @site_snp_checking ;
|
|
251 for my $site_checking (@site_snp_checking){
|
|
252 my $dis_in = $site_checking -$start;
|
|
253 if($dis_in <4){
|
|
254 $dis_current++;
|
|
255 push @continuous_site_temp,$site_checking ;
|
|
256 if($dis_current > $dis_max){
|
|
257 $dis_max = $dis_current;
|
|
258 @continuous_site = @continuous_site_temp;
|
|
259 }
|
|
260 }
|
|
261 else{
|
|
262 $dis_current = 0;
|
|
263 @continuous_site_temp = ();
|
|
264
|
|
265 }
|
|
266
|
|
267 $start = $site_checking;
|
|
268 }
|
|
269
|
|
270
|
|
271 if( ($snp_num_for_filter/$len_ori) > 0.5){ # first filtering
|
|
272 if($dis_max <5){
|
|
273 next;
|
|
274 }
|
|
275 }
|
|
276
|
|
277
|
|
278 if($dis_max>=5){
|
|
279 @site_snp = ();
|
|
280 my @align_x = split(//, $align_read);
|
|
281 for my $iii(@continuous_site ){
|
|
282 $align_x[$iii] = 'N';
|
|
283 }
|
|
284 $align_read = join('',@align_x);
|
|
285 $note = 'strange editing';
|
|
286 my @ttt = ($align_ref,$align_read,$note) ;
|
|
287
|
|
288 $edit_note = join("\t", @ttt );
|
|
289 }
|
|
290
|
|
291
|
|
292
|
|
293
|
|
294 my @ooo = ($iden_num,$ref_ori_seq,$seq_out,$edit_note,\@site_snp,\@get_array);
|
|
295 push @read_asignment,[@ooo];
|
|
296 }## for loop for best hit of each fasta reads
|
|
297
|
|
298 ## get the best hits
|
|
299
|
|
300 if(@read_asignment<1){
|
|
301 next;
|
|
302 }
|
|
303
|
|
304 ## assign the first hit
|
|
305 @read_asignment = sort {$b->[0] <=> $a->[0]}@read_asignment ;
|
|
306 my ($iden_num,$ref_ori_seq,$seq_out,$edit_note,$array_snp_addr,$bed_array_addr) = @{$read_asignment[0]};
|
|
307
|
|
308 my @snp_filtering = @{$array_snp_addr};
|
|
309
|
|
310 if((@snp_filtering+0)/length($ref_ori_seq) >$mismatch_cutoff){
|
|
311 next;
|
|
312 }
|
|
313
|
|
314
|
|
315 ## after asignment
|
|
316
|
|
317 ## check whether seq_out is assigned earlier
|
|
318 ## $seq_out has been asigned to one record
|
|
319 ## just assign reads to that one althogh it may be get one new assignment
|
|
320 if( exists $seq_part2assignment{$seq_out}){
|
|
321 my $first_assignment = $seq_part2assignment{$seq_out};
|
|
322 $query_with_seq2num{$first_assignment}{$seq_out} = $query_with_seq2num{$first_assignment}{$seq_out}+$total_num;
|
|
323 next;
|
|
324 }
|
|
325
|
|
326
|
|
327 # number for part of reads
|
|
328 my ($strand, $q_name,$q_size,$q_start,$q_end,
|
|
329 $t_name,$t_size,$t_start,$t_end,
|
|
330 $block_num,$block_size,$q_start_s,$t_start_s) = @{$bed_array_addr};
|
|
331
|
|
332 # this the first asignment
|
|
333 if(not exists $seq2alignment{$seq_out}){
|
|
334 my @out = ($ref_ori_seq,$edit_note,$array_snp_addr);
|
|
335 $seq2alignment{$seq_out} = \@out;
|
|
336
|
|
337 }
|
|
338 # assigned
|
|
339 if(not exists $seq_part2assignment{$seq_out}) {
|
|
340 $seq_part2assignment{$seq_out} = $q_name;
|
|
341 $query_with_seq2num{$q_name}{$seq_out} = $total_num;
|
|
342 }
|
|
343
|
|
344
|
|
345
|
|
346 }# exists bed file
|
|
347 }# lineIN
|
|
348 }# while file
|
|
349 close(FAS);
|
|
350
|
|
351 ### summary data for output
|
|
352 my ($hash_addr1, $hash_addr2) = &get_out_buffer($fas_file_in,\%query_with_seq2num, \%seq2alignment);
|
|
353
|
|
354 ### write output
|
|
355
|
|
356 &write_buffer_out($hash_addr1, $hash_addr2);
|
|
357
|
|
358 } # Main function
|
|
359
|
|
360
|
|
361
|
|
362
|
|
363 sub get_out_buffer{
|
|
364 my $fas_file_in = $_[0];
|
|
365 my %query_with_seq2num_in = %{$_[1]};
|
|
366 my %seq2alignment_in = %{$_[2]};
|
|
367
|
|
368 my %buffer_out = ();
|
|
369 my %buffer_num = ();
|
|
370
|
|
371 my $total_non_redun = 0;
|
|
372 ## push output in buffer before printing
|
|
373 my $test_case_num = $total_read_num{$fas_file_in};
|
|
374
|
|
375 for my $ref_name (sort keys %query_with_seq2num_in){
|
|
376 my %in_hash = %{$query_with_seq2num_in{$ref_name}};
|
|
377
|
|
378 ## each group
|
|
379 my $report_wt_count = 0;
|
|
380 my $report_indel_count = 0;
|
|
381 my $other_num = 0;
|
|
382 my $sub_hit_num = 0;
|
|
383 my $indel_hit_num = 0;
|
|
384
|
|
385
|
|
386 my @other_record = ();
|
|
387 ### loop for records
|
|
388 for my $part_of_read (sort {$in_hash{$b} <=> $in_hash{$a}} keys %in_hash){
|
|
389
|
|
390 my $hitnum = $in_hash{$part_of_read};
|
|
391
|
|
392 if($hitnum < $min_cutoff) {
|
|
393 next;
|
|
394 }
|
|
395
|
|
396 if($hitnum/$test_case_num < 0.00005) {
|
|
397 next;
|
|
398 }
|
|
399
|
|
400 ## get Editing Note for output only
|
|
401
|
|
402 if(exists $seq2alignment_in{$part_of_read}){
|
|
403 my ($ref_ori_seq,$edit_note,$array_snp_addr)= @{$seq2alignment_in{$part_of_read}};
|
|
404 my ($align_ref,$align_read,$note) = split(/\t/,$edit_note);
|
|
405 my @snp_out = @{$array_snp_addr};
|
|
406 my $snp_num = @snp_out +0;
|
|
407 # deep sequencing SNP
|
|
408 if($test_case_num >500 and $hitnum <=2 and $snp_num/length($ref_ori_seq)>0.05){
|
|
409 next;
|
|
410 }
|
|
411
|
|
412 my $snp_note = '';
|
|
413 if($snp_num>0 ){
|
|
414 $snp_note = $snp_num.' SNP('.join(',',@snp_out).')';
|
|
415 }
|
|
416
|
|
417 $sub_hit_num = $sub_hit_num+$hitnum;
|
|
418 $total_non_redun = $total_non_redun + $hitnum;
|
|
419
|
|
420
|
|
421 # output here
|
|
422 if($ref_ori_seq ne $design_hash{$ref_name} ){
|
|
423 print "error in read assingment \n";
|
|
424 }
|
|
425
|
|
426
|
|
427 my @out_line = ($fas_file_in,$ref_name,$ref_ori_seq, $part_of_read,$hitnum, $align_ref,$align_read,$note,$snp_note);
|
|
428 ## indel
|
|
429 if($edit_note !~ m/no.+indel/ ){
|
|
430
|
|
431 if((($test_case_num >500 and $hitnum <=3) or ($hitnum /$test_case_num <0.001 ) ) and $edit_note =~ m/strange/){
|
|
432 next; # filter strange case in deep sequencing with few reads
|
|
433 }
|
|
434
|
|
435
|
|
436 $report_indel_count ++;
|
|
437 # total indel
|
|
438 $indel_hit_num = $indel_hit_num + $hitnum;
|
|
439
|
|
440 if($report_indel_count <= $indel_report ){
|
|
441 #print REPORT join("\t",@out_line),"\n";
|
|
442 ## in buffer
|
|
443 if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){
|
|
444 my @bbb = ();
|
|
445 push @bbb, [@out_line];
|
|
446 $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb;
|
|
447 }
|
|
448 else{
|
|
449 push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@out_line];
|
|
450 }
|
|
451 }
|
|
452 else{
|
|
453 $other_num = $other_num+$hitnum;
|
|
454 @other_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'others',$hitnum, '-', '-', '-', '-');
|
|
455 }
|
|
456
|
|
457 }
|
|
458 else{
|
|
459 $report_wt_count ++;
|
|
460 if($report_wt_count <= $wt_like_report ){
|
|
461 #print REPORT join("\t",@out_line),"\n";
|
|
462 # in buffer
|
|
463 if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){
|
|
464 my @bbb = ();
|
|
465 push @bbb, [@out_line];
|
|
466 $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb;
|
|
467 }
|
|
468 else{
|
|
469 push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@out_line];
|
|
470 }
|
|
471 }
|
|
472 else{
|
|
473 $other_num = $other_num+$hitnum;
|
|
474 @other_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'others',$hitnum, '-', '-', '-', '-');
|
|
475 }
|
|
476
|
|
477 }# wt like report
|
|
478
|
|
479 } # if exists
|
|
480
|
|
481 }# for part of reads
|
|
482
|
|
483 ## other report
|
|
484 if($other_num>0){
|
|
485 $other_record[-5] = $other_num;
|
|
486 #print REPORT join("\t",@other_record),"\n";
|
|
487
|
|
488 if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){
|
|
489 my @bbb = ();
|
|
490 push @bbb, [@other_record];
|
|
491 $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb;
|
|
492 }
|
|
493 else{
|
|
494 push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@other_record];
|
|
495 }
|
|
496 }
|
|
497
|
|
498 #my @summary_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'total_hit:'.$total_non_redun,'sub_hit:'.$sub_hit_num, 'indel_hit:'.$indel_hit_num);
|
|
499 ## total read number
|
|
500 my $total_num = $total_read_num{$fas_file_in};
|
|
501
|
|
502 # 0 1 2
|
|
503 # my @summary_record = ($fas_file_in,$ref_name,'Total Reads: '., 'Total Hits: '.$total_non_redun,'Sub Hits: '.$sub_hit_num, 'Indel Hits: '.$indel_hit_num);
|
|
504 my @summary_record = ('Sub Hits: '.$sub_hit_num, 'Indel Hits: '.$indel_hit_num);
|
|
505
|
|
506 $buffer_out{$fas_file_in}{$ref_name}{'sum'} = \@summary_record ;
|
|
507 $buffer_num{$fas_file_in}{'total'} = $total_num ;
|
|
508 $buffer_num{$fas_file_in}{'sub'} = $total_non_redun ;
|
|
509 #print REPORT join("\t",@summary_record),"\n";
|
|
510
|
|
511 }# for ref_seq
|
|
512
|
|
513
|
|
514 return (\%buffer_out,\%buffer_num);
|
|
515 } # sub
|
|
516
|
|
517
|
|
518 ### write the output
|
|
519
|
|
520 sub write_buffer_out{
|
|
521 my %hash_out = %{$_[0]};
|
|
522 my %hash_out_num = %{$_[1]};
|
|
523 my $print_tracking = 0;
|
|
524 for my $fas_file_in (sort keys %hash_out){
|
|
525
|
|
526 my $total_num = $hash_out_num{$fas_file_in}{'total'} ;
|
|
527 my $total_non_redun = $hash_out_num{$fas_file_in}{'sub'} ;
|
|
528
|
|
529 for my $ref_name (sort keys %{$hash_out{$fas_file_in }}){
|
7
|
530 if(not exists $hash_out{$fas_file_in}{$ref_name}{"data"}){
|
|
531 # print "No data for $fas_file_in $ref_name \n";
|
|
532 next;
|
|
533 }
|
|
534
|
0
|
535 my @data = @{$hash_out{$fas_file_in}{$ref_name}{"data"}};
|
|
536
|
|
537 for (@data){
|
|
538 $print_tracking++;
|
|
539 if($print_tracking==1){
|
|
540 print REPORT "INPUT","\t","Target\t","TargetSequence\t","ReadSequence\t","Read#","\t",
|
|
541 "AlignedTarget\t","AlignedRead\t", "Indels\t","SNPs\n";
|
|
542 }
|
|
543 my @ooo = @{$_};
|
|
544 $ooo[0] = 'Input1';
|
|
545 print REPORT join("\t", @ooo),"\n";
|
|
546 }
|
|
547
|
|
548 # sum
|
|
549 my @sum_p2 = @{$hash_out{$fas_file_in}{$ref_name}{'sum'}} ;
|
|
550
|
|
551
|
|
552 my @sum_p1 = ('Input1',$ref_name,'Total Reads: '.$total_num, 'Total Hits: '.$total_non_redun);
|
|
553
|
|
554 print REPORT join("\t", @sum_p1),"\t", join("\t", @sum_p2),"\n";
|
|
555
|
|
556 }
|
|
557
|
|
558
|
|
559 ##### print summary region
|
|
560 print REPORT "Summary \t -----------------------------------------------------\n";
|
|
561 print REPORT "Sum:INPUT\t","Target\t","AlignedTarget\t","AlignedRead\t","Total Hits\t","Sub Hits\t","Indel or WT Hits\t","Indel or WT rate %\t","Pattern\n";
|
|
562
|
|
563
|
|
564
|
|
565 for my $ref_name (sort keys %{$hash_out{$fas_file_in }}){
|
7
|
566
|
|
567 if(not exists $hash_out{$fas_file_in}{$ref_name}{"data"}){
|
|
568 # print "No data for $fas_file_in $ref_name \n";
|
|
569 next;
|
|
570 }
|
0
|
571 my @data = @{$hash_out{$fas_file_in}{$ref_name}{"data"}};
|
|
572
|
|
573 my $wt_pair = ''."\t".'';
|
|
574 my $indel_pair = ''."\t".'';
|
|
575 my $indel_out = '';
|
|
576 my $i = 0;
|
|
577
|
|
578 for (@data){
|
|
579 $i++;
|
|
580 my @in= @{$_};
|
|
581
|
|
582
|
|
583 if($in[7] !~ m/no\_indel/ and $in[7] ne '-'){
|
|
584 if($indel_pair eq ''."\t".''){
|
|
585 $indel_pair = $in[5]."\t".$in[6];
|
|
586 my @indel_temp = split(/\s+/,$in[7]);
|
|
587
|
|
588 my @ooo_indel = ();
|
|
589 for my $ooo_in (@indel_temp ){
|
|
590 if($ooo_in =~ m/I(\d+)/){
|
|
591 push @ooo_indel, '+'.$1;
|
|
592 }
|
|
593 if($ooo_in =~ m/D(\d+)/){
|
|
594 push @ooo_indel, '-'.$1;
|
|
595 }
|
|
596 if($ooo_in =~ m/strange/){
|
|
597 push @ooo_indel, 'strange editing';
|
|
598 }
|
|
599 }
|
|
600 $indel_out = 'Indel:'.join(',', @ooo_indel);
|
|
601 }
|
|
602 }
|
|
603 else{
|
|
604 # no indel
|
|
605 if($wt_pair eq ''."\t".''){
|
|
606 $wt_pair = $in[5]."\t".$in[6];
|
|
607 }
|
|
608 }
|
|
609 }
|
|
610
|
|
611 # first record
|
|
612
|
|
613
|
|
614
|
|
615 # sum
|
|
616 my ($sub_string, $indel_string) = @{$hash_out{$fas_file_in}{$ref_name}{'sum'}} ;
|
|
617 my $sub_num ;
|
|
618 my $indel_num;
|
|
619 #print $sub_string. "sub summary\n";
|
|
620 #print $indel_string ."sub summary\n";
|
|
621
|
|
622
|
|
623 if($sub_string =~ m/\:[^\d]+(\d+)/){
|
|
624 $sub_num = $1;
|
|
625 }
|
|
626 if($indel_string =~ m/\:[^\d]+(\d+)/){
|
|
627 $indel_num = $1;
|
|
628 }
|
|
629
|
|
630 my $rate = int(($indel_num/$sub_num)*10000)/100;
|
|
631
|
|
632 ## indel
|
|
633 # two columns
|
|
634 my @out_sum = ("Input1",$ref_name, $indel_pair,$total_non_redun,$sub_num,$indel_num,$rate,$indel_out);
|
|
635 print REPORT 'Sum:',join("\t", @out_sum),"\n";
|
|
636 ## wt like
|
|
637 my $wt_num = $sub_num - $indel_num;
|
|
638 my $wt_rate = 100-$rate;
|
|
639 my @out_sum = ("Input1",$ref_name, $wt_pair ,$total_non_redun,$sub_num,$wt_num,$wt_rate,"WT_like");
|
|
640 print REPORT 'Sum:',join("\t", @out_sum),"\n";
|
|
641
|
|
642 } # records
|
|
643
|
|
644 print REPORT "\n";
|
|
645
|
|
646 } # file
|
|
647 } # sub
|
|
648 ###################################################################
|
|
649
|
|
650
|
|
651
|
|
652 sub get_editing_note{
|
|
653 my ($address_in,$query_seq, $target_seq ,$full_read) = @_;
|
|
654 # target: part of read
|
|
655 # query : original sequence in design file
|
|
656
|
|
657 my ($strand, $q_name,$q_size,$q_start,$q_end,
|
|
658 $t_name,$t_size,$t_start,$t_end,
|
|
659 $block_num,$block_size,$q_start_s,$t_start_s) = @{$address_in};
|
|
660
|
|
661 my @out = ();
|
|
662 #print $strand ,"strand\n";
|
|
663 if($strand eq '+'){
|
|
664 @out = get_alignment($block_num,$block_size,$q_start_s,$t_start_s,$query_seq, $full_read);
|
|
665 }
|
|
666 else{
|
|
667 @out = get_alignment($block_num,$block_size,$q_start_s,$t_start_s,rev_com($query_seq), $full_read);
|
|
668 @out = get_reverse(@out);
|
|
669 }
|
|
670
|
|
671 $out[0]='B'.$out[0].'E';
|
|
672 $out[1]='B'.$out[1].'E';
|
|
673 if($out[2] !~ m/\w/){
|
|
674 $out[2] = 'no_indel';
|
|
675 }
|
|
676 return join("\t",@out);
|
|
677 }
|
|
678
|
|
679
|
|
680
|
|
681 sub get_alignment{
|
|
682 my ($blocks , $blockLengths, $qStarts , $tStarts,$query_seq, $target_seq) = @_;
|
|
683 my @blockSizes = split(/\,/,$blockLengths);
|
|
684 my @qArray = split(/\,/,$qStarts);
|
|
685 my @tArray = split(/\,/,$tStarts);
|
|
686
|
|
687 # target: part of read
|
|
688 # query : original sequence in design file
|
|
689
|
|
690 # print join("\t",@_),"\n";
|
|
691
|
|
692 ### before blocks
|
|
693 my $before_q = substr $query_seq, 0,$qArray[0];
|
|
694 my $before_t = substr $target_seq,0,$tArray[0];
|
|
695
|
|
696
|
|
697 ($before_q,$before_t) = treat_sequence($before_q,$before_t,"before");
|
|
698
|
|
699 ### after blocks
|
|
700 my $after_q = substr $query_seq, $qArray[-1]+$blockSizes[-1];
|
|
701 my $after_t = substr $target_seq,$tArray[-1]+$blockSizes[-1];
|
|
702
|
|
703
|
|
704 ($after_q,$after_t) = treat_sequence($after_q,$after_t,"after") ;
|
|
705
|
|
706 ### blocks
|
|
707 my @med_q = ();
|
|
708 my @med_t = ();
|
|
709
|
|
710 my @indel_out = ();
|
|
711 my $out = '';
|
|
712 ## first block
|
|
713 my $med_q_seq = substr $query_seq,$qArray[0],$blockSizes[0];
|
|
714 my $med_t_seq = substr $target_seq,$tArray[0],$blockSizes[0];
|
|
715 push @med_q,$med_q_seq;
|
|
716 push @med_t,$med_t_seq;
|
|
717
|
|
718 if($blocks>1){
|
|
719 for (my $i= 0;$i<($blocks-1);$i++){
|
|
720 ####### interval
|
|
721 my $inter_q_seq = substr $query_seq,($qArray[$i]+$blockSizes[$i]), ($qArray[$i+1]-($qArray[$i]+$blockSizes[$i]));
|
|
722 my $inter_t_seq = substr $target_seq,($tArray[$i]+$blockSizes[$i]),($tArray[$i+1]-($tArray[$i]+$blockSizes[$i]));
|
|
723
|
|
724 ### count deletion insertion
|
|
725 if(length($inter_t_seq) - length($inter_q_seq)>0){
|
|
726 # to fix the mismatch near deletion length($inter_q_seq)
|
|
727 $out = ($qArray[$i]+$blockSizes[$i]+length($inter_q_seq)).'I'.(length($inter_t_seq) - length($inter_q_seq));
|
|
728 }
|
|
729 if(length($inter_q_seq)-length($inter_t_seq)>0){
|
|
730 $out = ($qArray[$i]+$blockSizes[$i]+length($inter_t_seq)).'D'.(length($inter_q_seq)-length($inter_t_seq));
|
|
731 }
|
|
732 push @indel_out,$out;
|
|
733
|
|
734 ($inter_q_seq,$inter_t_seq) = treat_inter ($inter_q_seq,$inter_t_seq) ;
|
|
735 push @med_q, $inter_q_seq;
|
|
736 push @med_t, $inter_t_seq;
|
|
737
|
|
738 ###### block after interval
|
|
739 $med_q_seq = substr $query_seq,$qArray[$i+1],$blockSizes[$i+1];
|
|
740 $med_t_seq = substr $target_seq,$tArray[$i+1],$blockSizes[$i+1];
|
|
741 push @med_q,$med_q_seq;
|
|
742 push @med_t,$med_t_seq;
|
|
743 }
|
|
744 }
|
|
745
|
|
746 push @med_q,$after_q;
|
|
747 push @med_t,$after_t;
|
|
748
|
|
749 unshift @med_q,$before_q;
|
|
750 unshift @med_t,$before_t;
|
|
751 my $q_string = join('',@med_q);
|
|
752 my $t_string = join('',@med_t);
|
|
753
|
|
754 if($q_string=~m/^(\-+)/){
|
|
755 my $len = length($1);
|
|
756 $q_string = substr $q_string,$len;
|
|
757 $t_string = substr $t_string, $len;
|
|
758 }
|
|
759 if($q_string=~m/(\-+)$/){
|
|
760 my $len = length($1);
|
|
761 $q_string = substr $q_string,0,(-1)*$len;
|
|
762 $t_string = substr $t_string,0, (-1)*$len;
|
|
763 }
|
|
764
|
|
765
|
|
766 return ($q_string,$t_string,join(' ',@indel_out));
|
|
767
|
|
768 }
|
|
769
|
|
770
|
|
771 sub get_reverse{
|
|
772 my ($q_string,$t_string,$indel_string )= @_;
|
|
773 $q_string = rev_com($q_string);
|
|
774 $t_string = rev_com($t_string);
|
|
775
|
|
776 my $string_test = $q_string;
|
|
777 $string_test =~ s/\-//g;
|
|
778 my $len = length($string_test);
|
|
779
|
|
780 my @indel_in = split(/\s/, $indel_string);
|
|
781 my @indel_out = ();
|
|
782 for (@indel_in){
|
|
783
|
|
784 if(m/^(\d+)(.)(\d+)/){
|
|
785 my $ooo = ($len-$1-$3).$2.$3;
|
|
786 if($2 eq 'I'){
|
|
787 $ooo = ($len-$1).$2.$3;
|
|
788 }
|
|
789
|
|
790 unshift @indel_out,$ooo;
|
|
791 }
|
|
792 }
|
|
793 $indel_string = join(' ',@indel_out);
|
|
794 return($q_string,$t_string, $indel_string);
|
|
795 }
|
|
796
|
|
797
|
|
798
|
|
799
|
|
800
|
|
801 sub treat_inter{
|
|
802 my ($q,$t) = @_;
|
|
803 my $q_len = length($q);
|
|
804 my $t_len = length($t);
|
|
805 my $dis = abs($q_len-$t_len);
|
|
806 my @oooo = ();
|
|
807
|
|
808 for(1..$dis){
|
|
809 push @oooo,'-';
|
|
810 }
|
|
811
|
|
812 if($q_len-$t_len >0){
|
|
813 $t = $t.join('',@oooo);
|
|
814 }
|
|
815 else{
|
|
816 $q = $q.join('',@oooo);
|
|
817 }
|
|
818 return ($q,$t);
|
|
819 }
|
|
820
|
|
821
|
|
822
|
|
823 sub treat_sequence{
|
|
824 my ($q,$t,$position) = @_;
|
|
825 my $q_len= length($q);
|
|
826 my $t_len = length($t);
|
|
827 my $dis = abs($q_len-$t_len);
|
|
828
|
|
829 if($dis > 0){
|
|
830 my @oooo = ();
|
|
831 for(1..$dis){
|
|
832 push @oooo,'-';
|
|
833 }
|
|
834 if($q_len>$t_len){
|
|
835 if($position eq 'before'){
|
|
836 $t = join('',@oooo).$t ;
|
|
837 }
|
|
838 else{
|
|
839 $t = $t.join('',@oooo) ;
|
|
840 }
|
|
841 }
|
|
842 else{ # q small
|
|
843 if($position eq 'before'){
|
|
844 $q = join('',@oooo).$q ;
|
|
845 }
|
|
846 else{
|
|
847 $q = $q.join('',@oooo) ;
|
|
848 }
|
|
849 }
|
|
850 }
|
|
851 return ($q,$t);
|
|
852 }
|
|
853
|
|
854
|
|
855 sub psl2bed {
|
|
856 my $psl_file = shift @_; # file in @psl_file_data
|
|
857 my @psl_array = ();
|
|
858
|
|
859
|
|
860 open (FILEINBLAT,"$psl_file") || die "Can not open PSL file, Please check BLAT software is working\n";
|
|
861
|
|
862 ## should be correct
|
|
863
|
|
864 for(1..5){
|
|
865 my $line=<FILEINBLAT>;
|
|
866 ## remove head
|
|
867 }
|
|
868 ## read psl
|
|
869 while (<FILEINBLAT>)
|
|
870 {
|
|
871 my $lineIn = $_;
|
|
872 chomp $lineIn;
|
|
873 my @lineArr = split ("\t", $lineIn);
|
|
874
|
|
875 my $q_size = $lineArr[10];
|
|
876 my $m_size = $lineArr[0];
|
|
877
|
|
878 if( $m_size/$q_size > 0.20){
|
|
879 push @psl_array,[@lineArr];
|
|
880 }
|
|
881
|
|
882 }# while file
|
|
883 close(FILEINBLAT);
|
|
884 ## score large to small
|
|
885
|
|
886 #@psl_array = sort {$b->[0] <=>$a->[0]}@psl_array ;
|
|
887
|
|
888 my @out_array = ();
|
|
889 my $psl_line_num = 0;
|
|
890 my %psl_hash = ();
|
|
891 for (@psl_array)
|
|
892 {
|
|
893 my @lineArr = @{$_};
|
|
894 $psl_line_num++;
|
|
895 my $read_target = $lineArr[13];
|
|
896 my $query = $lineArr[9];
|
|
897 for(1..8){
|
|
898 shift @lineArr;
|
|
899 }
|
|
900
|
|
901 if(not exists $psl_hash{$read_target}{$query}){ ## all alignment
|
|
902 #if(not exists $psl_hash{$read_target}){ ## only keep the best hit
|
|
903 $psl_hash{$read_target}{$query} = \@lineArr;
|
|
904 }
|
|
905 }# while array
|
|
906 #print $psl_line_num." total psl lines\n";
|
|
907 return (\%psl_hash);
|
|
908 }
|
|
909
|
|
910
|
|
911
|
|
912
|
|
913 sub fasta2fasta {
|
|
914 my $input_file = shift @_;
|
|
915 my $out_file = shift @_;
|
|
916
|
|
917 open IN,"<$input_file " or die "File $input_file not found error here\n";
|
|
918 open TGT,">$out_file" or die "File $out_file not found error here\n";
|
|
919 my $c = 0;
|
|
920 my @line = ();
|
|
921 my $id ='';
|
|
922 my $seq = '';
|
|
923 my $processed = 0;
|
|
924
|
|
925 while(<IN>){
|
|
926 chomp;
|
|
927 if(m/^\>/){
|
|
928 $processed ++;
|
|
929 if($processed == 1){
|
|
930 my @seq_id = split(/\s+/, $_);
|
|
931 my $seq_id = $seq_id[0];
|
|
932 $seq_id =~ s/\_//g;
|
|
933 $seq_id = $seq_id.'_1';
|
|
934 print TGT '>'.$seq_id,"\n";
|
|
935 }
|
|
936 else{
|
|
937 $seq =~ s/\s|\n|\r//g;
|
|
938 print TGT $seq,"\n";
|
|
939 $seq = '';
|
|
940 my @seq_id = split(/\s+/, $_);
|
|
941 my $seq_id = $seq_id[0];
|
|
942 $seq_id =~ s/\_//g;
|
|
943 $seq_id = $seq_id.'_1';
|
|
944 print TGT '>'.$seq_id,"\n";
|
|
945 }
|
|
946
|
|
947 }
|
|
948 else{
|
|
949 $seq = $seq.$_;
|
|
950 }
|
|
951 }
|
|
952 # last records
|
|
953 $seq =~ s/\s|\n|\r//g;
|
|
954 print TGT $seq,"\n";
|
|
955
|
|
956 close IN;
|
|
957 close TGT;
|
|
958 return $processed;
|
|
959 }
|
|
960
|
|
961
|
|
962
|
|
963 sub fastq2fasta {
|
|
964 my $input_file = shift @_;
|
|
965 my $out_file = shift @_;
|
|
966
|
|
967 open IN,"<$input_file " or die "File $input_file not found error here\n";
|
|
968 open TGT,">$out_file" or die "File $out_file not found error here\n";
|
|
969 my $c = 0;
|
|
970 my @line = ();
|
|
971 my $id ='';
|
|
972 my $seq = '';
|
|
973 my $processed = 0;
|
|
974 my %read_hash = ();
|
|
975
|
|
976 ##############################################################################################
|
|
977
|
|
978 while(<IN>){
|
|
979 chomp;
|
|
980 $c++;
|
|
981 if($c == 1){
|
|
982 $processed++;
|
|
983 # print STDERR "$processed reads processed\r";
|
|
984 @line = split();
|
|
985 $id = $line[0];
|
|
986 $id =~ s/\@//;
|
|
987 }elsif($c == 2){
|
|
988 $seq = $_;
|
|
989 }elsif($c == 4){
|
|
990 $c = 0;
|
|
991 #print TGT ">$id\n$seq\n";
|
|
992 # print TGT ">seq_$processed\n$seq\n";
|
|
993 $read_hash{$seq}{$id} = 1;
|
|
994 $id ='';
|
|
995 @line =();
|
|
996 $seq ='';
|
|
997 }else{}
|
|
998 }
|
|
999
|
|
1000 ## write TGT
|
|
1001 my $count = 0;
|
|
1002 for my $seq_out (sort keys %read_hash){
|
|
1003 $count++;
|
|
1004 my @hits = keys %{$read_hash{$seq_out}};
|
|
1005 my $num = @hits+0;
|
|
1006 my $id_out = "R".$count.'_'.$num;
|
|
1007 print TGT ">$id_out\n", $seq_out ,"\n";
|
|
1008 }
|
|
1009
|
|
1010 close IN;
|
|
1011 close TGT;
|
|
1012 return $processed;
|
|
1013 }
|
|
1014
|
|
1015
|
|
1016 sub check_read_num{
|
|
1017 my $file_in = shift @_;
|
|
1018 # check read number
|
|
1019
|
|
1020 open FAS_in,"$file_in" || die "can not read fasta file\n";
|
|
1021 my $seq_num = 0;
|
|
1022 while (<FAS_in>){
|
|
1023 if( m/^\>/){
|
|
1024 $seq_num ++;
|
|
1025 }
|
|
1026 if( $seq_num>1){
|
|
1027 last;
|
|
1028 }
|
|
1029 }
|
|
1030
|
|
1031 return $seq_num;
|
|
1032
|
|
1033 }
|
|
1034
|
|
1035
|
|
1036
|
|
1037 sub check_file_type {
|
|
1038 my $file_in = shift @_;
|
|
1039 my $out = 'no';
|
|
1040
|
|
1041 open FAS_in, "$file_in" || die "can not read fasta file\n";
|
|
1042
|
|
1043 while (my $line = <FAS_in>){
|
|
1044 if($line =~ m/\w/){
|
|
1045 if($line =~ m/^\>/){
|
|
1046 $out = 'fa';
|
|
1047 last;
|
|
1048 }
|
|
1049 if($line =~ m/^\@/){
|
|
1050 $out = 'fq';
|
|
1051 last;
|
|
1052 }
|
|
1053 }
|
|
1054 }
|
|
1055
|
|
1056 return $out;
|
|
1057 }
|
|
1058
|
|
1059
|
|
1060 sub rev_com {
|
|
1061 my $seq = shift @_;
|
|
1062 $seq = reverse($seq);
|
|
1063 $seq =~ tr/ACGT/TGCA/;
|
|
1064 return($seq);
|
|
1065 }
|
|
1066
|
|
1067
|