Mercurial > repos > lxue > ageseq
changeset 0:a9c5e846dd76 draft
Perl code
| author | lxue | 
|---|---|
| date | Fri, 15 May 2015 16:37:02 -0400 | 
| parents | |
| children | fafe271a0842 | 
| files | AGEseq_web.pl | 
| diffstat | 1 files changed, 1057 insertions(+), 0 deletions(-) [+] | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/AGEseq_web.pl Fri May 15 16:37:02 2015 -0400 @@ -0,0 +1,1057 @@ +#!/usr/bin/perl +use strict; +my $file_design = $ARGV[0]; +my $read_file = $ARGV[1]; +my $mismatch_cutoff = $ARGV[2]; # mismatch rate to filter low quality alignment, default 0.1 (10 %) +my $min_cutoff = $ARGV[3]; # cutoff to filter reads with low abundance, default 0 +my $wt_like_report = $ARGV[4]; # report top xx WT like records, default 20 +my $indel_report = $ARGV[5]; # report top xx records with indel, default 50 +my $final_out = $ARGV[6]; + +#my $blat = ''; # working directory or PATH +my $blat = '/usr/local/bin/blat'; # Your prefered full location + +my $rand = rand 1000000; +## setting for reports +if(not defined ($mismatch_cutoff )){ + $mismatch_cutoff = 0.1 ; # mismatch rate to filter low quality alignment, default 0.1 (10 %) +} + +if(not defined ($min_cutoff )){ + $min_cutoff = 0 ; # cutoff to filter reads with low abundance, default 0 +} + +if(not defined ($wt_like_report )){ + $wt_like_report = 20 ; # report top xx WT like records, default 20 +} + +if(not defined ($indel_report )){ + $indel_report = 50 ; # report top xx records with indel, default 50 +} + + +my $remove_files = 1 ; # keep (0) or delete (1) intermediate files, default = 1 + +############################################################### +# default setting + +my $remove = 'rm'; +my $osname = $^O; + +my $design_fas = '/tmp/fasta'.$rand.'_DESIGN.fa'; +my @psl_file_data = (); + +if(not defined ($file_design )){ + die "Design file is needed\n"; +} + +if(not defined ($final_out )){ + $final_out = 'AGE_output.txt'; +} +########################################################## +### step 1 load design file + +open DESIGN,"<$file_design" or die "File $file_design not found error here\n"; +open DESIGNFAS,">$design_fas" or die "can not wirte $design_fas not found error here\n"; + +my $num = 0; +my %design_hash = (); + + +while (my $line = <DESIGN>) { + $num ++; + if($num == 1){ + next; + } + if($line !~ m/\w/){ + next; + } + + chomp $line; + my @temp = split(/\t/,$line); + my $id = $temp[0]; + my $seq = $temp[1]; + $seq =~ s/\s+//g; + $seq =~ m/([^\r\n]+)/; + $seq = $1; + $id =~ s/\s+/\_/g; + + $design_hash{$id} = $seq; + print DESIGNFAS ">$id\n$seq\n"; +} +close(DESIGNFAS); + + +####################################################### +############## step 2 - load read files ############# + +my %total_read_num = (); +my $num_c = 0; +my @fasta_files = (); + +my $file_in = $read_file; +my $fasta_out = '/tmp/fasta'.$rand.'_Read.fa'; + + +my $type = check_file_type($file_in ) ; + +if($type eq 'fq'){ + # fastq + # print "this is fastq file\n"; + my $total_num = &fastq2fasta($file_in,$fasta_out); + $total_read_num{$fasta_out} = $total_num; +} +elsif($type eq 'fa'){ + # fasta + # print "this is fasta file\n"; + my $total_num = &fasta2fasta($file_in,$fasta_out); + # load number + $total_read_num{$fasta_out} = $total_num; +} +else{ + die "Please check the format of read file\n"; +} + + +######################################################## +## Here will put step3 to step 5 into one loop + + +open (REPORT,">$final_out") || die "cannot write\n"; + + + my $file_blat_in = $fasta_out; + + ############ step 3 - blat ########################### + my $blat_out = '/tmp/fasta'.$rand.'_blat_crispr.psl'; + + my $command_line = $blat.' '.' -tileSize=7 -oneOff=1 -maxGap=20 -minIdentity=70 -minScore=20 '.$file_blat_in.' '. $design_fas.' '.$blat_out .' >/tmp/blat.txt'; + + system("$command_line"); + + #print "blat job $file_blat_in is done\n"; + ########## step 4 - convert psl -> bed ############## + + my $bed_hash_address = &psl2bed ($blat_out ); ## address of one hash + + ########## step 5 - get sequences number from bed ############## + + my $read_num_address = &MAIN ($file_blat_in,$bed_hash_address);## address of one hash + + + ## remove files + +if($remove_files == 1){ + $command_line = $remove.' '.$file_blat_in; + system("$command_line"); + + $command_line = $remove.' '.$blat_out ; + system("$command_line"); + + my $command_line2 = $remove.' '.$design_fas ; + system("$command_line2"); +} + + +######################################################################################### +## functions + + +sub MAIN{ + my $fas_file_in = shift @_; + my $bed_file_address_in = shift @_; + + my %bed_hash_in = %{$bed_file_address_in}; + + + ######################################################### + # read convereted reads fasta file + + open (FAS,"$fas_file_in ")||die "can not open fasta file: $fas_file_in "; + + my %query_with_seq2num = (); + my %seq2alignment = (); + my %seq_part2assignment = (); + + while (my $lineIn =<FAS>){ + chomp $lineIn; + if($lineIn =~ m/^\>/){ + my $id = substr $lineIn,1; + + my @numbers = split(/\_/, $id); + my $total_num = $numbers[-1]; + + $lineIn = <FAS>; + chomp $lineIn; + my $seq = $lineIn; + + if(exists $bed_hash_in{$id}){ + # $psl_hash{$read_target}{$query} # format of hash + ## asign read to target reference + + ########################################################## + my @read_asignment = (); + ## loop for best + for my $read_key (keys %{$bed_hash_in{$id}}){ + ### get annotation from array + ## Q: design T: read + my @get_array = @{$bed_hash_in{$id}{$read_key}}; + my ($strand, $q_name,$q_size,$q_start,$q_end, + $t_name,$t_size,$t_start,$t_end, + $block_num,$block_size,$q_start_s,$t_start_s) = @get_array ; + + my $seq_out = substr $seq,$t_start,($t_end-$t_start); + + if($strand eq '-'){ + $seq_out = &rev_com ($seq_out); + } + ## get orignal + my $ref_ori_seq = ''; + if(exists $design_hash{$q_name}){ + $ref_ori_seq = $design_hash{$q_name}; + } + + + my $edit_note = &get_editing_note(\@get_array,$ref_ori_seq, $seq_out,$seq); + # target # part of read + my ($align_ref,$align_read,$note) = split(/\t/, $edit_note); + + my $len_ori = length($ref_ori_seq); + my @align_1 = split(//, $align_ref); + my @align_2 = split(//, $align_read); + my $site_snp = 0; + my @site_snp = (); + my $iden_num = 0; + + for my $a1(@align_1){ + my $a2 = shift @align_2; + if($a1 ne '-' and $a2 ne '-'){ + if($a1 ne $a2){ + push @site_snp, $site_snp; + } + } + + if($a1 eq $a2){ + $iden_num++; + } + + $site_snp++; + } + $iden_num = $iden_num-2; + + ## filter on SNP number + my $snp_num_for_filter = @site_snp+0; + + my $dis_max = 0; + my $dis_current = 0; + my @continuous_site_temp = (); + my @continuous_site = (); + my @site_snp_checking = @site_snp; + my $start = shift @site_snp_checking ; + for my $site_checking (@site_snp_checking){ + my $dis_in = $site_checking -$start; + if($dis_in <4){ + $dis_current++; + push @continuous_site_temp,$site_checking ; + if($dis_current > $dis_max){ + $dis_max = $dis_current; + @continuous_site = @continuous_site_temp; + } + } + else{ + $dis_current = 0; + @continuous_site_temp = (); + + } + + $start = $site_checking; + } + + + if( ($snp_num_for_filter/$len_ori) > 0.5){ # first filtering + if($dis_max <5){ + next; + } + } + + + if($dis_max>=5){ + @site_snp = (); + my @align_x = split(//, $align_read); + for my $iii(@continuous_site ){ + $align_x[$iii] = 'N'; + } + $align_read = join('',@align_x); + $note = 'strange editing'; + my @ttt = ($align_ref,$align_read,$note) ; + + $edit_note = join("\t", @ttt ); + } + + + + + my @ooo = ($iden_num,$ref_ori_seq,$seq_out,$edit_note,\@site_snp,\@get_array); + push @read_asignment,[@ooo]; + }## for loop for best hit of each fasta reads + + ## get the best hits + + if(@read_asignment<1){ + next; + } + + ## assign the first hit + @read_asignment = sort {$b->[0] <=> $a->[0]}@read_asignment ; + my ($iden_num,$ref_ori_seq,$seq_out,$edit_note,$array_snp_addr,$bed_array_addr) = @{$read_asignment[0]}; + + my @snp_filtering = @{$array_snp_addr}; + + if((@snp_filtering+0)/length($ref_ori_seq) >$mismatch_cutoff){ + next; + } + + + ## after asignment + + ## check whether seq_out is assigned earlier + ## $seq_out has been asigned to one record + ## just assign reads to that one althogh it may be get one new assignment + if( exists $seq_part2assignment{$seq_out}){ + my $first_assignment = $seq_part2assignment{$seq_out}; + $query_with_seq2num{$first_assignment}{$seq_out} = $query_with_seq2num{$first_assignment}{$seq_out}+$total_num; + next; + } + + + # number for part of reads + my ($strand, $q_name,$q_size,$q_start,$q_end, + $t_name,$t_size,$t_start,$t_end, + $block_num,$block_size,$q_start_s,$t_start_s) = @{$bed_array_addr}; + + # this the first asignment + if(not exists $seq2alignment{$seq_out}){ + my @out = ($ref_ori_seq,$edit_note,$array_snp_addr); + $seq2alignment{$seq_out} = \@out; + + } + # assigned + if(not exists $seq_part2assignment{$seq_out}) { + $seq_part2assignment{$seq_out} = $q_name; + $query_with_seq2num{$q_name}{$seq_out} = $total_num; + } + + + + }# exists bed file + }# lineIN + }# while file + close(FAS); + + ### summary data for output + my ($hash_addr1, $hash_addr2) = &get_out_buffer($fas_file_in,\%query_with_seq2num, \%seq2alignment); + + ### write output + + &write_buffer_out($hash_addr1, $hash_addr2); + +} # Main function + + + + +sub get_out_buffer{ + my $fas_file_in = $_[0]; + my %query_with_seq2num_in = %{$_[1]}; + my %seq2alignment_in = %{$_[2]}; + + my %buffer_out = (); + my %buffer_num = (); + + my $total_non_redun = 0; + ## push output in buffer before printing + my $test_case_num = $total_read_num{$fas_file_in}; + + for my $ref_name (sort keys %query_with_seq2num_in){ + my %in_hash = %{$query_with_seq2num_in{$ref_name}}; + + ## each group + my $report_wt_count = 0; + my $report_indel_count = 0; + my $other_num = 0; + my $sub_hit_num = 0; + my $indel_hit_num = 0; + + + my @other_record = (); + ### loop for records + for my $part_of_read (sort {$in_hash{$b} <=> $in_hash{$a}} keys %in_hash){ + + my $hitnum = $in_hash{$part_of_read}; + + if($hitnum < $min_cutoff) { + next; + } + + if($hitnum/$test_case_num < 0.00005) { + next; + } + + ## get Editing Note for output only + + if(exists $seq2alignment_in{$part_of_read}){ + my ($ref_ori_seq,$edit_note,$array_snp_addr)= @{$seq2alignment_in{$part_of_read}}; + my ($align_ref,$align_read,$note) = split(/\t/,$edit_note); + my @snp_out = @{$array_snp_addr}; + my $snp_num = @snp_out +0; + # deep sequencing SNP + if($test_case_num >500 and $hitnum <=2 and $snp_num/length($ref_ori_seq)>0.05){ + next; + } + + my $snp_note = ''; + if($snp_num>0 ){ + $snp_note = $snp_num.' SNP('.join(',',@snp_out).')'; + } + + $sub_hit_num = $sub_hit_num+$hitnum; + $total_non_redun = $total_non_redun + $hitnum; + + + # output here + if($ref_ori_seq ne $design_hash{$ref_name} ){ + print "error in read assingment \n"; + } + + + my @out_line = ($fas_file_in,$ref_name,$ref_ori_seq, $part_of_read,$hitnum, $align_ref,$align_read,$note,$snp_note); + ## indel + if($edit_note !~ m/no.+indel/ ){ + + if((($test_case_num >500 and $hitnum <=3) or ($hitnum /$test_case_num <0.001 ) ) and $edit_note =~ m/strange/){ + next; # filter strange case in deep sequencing with few reads + } + + + $report_indel_count ++; + # total indel + $indel_hit_num = $indel_hit_num + $hitnum; + + if($report_indel_count <= $indel_report ){ + #print REPORT join("\t",@out_line),"\n"; + ## in buffer + if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){ + my @bbb = (); + push @bbb, [@out_line]; + $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb; + } + else{ + push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@out_line]; + } + } + else{ + $other_num = $other_num+$hitnum; + @other_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'others',$hitnum, '-', '-', '-', '-'); + } + + } + else{ + $report_wt_count ++; + if($report_wt_count <= $wt_like_report ){ + #print REPORT join("\t",@out_line),"\n"; + # in buffer + if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){ + my @bbb = (); + push @bbb, [@out_line]; + $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb; + } + else{ + push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@out_line]; + } + } + else{ + $other_num = $other_num+$hitnum; + @other_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'others',$hitnum, '-', '-', '-', '-'); + } + + }# wt like report + + } # if exists + + }# for part of reads + + ## other report + if($other_num>0){ + $other_record[-5] = $other_num; + #print REPORT join("\t",@other_record),"\n"; + + if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){ + my @bbb = (); + push @bbb, [@other_record]; + $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb; + } + else{ + push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@other_record]; + } + } + + #my @summary_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'total_hit:'.$total_non_redun,'sub_hit:'.$sub_hit_num, 'indel_hit:'.$indel_hit_num); + ## total read number + my $total_num = $total_read_num{$fas_file_in}; + + # 0 1 2 + # my @summary_record = ($fas_file_in,$ref_name,'Total Reads: '., 'Total Hits: '.$total_non_redun,'Sub Hits: '.$sub_hit_num, 'Indel Hits: '.$indel_hit_num); + my @summary_record = ('Sub Hits: '.$sub_hit_num, 'Indel Hits: '.$indel_hit_num); + + $buffer_out{$fas_file_in}{$ref_name}{'sum'} = \@summary_record ; + $buffer_num{$fas_file_in}{'total'} = $total_num ; + $buffer_num{$fas_file_in}{'sub'} = $total_non_redun ; + #print REPORT join("\t",@summary_record),"\n"; + + }# for ref_seq + + + return (\%buffer_out,\%buffer_num); +} # sub + + +### write the output + +sub write_buffer_out{ + my %hash_out = %{$_[0]}; + my %hash_out_num = %{$_[1]}; + my $print_tracking = 0; + for my $fas_file_in (sort keys %hash_out){ + + my $total_num = $hash_out_num{$fas_file_in}{'total'} ; + my $total_non_redun = $hash_out_num{$fas_file_in}{'sub'} ; + + for my $ref_name (sort keys %{$hash_out{$fas_file_in }}){ + my @data = @{$hash_out{$fas_file_in}{$ref_name}{"data"}}; + + for (@data){ + $print_tracking++; + if($print_tracking==1){ + print REPORT "INPUT","\t","Target\t","TargetSequence\t","ReadSequence\t","Read#","\t", + "AlignedTarget\t","AlignedRead\t", "Indels\t","SNPs\n"; + } + my @ooo = @{$_}; + $ooo[0] = 'Input1'; + print REPORT join("\t", @ooo),"\n"; + } + + # sum + my @sum_p2 = @{$hash_out{$fas_file_in}{$ref_name}{'sum'}} ; + + + my @sum_p1 = ('Input1',$ref_name,'Total Reads: '.$total_num, 'Total Hits: '.$total_non_redun); + + print REPORT join("\t", @sum_p1),"\t", join("\t", @sum_p2),"\n"; + + } + + + ##### print summary region + print REPORT "Summary \t -----------------------------------------------------\n"; + print REPORT "Sum:INPUT\t","Target\t","AlignedTarget\t","AlignedRead\t","Total Hits\t","Sub Hits\t","Indel or WT Hits\t","Indel or WT rate %\t","Pattern\n"; + + + + for my $ref_name (sort keys %{$hash_out{$fas_file_in }}){ + my @data = @{$hash_out{$fas_file_in}{$ref_name}{"data"}}; + + my $wt_pair = ''."\t".''; + my $indel_pair = ''."\t".''; + my $indel_out = ''; + my $i = 0; + + for (@data){ + $i++; + my @in= @{$_}; + + + if($in[7] !~ m/no\_indel/ and $in[7] ne '-'){ + if($indel_pair eq ''."\t".''){ + $indel_pair = $in[5]."\t".$in[6]; + my @indel_temp = split(/\s+/,$in[7]); + + my @ooo_indel = (); + for my $ooo_in (@indel_temp ){ + if($ooo_in =~ m/I(\d+)/){ + push @ooo_indel, '+'.$1; + } + if($ooo_in =~ m/D(\d+)/){ + push @ooo_indel, '-'.$1; + } + if($ooo_in =~ m/strange/){ + push @ooo_indel, 'strange editing'; + } + } + $indel_out = 'Indel:'.join(',', @ooo_indel); + } + } + else{ + # no indel + if($wt_pair eq ''."\t".''){ + $wt_pair = $in[5]."\t".$in[6]; + } + } + } + + # first record + + + + # sum + my ($sub_string, $indel_string) = @{$hash_out{$fas_file_in}{$ref_name}{'sum'}} ; + my $sub_num ; + my $indel_num; + #print $sub_string. "sub summary\n"; + #print $indel_string ."sub summary\n"; + + + if($sub_string =~ m/\:[^\d]+(\d+)/){ + $sub_num = $1; + } + if($indel_string =~ m/\:[^\d]+(\d+)/){ + $indel_num = $1; + } + + my $rate = int(($indel_num/$sub_num)*10000)/100; + + ## indel + # two columns + my @out_sum = ("Input1",$ref_name, $indel_pair,$total_non_redun,$sub_num,$indel_num,$rate,$indel_out); + print REPORT 'Sum:',join("\t", @out_sum),"\n"; + ## wt like + my $wt_num = $sub_num - $indel_num; + my $wt_rate = 100-$rate; + my @out_sum = ("Input1",$ref_name, $wt_pair ,$total_non_redun,$sub_num,$wt_num,$wt_rate,"WT_like"); + print REPORT 'Sum:',join("\t", @out_sum),"\n"; + + } # records + + print REPORT "\n"; + + } # file +} # sub +################################################################### + + + +sub get_editing_note{ + my ($address_in,$query_seq, $target_seq ,$full_read) = @_; + # target: part of read + # query : original sequence in design file + + my ($strand, $q_name,$q_size,$q_start,$q_end, + $t_name,$t_size,$t_start,$t_end, + $block_num,$block_size,$q_start_s,$t_start_s) = @{$address_in}; + + my @out = (); + #print $strand ,"strand\n"; + if($strand eq '+'){ + @out = get_alignment($block_num,$block_size,$q_start_s,$t_start_s,$query_seq, $full_read); + } + else{ + @out = get_alignment($block_num,$block_size,$q_start_s,$t_start_s,rev_com($query_seq), $full_read); + @out = get_reverse(@out); + } + + $out[0]='B'.$out[0].'E'; + $out[1]='B'.$out[1].'E'; + if($out[2] !~ m/\w/){ + $out[2] = 'no_indel'; + } + return join("\t",@out); +} + + + +sub get_alignment{ + my ($blocks , $blockLengths, $qStarts , $tStarts,$query_seq, $target_seq) = @_; + my @blockSizes = split(/\,/,$blockLengths); + my @qArray = split(/\,/,$qStarts); + my @tArray = split(/\,/,$tStarts); + + # target: part of read + # query : original sequence in design file + + # print join("\t",@_),"\n"; + + ### before blocks + my $before_q = substr $query_seq, 0,$qArray[0]; + my $before_t = substr $target_seq,0,$tArray[0]; + + + ($before_q,$before_t) = treat_sequence($before_q,$before_t,"before"); + + ### after blocks + my $after_q = substr $query_seq, $qArray[-1]+$blockSizes[-1]; + my $after_t = substr $target_seq,$tArray[-1]+$blockSizes[-1]; + + + ($after_q,$after_t) = treat_sequence($after_q,$after_t,"after") ; + + ### blocks + my @med_q = (); + my @med_t = (); + + my @indel_out = (); + my $out = ''; + ## first block + my $med_q_seq = substr $query_seq,$qArray[0],$blockSizes[0]; + my $med_t_seq = substr $target_seq,$tArray[0],$blockSizes[0]; + push @med_q,$med_q_seq; + push @med_t,$med_t_seq; + + if($blocks>1){ + for (my $i= 0;$i<($blocks-1);$i++){ + ####### interval + my $inter_q_seq = substr $query_seq,($qArray[$i]+$blockSizes[$i]), ($qArray[$i+1]-($qArray[$i]+$blockSizes[$i])); + my $inter_t_seq = substr $target_seq,($tArray[$i]+$blockSizes[$i]),($tArray[$i+1]-($tArray[$i]+$blockSizes[$i])); + + ### count deletion insertion + if(length($inter_t_seq) - length($inter_q_seq)>0){ + # to fix the mismatch near deletion length($inter_q_seq) + $out = ($qArray[$i]+$blockSizes[$i]+length($inter_q_seq)).'I'.(length($inter_t_seq) - length($inter_q_seq)); + } + if(length($inter_q_seq)-length($inter_t_seq)>0){ + $out = ($qArray[$i]+$blockSizes[$i]+length($inter_t_seq)).'D'.(length($inter_q_seq)-length($inter_t_seq)); + } + push @indel_out,$out; + + ($inter_q_seq,$inter_t_seq) = treat_inter ($inter_q_seq,$inter_t_seq) ; + push @med_q, $inter_q_seq; + push @med_t, $inter_t_seq; + + ###### block after interval + $med_q_seq = substr $query_seq,$qArray[$i+1],$blockSizes[$i+1]; + $med_t_seq = substr $target_seq,$tArray[$i+1],$blockSizes[$i+1]; + push @med_q,$med_q_seq; + push @med_t,$med_t_seq; + } + } + + push @med_q,$after_q; + push @med_t,$after_t; + + unshift @med_q,$before_q; + unshift @med_t,$before_t; + my $q_string = join('',@med_q); + my $t_string = join('',@med_t); + + if($q_string=~m/^(\-+)/){ + my $len = length($1); + $q_string = substr $q_string,$len; + $t_string = substr $t_string, $len; + } + if($q_string=~m/(\-+)$/){ + my $len = length($1); + $q_string = substr $q_string,0,(-1)*$len; + $t_string = substr $t_string,0, (-1)*$len; + } + + + return ($q_string,$t_string,join(' ',@indel_out)); + +} + + +sub get_reverse{ + my ($q_string,$t_string,$indel_string )= @_; + $q_string = rev_com($q_string); + $t_string = rev_com($t_string); + + my $string_test = $q_string; + $string_test =~ s/\-//g; + my $len = length($string_test); + + my @indel_in = split(/\s/, $indel_string); + my @indel_out = (); + for (@indel_in){ + + if(m/^(\d+)(.)(\d+)/){ + my $ooo = ($len-$1-$3).$2.$3; + if($2 eq 'I'){ + $ooo = ($len-$1).$2.$3; + } + + unshift @indel_out,$ooo; + } + } + $indel_string = join(' ',@indel_out); + return($q_string,$t_string, $indel_string); +} + + + + + +sub treat_inter{ + my ($q,$t) = @_; + my $q_len = length($q); + my $t_len = length($t); + my $dis = abs($q_len-$t_len); + my @oooo = (); + + for(1..$dis){ + push @oooo,'-'; + } + + if($q_len-$t_len >0){ + $t = $t.join('',@oooo); + } + else{ + $q = $q.join('',@oooo); + } + return ($q,$t); +} + + + +sub treat_sequence{ + my ($q,$t,$position) = @_; + my $q_len= length($q); + my $t_len = length($t); + my $dis = abs($q_len-$t_len); + + if($dis > 0){ + my @oooo = (); + for(1..$dis){ + push @oooo,'-'; + } + if($q_len>$t_len){ + if($position eq 'before'){ + $t = join('',@oooo).$t ; + } + else{ + $t = $t.join('',@oooo) ; + } + } + else{ # q small + if($position eq 'before'){ + $q = join('',@oooo).$q ; + } + else{ + $q = $q.join('',@oooo) ; + } + } + } + return ($q,$t); +} + + +sub psl2bed { + my $psl_file = shift @_; # file in @psl_file_data + my @psl_array = (); + + + open (FILEINBLAT,"$psl_file") || die "Can not open PSL file, Please check BLAT software is working\n"; + + ## should be correct + + for(1..5){ + my $line=<FILEINBLAT>; + ## remove head + } + ## read psl + while (<FILEINBLAT>) + { + my $lineIn = $_; + chomp $lineIn; + my @lineArr = split ("\t", $lineIn); + + my $q_size = $lineArr[10]; + my $m_size = $lineArr[0]; + + if( $m_size/$q_size > 0.20){ + push @psl_array,[@lineArr]; + } + + }# while file + close(FILEINBLAT); + ## score large to small + + #@psl_array = sort {$b->[0] <=>$a->[0]}@psl_array ; + + my @out_array = (); + my $psl_line_num = 0; + my %psl_hash = (); + for (@psl_array) + { + my @lineArr = @{$_}; + $psl_line_num++; + my $read_target = $lineArr[13]; + my $query = $lineArr[9]; + for(1..8){ + shift @lineArr; + } + + if(not exists $psl_hash{$read_target}{$query}){ ## all alignment + #if(not exists $psl_hash{$read_target}){ ## only keep the best hit + $psl_hash{$read_target}{$query} = \@lineArr; + } + }# while array + #print $psl_line_num." total psl lines\n"; + return (\%psl_hash); +} + + + + +sub fasta2fasta { + my $input_file = shift @_; + my $out_file = shift @_; + + open IN,"<$input_file " or die "File $input_file not found error here\n"; + open TGT,">$out_file" or die "File $out_file not found error here\n"; + my $c = 0; + my @line = (); + my $id =''; + my $seq = ''; + my $processed = 0; + + while(<IN>){ + chomp; + if(m/^\>/){ + $processed ++; + if($processed == 1){ + my @seq_id = split(/\s+/, $_); + my $seq_id = $seq_id[0]; + $seq_id =~ s/\_//g; + $seq_id = $seq_id.'_1'; + print TGT '>'.$seq_id,"\n"; + } + else{ + $seq =~ s/\s|\n|\r//g; + print TGT $seq,"\n"; + $seq = ''; + my @seq_id = split(/\s+/, $_); + my $seq_id = $seq_id[0]; + $seq_id =~ s/\_//g; + $seq_id = $seq_id.'_1'; + print TGT '>'.$seq_id,"\n"; + } + + } + else{ + $seq = $seq.$_; + } + } + # last records + $seq =~ s/\s|\n|\r//g; + print TGT $seq,"\n"; + + close IN; + close TGT; + return $processed; +} + + + +sub fastq2fasta { + my $input_file = shift @_; + my $out_file = shift @_; + + open IN,"<$input_file " or die "File $input_file not found error here\n"; + open TGT,">$out_file" or die "File $out_file not found error here\n"; + my $c = 0; + my @line = (); + my $id =''; + my $seq = ''; + my $processed = 0; + my %read_hash = (); + + ############################################################################################## + + while(<IN>){ + chomp; + $c++; + if($c == 1){ + $processed++; + # print STDERR "$processed reads processed\r"; + @line = split(); + $id = $line[0]; + $id =~ s/\@//; + }elsif($c == 2){ + $seq = $_; + }elsif($c == 4){ + $c = 0; + #print TGT ">$id\n$seq\n"; + # print TGT ">seq_$processed\n$seq\n"; + $read_hash{$seq}{$id} = 1; + $id =''; + @line =(); + $seq =''; + }else{} + } + + ## write TGT + my $count = 0; + for my $seq_out (sort keys %read_hash){ + $count++; + my @hits = keys %{$read_hash{$seq_out}}; + my $num = @hits+0; + my $id_out = "R".$count.'_'.$num; + print TGT ">$id_out\n", $seq_out ,"\n"; + } + + close IN; + close TGT; + return $processed; +} + + +sub check_read_num{ + my $file_in = shift @_; + # check read number + + open FAS_in,"$file_in" || die "can not read fasta file\n"; + my $seq_num = 0; + while (<FAS_in>){ + if( m/^\>/){ + $seq_num ++; + } + if( $seq_num>1){ + last; + } + } + + return $seq_num; + +} + + + +sub check_file_type { + my $file_in = shift @_; + my $out = 'no'; + + open FAS_in, "$file_in" || die "can not read fasta file\n"; + + while (my $line = <FAS_in>){ + if($line =~ m/\w/){ + if($line =~ m/^\>/){ + $out = 'fa'; + last; + } + if($line =~ m/^\@/){ + $out = 'fq'; + last; + } + } + } + + return $out; +} + + +sub rev_com { + my $seq = shift @_; + $seq = reverse($seq); + $seq =~ tr/ACGT/TGCA/; + return($seq); +} + +
