Mercurial > repos > lxue > ageseq

#!/usr/bin/perl
use strict;
my $file_design       =  $ARGV[0];
my $read_file         =  $ARGV[1];
my $mismatch_cutoff   =  $ARGV[2];   # mismatch rate to filter low quality alignment, default 0.1 (10 %)
my $min_cutoff        =  $ARGV[3];   # cutoff to filter reads with low abundance, default  0
my $wt_like_report    =  $ARGV[4];   # report top xx WT like records, default 20
my $indel_report      =  $ARGV[5];   # report top xx records with indel, default  50
my $final_out         =  $ARGV[6];

#my $blat = '';   # working directory or PATH
my $blat = '/usr/local/bin/blat';  # Your prefered full location

my $rand = rand 1000000;
## setting for reports
if(not defined ($mismatch_cutoff )){
    $mismatch_cutoff   = 0.1 ;    # mismatch rate to filter low quality alignment, default 0.1 (10 %)
}

if(not defined ($min_cutoff )){
	$min_cutoff   = 0 ;   # cutoff to filter reads with low abundance, default  0
}

if(not defined ($wt_like_report )){
	$wt_like_report    = 20 ; # report top xx WT like records, default 20
}

if(not defined ($indel_report )){
	 $indel_report      = 50   ;    # report top xx records with indel, default  50
}


my $remove_files      =  1  ;  # keep (0) or delete (1) intermediate files, default = 1

###############################################################
# default setting

my $remove = 'rm';
my $osname = $^O;

my  $design_fas = '/tmp/fasta'.$rand.'_DESIGN.fa';
my  @psl_file_data = ();

if(not defined ($file_design )){
	die "Design file is needed\n";
}

if(not defined ($final_out )){
	$final_out = 'AGE_output.txt';
}
##########################################################
###  step 1 load design file

open DESIGN,"<$file_design" or die "File  $file_design  not found error here\n";
open DESIGNFAS,">$design_fas" or die "can not wirte  $design_fas  not found error here\n";

my $num = 0;
my %design_hash = ();


while (my $line = <DESIGN>) {
	  $num ++;
	  if($num == 1){
	  	 next;
	  }
	  if($line !~ m/\w/){
	  	 next;
	  }

	  chomp  $line;
	  my @temp = split(/\t/,$line);
	  my $id  = $temp[0];
	  my $seq = $temp[1];
	  $seq =~ s/\s+//g;
	  $seq =~ m/([^\r\n]+)/;
	  $seq = $1;
	  $id  =~ s/\s+/\_/g;

	  $design_hash{$id} = $seq;
	  print DESIGNFAS ">$id\n$seq\n";
}
close(DESIGNFAS);


#######################################################
##############  step 2 - load read files  #############

my %total_read_num = ();
my $num_c          = 0;
my @fasta_files    = ();

my $file_in   = $read_file;
my $fasta_out = '/tmp/fasta'.$rand.'_Read.fa';


my $type = check_file_type($file_in )   ;

if($type eq 'fq'){
	# fastq
  # print "this is fastq file\n";
    my $total_num  = &fastq2fasta($file_in,$fasta_out);
	$total_read_num{$fasta_out} = $total_num;
}
elsif($type eq 'fa'){
	 # fasta
	# print "this is fasta file\n";
	 my $total_num  =  &fasta2fasta($file_in,$fasta_out);
	         # load number
	 $total_read_num{$fasta_out} = $total_num;
}
else{
	die "Please check the format of read file\n";
}


########################################################
## Here will put step3 to step 5 into one loop


open (REPORT,">$final_out") || die "cannot write\n";


 my $file_blat_in =  $fasta_out;

     ############  step 3 - blat  ###########################
 my $blat_out        =  '/tmp/fasta'.$rand.'_blat_crispr.psl';

	 my $command_line =  $blat.' '.' -tileSize=7 -oneOff=1  -maxGap=20 -minIdentity=70 -minScore=20 '.$file_blat_in.' '.  $design_fas.' '.$blat_out .' >/tmp/blat.txt';

	 system("$command_line");

	 #print "blat job $file_blat_in is done\n";
	 ##########  step 4 - convert psl -> bed  ##############

	 my $bed_hash_address = &psl2bed ($blat_out );	## address of one hash

	 ##########  step 5 - get sequences number from bed ##############

	 my $read_num_address = &MAIN ($file_blat_in,$bed_hash_address);## address of one hash


	 ## remove files

if($remove_files == 1){
	 	    $command_line = $remove.' '.$file_blat_in;
	        system("$command_line");

	        $command_line = $remove.' '.$blat_out ;
	        system("$command_line");

	       	 my $command_line2 = $remove.' '.$design_fas ;
	       system("$command_line2");
}


#########################################################################################
## functions


sub MAIN{
	   my $fas_file_in         = shift @_;
	   my $bed_file_address_in = shift @_;

	   my %bed_hash_in = %{$bed_file_address_in};


	   #########################################################
	   # read convereted reads fasta file

        open (FAS,"$fas_file_in ")||die "can not open fasta file: $fas_file_in ";

        my %query_with_seq2num  = ();
        my %seq2alignment = ();
        my %seq_part2assignment = ();

		while (my $lineIn =<FAS>){
			 chomp $lineIn;
			 if($lineIn =~ m/^\>/){
					my $id = substr $lineIn,1;

					my @numbers = split(/\_/, $id);
					my $total_num = $numbers[-1];

					$lineIn = <FAS>;
					chomp $lineIn;
					my $seq = $lineIn;

					if(exists $bed_hash_in{$id}){
							  # $psl_hash{$read_target}{$query}  # format of hash
							  ## asign read to target reference

						  ##########################################################
						  my @read_asignment = ();
						  ## loop for best
						  for my $read_key (keys %{$bed_hash_in{$id}}){
						       	### get annotation from array
							    ## Q: design T: read
							    my @get_array = @{$bed_hash_in{$id}{$read_key}};
							    my ($strand, $q_name,$q_size,$q_start,$q_end,
							       	             $t_name,$t_size,$t_start,$t_end,
							       	             $block_num,$block_size,$q_start_s,$t_start_s) =  @get_array ;

							    my $seq_out = substr $seq,$t_start,($t_end-$t_start);

							    if($strand eq '-'){
									  $seq_out = &rev_com ($seq_out);
							    }
							    			 ## get orignal
									 my $ref_ori_seq = '';
									 if(exists $design_hash{$q_name}){
										 	 	   $ref_ori_seq = $design_hash{$q_name};
								     }


							        my $edit_note = &get_editing_note(\@get_array,$ref_ori_seq, $seq_out,$seq);
							                                                     # target   # part of read
				                    my ($align_ref,$align_read,$note) = split(/\t/, $edit_note);

									my $len_ori = length($ref_ori_seq);
									my @align_1 = split(//, $align_ref);
								    my @align_2 = split(//, $align_read);
									my $site_snp = 0;
									my @site_snp = ();
									my $iden_num = 0;

									 for my $a1(@align_1){
									 	 my $a2 = shift @align_2;
									 	 if($a1 ne '-' and $a2 ne '-'){
									 	 	  if($a1 ne $a2){
									 	 	  	  push @site_snp, $site_snp;
									 	 	  }
									 	 }

									 	 if($a1 eq $a2){
									 	 	$iden_num++;
									 	 }

									 	 $site_snp++;
									 }
									 $iden_num = $iden_num-2;

									 ## filter on SNP number
									 my $snp_num_for_filter = @site_snp+0;

									 my $dis_max = 0;
									 my $dis_current = 0;
									 my @continuous_site_temp = ();
									 my @continuous_site = ();
									 my @site_snp_checking = @site_snp;
									 my $start = shift @site_snp_checking ;
									 for my $site_checking (@site_snp_checking){
									 	 my $dis_in = $site_checking -$start;
									 	 if($dis_in <4){
									 	 	 $dis_current++;
									 	 	 push  @continuous_site_temp,$site_checking ;
									 	 	 if($dis_current > $dis_max){
									 	 	 	 $dis_max = $dis_current;
									 	 	 	 @continuous_site = @continuous_site_temp;
									 	 	 }
									 	 }
									 	 else{
									 	 	 $dis_current = 0;
									 	 	 @continuous_site_temp = ();

									 	 }

									 	 $start = $site_checking;
									 }


				                     if( ($snp_num_for_filter/$len_ori) > 0.5){ # first filtering
				                     	if($dis_max <5){
				                     		 next;
				                     	}
				                     }


				                     if($dis_max>=5){
				                     	  @site_snp = ();
				                          my @align_x = split(//, $align_read);
				                          for my $iii(@continuous_site ){
				                          	  $align_x[$iii] = 'N';
				                          }
				                          $align_read = join('',@align_x);
				                     	  $note = 'strange editing';
				                     	  my @ttt = ($align_ref,$align_read,$note) ;

				                     	  $edit_note = join("\t", @ttt );
				                     }


									 my @ooo = ($iden_num,$ref_ori_seq,$seq_out,$edit_note,\@site_snp,\@get_array);
									 push @read_asignment,[@ooo];
						   }## for loop for best hit of each fasta reads

						   ## get the best hits

						 if(@read_asignment<1){
						 	  next;
						 }

				        ##  assign the first hit
				        @read_asignment = sort {$b->[0] <=> $a->[0]}@read_asignment ;
				        my ($iden_num,$ref_ori_seq,$seq_out,$edit_note,$array_snp_addr,$bed_array_addr) = @{$read_asignment[0]};

				        my @snp_filtering = @{$array_snp_addr};

				        if((@snp_filtering+0)/length($ref_ori_seq) >$mismatch_cutoff){
				        	    next;
				        }


				        ## after asignment

				        ## check whether seq_out is assigned earlier
				        ## $seq_out has been asigned to one record
				        ## just assign reads to that one althogh it may be get one new assignment
						if( exists $seq_part2assignment{$seq_out}){
							   my $first_assignment = $seq_part2assignment{$seq_out};
							   $query_with_seq2num{$first_assignment}{$seq_out} = $query_with_seq2num{$first_assignment}{$seq_out}+$total_num;
							   next;
						}


										 # number for part of reads
					    my ($strand, $q_name,$q_size,$q_start,$q_end,
							       	             $t_name,$t_size,$t_start,$t_end,
							       	             $block_num,$block_size,$q_start_s,$t_start_s) = @{$bed_array_addr};

					    # this the first asignment
						if(not exists $seq2alignment{$seq_out}){
								my @out = ($ref_ori_seq,$edit_note,$array_snp_addr);
								$seq2alignment{$seq_out} = \@out;

					    }
					    			# assigned
					    if(not exists $seq_part2assignment{$seq_out})	{
					    	  $seq_part2assignment{$seq_out} = $q_name;
					    	  $query_with_seq2num{$q_name}{$seq_out} = $total_num;
					    }


					}# exists  bed file
			}# lineIN
		}# while file
		close(FAS);

		### summary data for output
		my ($hash_addr1, $hash_addr2) = &get_out_buffer($fas_file_in,\%query_with_seq2num, \%seq2alignment);

		### write output

		&write_buffer_out($hash_addr1, $hash_addr2);

} # Main function


sub get_out_buffer{
	 my $fas_file_in = $_[0];
	 my %query_with_seq2num_in =  %{$_[1]};
	 my %seq2alignment_in       = %{$_[2]};

	 my %buffer_out = ();
	 my %buffer_num = ();

	 my $total_non_redun = 0;
	 ## push output in buffer before printing
	  my $test_case_num = $total_read_num{$fas_file_in};

	 for my $ref_name (sort keys %query_with_seq2num_in){
		 my %in_hash = %{$query_with_seq2num_in{$ref_name}};

		 ## each group
		 my $report_wt_count = 0;
		 my $report_indel_count = 0;
         my $other_num = 0;
         my $sub_hit_num = 0;
         my $indel_hit_num = 0;


		 my @other_record = ();
		 ### loop for records
		 for my $part_of_read (sort {$in_hash{$b} <=> $in_hash{$a}} keys %in_hash){

			 	my $hitnum = $in_hash{$part_of_read};

			 	 if($hitnum < $min_cutoff) {
			 	 	  next;
			 	 }

			 	 if($hitnum/$test_case_num < 0.00005) {
			 	 	  next;
			 	 }

			 	 ## get Editing Note for output only

			   if(exists $seq2alignment_in{$part_of_read}){
				  my ($ref_ori_seq,$edit_note,$array_snp_addr)= @{$seq2alignment_in{$part_of_read}};
			      my ($align_ref,$align_read,$note) = split(/\t/,$edit_note);
					  my @snp_out = @{$array_snp_addr};
					  my $snp_num = @snp_out +0;
					  # deep sequencing SNP
					  if($test_case_num >500 and $hitnum <=2 and $snp_num/length($ref_ori_seq)>0.05){
					  	  next;
					  }

					  my $snp_note = '';
					  if($snp_num>0 ){
				 	        $snp_note = $snp_num.' SNP('.join(',',@snp_out).')';
				      }

				      $sub_hit_num = $sub_hit_num+$hitnum;
			 	      $total_non_redun = $total_non_redun + $hitnum;


			 	      # output here
			 	      if($ref_ori_seq ne $design_hash{$ref_name} ){
			 	      	 print "error in read assingment \n";
			 	      }


			 	   my @out_line = ($fas_file_in,$ref_name,$ref_ori_seq, $part_of_read,$hitnum, $align_ref,$align_read,$note,$snp_note);
					 				 	 ## indel
			 	   if($edit_note !~ m/no.+indel/ ){

			 	   	   if((($test_case_num >500 and $hitnum <=3) or ($hitnum /$test_case_num <0.001 ) ) and $edit_note =~ m/strange/){
			 	   	   	   next;   # filter strange case in deep sequencing with few reads
			 	   	   }


			 	 	   $report_indel_count ++;
			 	 	   # total indel
			 	 	   $indel_hit_num = $indel_hit_num + $hitnum;

			 	 	   if($report_indel_count <= $indel_report ){
			 	 	   	    #print REPORT join("\t",@out_line),"\n";
			 	 	   	    ## in buffer
			 	 	   	    if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){
			 	 	   	    	 my @bbb = ();
			 	 	   	    	 push @bbb, [@out_line];
			 	 	   	    	 $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb;
			 	 	   	    }
			 	 	   	    else{
			 	 	   	    	 push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@out_line];
			 	 	   	    }
			 	 	   }
			 	 	   else{
			 	 	   	    $other_num = $other_num+$hitnum;
			 	 	        @other_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'others',$hitnum, '-', '-', '-', '-');
			 	 	   }

			 	 }
			 	 else{
			 	 	  $report_wt_count ++;
			 	 	  if($report_wt_count <= $wt_like_report  ){
			 	           #print REPORT join("\t",@out_line),"\n";
			 	           # in buffer
			 	           if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){
			 	 	   	    	 my @bbb = ();
			 	 	   	    	 push @bbb, [@out_line];
			 	 	   	    	 $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb;
			 	 	   	    }
			 	 	   	    else{
			 	 	   	    	 push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@out_line];
			 	 	   	    }
			 	      }
			 	      else{
			 	 	       $other_num = $other_num+$hitnum;
			 	 	       @other_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'others',$hitnum, '-', '-', '-', '-');
			 	      }

			 	 }# wt like report

			} # if exists

	    }# for part of reads

	    ## other report
		 if($other_num>0){
		 	  	 	  $other_record[-5]	 = $other_num;
		 	         #print REPORT join("\t",@other_record),"\n";

		 	         if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){
			 	 	   	    	 my @bbb = ();
			 	 	   	    	 push @bbb, [@other_record];
			 	 	   	    	 $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb;
			 	 	 }
			 	     else{
			 	 	   	    	 push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@other_record];
			 	 	 }
		 }

			#my @summary_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'total_hit:'.$total_non_redun,'sub_hit:'.$sub_hit_num, 'indel_hit:'.$indel_hit_num);
				 ## total read number
		   my $total_num =  $total_read_num{$fas_file_in};

		                       #    0           1          2
		  # my @summary_record = ($fas_file_in,$ref_name,'Total Reads: '., 'Total Hits: '.$total_non_redun,'Sub Hits: '.$sub_hit_num, 'Indel Hits: '.$indel_hit_num);
			 my @summary_record = ('Sub Hits: '.$sub_hit_num, 'Indel Hits: '.$indel_hit_num);

			$buffer_out{$fas_file_in}{$ref_name}{'sum'}   =   \@summary_record ;
			$buffer_num{$fas_file_in}{'total'} =   $total_num ;
			$buffer_num{$fas_file_in}{'sub'}   =   $total_non_redun ;
			#print REPORT join("\t",@summary_record),"\n";

	 }# for ref_seq


     return (\%buffer_out,\%buffer_num);
} # sub


### write the output

sub write_buffer_out{
	my %hash_out = %{$_[0]};
	my %hash_out_num = %{$_[1]};
	my $print_tracking = 0;
	for my $fas_file_in (sort keys %hash_out){

	     my  $total_num       = $hash_out_num{$fas_file_in}{'total'} ;
	     my  $total_non_redun = $hash_out_num{$fas_file_in}{'sub'} ;

		 for my $ref_name (sort keys %{$hash_out{$fas_file_in }}){
			 if(not exists $hash_out{$fas_file_in}{$ref_name}{"data"}){
		 	 	# print "No data for $fas_file_in $ref_name \n";
		 	 	 next;
		 	 }

		 	 my @data = @{$hash_out{$fas_file_in}{$ref_name}{"data"}};

		 	 for (@data){
		 	 	  $print_tracking++;
		 	 	  if($print_tracking==1){
		 	 	  	  print REPORT "INPUT","\t","Target\t","TargetSequence\t","ReadSequence\t","Read#","\t",
	                                "AlignedTarget\t","AlignedRead\t", "Indels\t","SNPs\n";
		 	 	  }
		 	 	  my @ooo = @{$_};
		 	 	  $ooo[0] = 'Input1';
		 	 	  print REPORT join("\t", @ooo),"\n";
		 	 }

			 # sum
			 my  @sum_p2 = @{$hash_out{$fas_file_in}{$ref_name}{'sum'}} ;


			 my  @sum_p1 = ('Input1',$ref_name,'Total Reads: '.$total_num, 'Total Hits: '.$total_non_redun);

			 print REPORT join("\t", @sum_p1),"\t", join("\t", @sum_p2),"\n";

		 }


		 ##### print summary region
	  print REPORT "Summary \t -----------------------------------------------------\n";
      print REPORT "Sum:INPUT\t","Target\t","AlignedTarget\t","AlignedRead\t","Total Hits\t","Sub Hits\t","Indel or WT Hits\t","Indel or WT rate %\t","Pattern\n";


		 for my $ref_name (sort keys %{$hash_out{$fas_file_in }}){

		 	 if(not exists $hash_out{$fas_file_in}{$ref_name}{"data"}){
		 	 	# print "No data for $fas_file_in $ref_name \n";
		 	 	 next;
		 	 }
		 	 my @data = @{$hash_out{$fas_file_in}{$ref_name}{"data"}};

		 	 my $wt_pair    = ''."\t".'';
		 	 my $indel_pair = ''."\t".'';
		 	 my $indel_out  = '';
		 	 my $i = 0;

		 	 for (@data){
		 	 	  $i++;
		 	 	  my @in= @{$_};


		 	 	  if($in[7] !~ m/no\_indel/ and $in[7] ne '-'){
		 	 	  	    if($indel_pair eq ''."\t".''){
				 	 	  	    $indel_pair = $in[5]."\t".$in[6];
				 	 	  	    my @indel_temp = split(/\s+/,$in[7]);

				 	 	  	    my @ooo_indel = ();
				 	 	  	    for my $ooo_in (@indel_temp ){
				 	 	  	    	 if($ooo_in =~ m/I(\d+)/){
				 	 	  	   	         push @ooo_indel, '+'.$1;
				 	 	  	         }
				 	 	  	         if($ooo_in =~ m/D(\d+)/){
				 	 	  	   	         push @ooo_indel, '-'.$1;
				 	 	  	         }
				 	 	  	          if($ooo_in =~ m/strange/){
				 	 	  	   	         push @ooo_indel, 'strange editing';
				 	 	  	         }
				 	 	  	    }
				 	 	       $indel_out = 'Indel:'.join(',', @ooo_indel);
		 	 	  	    }
		 	 	  }
		 	 	  else{
		 	 	  	  # no indel
		 	 	  	  if($wt_pair eq ''."\t".''){
		 	 	  	  	      $wt_pair = $in[5]."\t".$in[6];
		 	 	  	  }
		 	 	  }
		 	 }

		 	 # first record


			 # sum
			 my  ($sub_string, $indel_string) = @{$hash_out{$fas_file_in}{$ref_name}{'sum'}} ;
			 my $sub_num ;
			 my $indel_num;
			 #print 		$sub_string. "sub summary\n";
			 #print 		$indel_string ."sub summary\n";


			 if($sub_string =~ m/\:[^\d]+(\d+)/){
			 	    $sub_num = $1;
			 }
			 if($indel_string =~ m/\:[^\d]+(\d+)/){
			 	    $indel_num = $1;
			 }

			 my $rate = int(($indel_num/$sub_num)*10000)/100;

			 ## indel
			                                       # two columns
			 my @out_sum = ("Input1",$ref_name, $indel_pair,$total_non_redun,$sub_num,$indel_num,$rate,$indel_out);
			 print REPORT 'Sum:',join("\t", @out_sum),"\n";
			 ## wt like
			 my $wt_num = $sub_num - $indel_num;
			 my $wt_rate = 100-$rate;
			 my @out_sum = ("Input1",$ref_name, $wt_pair  ,$total_non_redun,$sub_num,$wt_num,$wt_rate,"WT_like");
			 print REPORT 'Sum:',join("\t", @out_sum),"\n";

		 } # records

		 print REPORT "\n";

	} # file
} # sub
###################################################################


sub get_editing_note{
	my ($address_in,$query_seq, $target_seq ,$full_read) = @_;
	# target: part of read
	# query : original sequence in design file

	my ($strand, $q_name,$q_size,$q_start,$q_end,
				$t_name,$t_size,$t_start,$t_end,
				$block_num,$block_size,$q_start_s,$t_start_s) =  @{$address_in};

    my @out = ();
	#print 		$strand ,"strand\n";
    if($strand eq '+'){
	     @out = get_alignment($block_num,$block_size,$q_start_s,$t_start_s,$query_seq, $full_read);
    }
	else{
		 @out = get_alignment($block_num,$block_size,$q_start_s,$t_start_s,rev_com($query_seq), $full_read);
		 @out = get_reverse(@out);
	}

	$out[0]='B'.$out[0].'E';
	$out[1]='B'.$out[1].'E';
	if($out[2] !~ m/\w/){
		$out[2] = 'no_indel';
	}
    return join("\t",@out);
}


sub get_alignment{
	 my ($blocks ,  $blockLengths,   $qStarts ,   $tStarts,$query_seq, $target_seq) =  @_;
	 my @blockSizes = split(/\,/,$blockLengths);
     my @qArray = split(/\,/,$qStarts);
     my @tArray = split(/\,/,$tStarts);

  	# target: part of read
	# query : original sequence in design file

     # print join("\t",@_),"\n";

     ### before blocks
     my $before_q = substr $query_seq, 0,$qArray[0];
     my $before_t = substr $target_seq,0,$tArray[0];


     ($before_q,$before_t) = treat_sequence($before_q,$before_t,"before");

     ### after blocks
     my $after_q = substr $query_seq, $qArray[-1]+$blockSizes[-1];
     my $after_t = substr $target_seq,$tArray[-1]+$blockSizes[-1];


     ($after_q,$after_t) =  treat_sequence($after_q,$after_t,"after") ;

     ### blocks
     my @med_q = ();
     my @med_t = ();

     my @indel_out = ();
     my $out = '';
      ## first block
          my $med_q_seq = substr $query_seq,$qArray[0],$blockSizes[0];
     	  my $med_t_seq = substr $target_seq,$tArray[0],$blockSizes[0];
     	  push @med_q,$med_q_seq;
     	  push @med_t,$med_t_seq;

     if($blocks>1){
     	  for (my $i= 0;$i<($blocks-1);$i++){
     	 	  ####### interval
     	 	  my $inter_q_seq = substr $query_seq,($qArray[$i]+$blockSizes[$i]), ($qArray[$i+1]-($qArray[$i]+$blockSizes[$i]));
     	      my $inter_t_seq = substr $target_seq,($tArray[$i]+$blockSizes[$i]),($tArray[$i+1]-($tArray[$i]+$blockSizes[$i]));

     	      ### count deletion insertion
     	      if(length($inter_t_seq) - length($inter_q_seq)>0){
     	      	  # to fix the mismatch near deletion   length($inter_q_seq)
     	      	  $out = ($qArray[$i]+$blockSizes[$i]+length($inter_q_seq)).'I'.(length($inter_t_seq) - length($inter_q_seq));
     	      }
     	      if(length($inter_q_seq)-length($inter_t_seq)>0){
     	      	  $out = ($qArray[$i]+$blockSizes[$i]+length($inter_t_seq)).'D'.(length($inter_q_seq)-length($inter_t_seq));
     	      }
     	      push @indel_out,$out;

     	      ($inter_q_seq,$inter_t_seq) = treat_inter ($inter_q_seq,$inter_t_seq) ;
     	       push @med_q, $inter_q_seq;
     	       push @med_t, $inter_t_seq;

     	 	  ######  block   after interval
     	 	  $med_q_seq = substr $query_seq,$qArray[$i+1],$blockSizes[$i+1];
     	      $med_t_seq = substr $target_seq,$tArray[$i+1],$blockSizes[$i+1];
     	      push @med_q,$med_q_seq;
     	      push @med_t,$med_t_seq;
     	 }
     }

    	      push @med_q,$after_q;
     	      push @med_t,$after_t;

     	     unshift  @med_q,$before_q;
     	     unshift  @med_t,$before_t;
     	     my $q_string = join('',@med_q);
     	     my $t_string = join('',@med_t);

     	    if($q_string=~m/^(\-+)/){
     	    	my $len = length($1);
     	    	$q_string = substr $q_string,$len;
     	        $t_string = substr $t_string, $len;
     	    }
     	    if($q_string=~m/(\-+)$/){
     	    	my $len = length($1);
     	    	$q_string = substr $q_string,0,(-1)*$len;
     	        $t_string = substr $t_string,0, (-1)*$len;
     	    }


     	    return ($q_string,$t_string,join(' ',@indel_out));

}


sub get_reverse{
		 my ($q_string,$t_string,$indel_string )= @_;
		 $q_string = rev_com($q_string);
		 $t_string = rev_com($t_string);

		 my $string_test = $q_string;
		 $string_test =~ s/\-//g;
		 my $len = length($string_test);

		 my @indel_in =  split(/\s/, $indel_string);
		 my @indel_out = ();
		 for (@indel_in){

		 	  if(m/^(\d+)(.)(\d+)/){
		 	  	my $ooo = ($len-$1-$3).$2.$3;
		 	  	if($2 eq 'I'){
		 	  		 $ooo = ($len-$1).$2.$3;
		 	  	}

		 	  	  unshift @indel_out,$ooo;
		 	  }
		 }
		 $indel_string = join(' ',@indel_out);
		 return($q_string,$t_string, $indel_string);
}


sub treat_inter{
    my ($q,$t) = @_;
	my $q_len = length($q);
	my $t_len = length($t);
	my $dis = abs($q_len-$t_len);
    my @oooo = ();

	for(1..$dis){
			push @oooo,'-';
	}

    if($q_len-$t_len >0){
    	$t = $t.join('',@oooo);
    }
    else{
    	$q = $q.join('',@oooo);
    }
	return ($q,$t);
}


sub treat_sequence{
	my ($q,$t,$position) = @_;
	my $q_len= length($q);
	my $t_len = length($t);
	my $dis = abs($q_len-$t_len);

	if($dis > 0){
		my @oooo = ();
		for(1..$dis){
			push @oooo,'-';
		}
		if($q_len>$t_len){
			if($position eq 'before'){
				 $t = join('',@oooo).$t ;
			}
			else{
				 $t = $t.join('',@oooo) ;
			}
		}
		else{ # q small
		    if($position eq 'before'){
				 $q = join('',@oooo).$q ;
			}
			else{
				 $q = $q.join('',@oooo) ;
			}
		}
	}
	return ($q,$t);
}


sub psl2bed  {
    my $psl_file = shift @_; # file in @psl_file_data
    my @psl_array = ();


    open (FILEINBLAT,"$psl_file") || die "Can not open PSL file, Please check BLAT software is working\n";

    ## should be correct

	for(1..5){
		     my $line=<FILEINBLAT>;
		     ## remove head
	}
	## read psl
	while (<FILEINBLAT>)
    {
		my $lineIn = $_;
		chomp $lineIn;
		my @lineArr  = split ("\t", $lineIn);

		my $q_size   = $lineArr[10];
		my $m_size   = $lineArr[0];

		if( $m_size/$q_size > 0.20){
			 push @psl_array,[@lineArr];
		}

    }# while file
    close(FILEINBLAT);
    ## score large to small

    #@psl_array = sort {$b->[0] <=>$a->[0]}@psl_array ;

    my @out_array = ();
    my $psl_line_num = 0;
    my %psl_hash = ();
	for  (@psl_array)
	{
			my @lineArr  = @{$_};
			$psl_line_num++;
			my $read_target  = $lineArr[13];
			my $query        = $lineArr[9];
			for(1..8){
				 shift  @lineArr;
			}

			if(not exists $psl_hash{$read_target}{$query}){		## all alignment
			#if(not exists $psl_hash{$read_target}){  ## only keep the best hit
				 $psl_hash{$read_target}{$query} = \@lineArr;
			}
	}# while array
	#print $psl_line_num." total psl lines\n";
	return (\%psl_hash);
}


sub fasta2fasta {
    my $input_file = shift @_;
    my $out_file = shift @_;

    open IN,"<$input_file " or die "File $input_file  not found error here\n";
    open TGT,">$out_file" or die "File $out_file not found error here\n";
    my $c = 0;
    my @line = ();
    my $id ='';
    my $seq = '';
    my $processed = 0;

    while(<IN>){
        chomp;
        if(m/^\>/){
        	$processed ++;
        	if($processed == 1){
        		my @seq_id = split(/\s+/, $_);
        		my $seq_id = $seq_id[0];
        		 $seq_id =~ s/\_//g;
        		 $seq_id = $seq_id.'_1';
        		 print TGT '>'.$seq_id,"\n";
        	}
        	else{
        		$seq =~ s/\s|\n|\r//g;
        		print TGT $seq,"\n";
        		$seq = '';
     		    my @seq_id = split(/\s+/, $_);
        		my $seq_id = $seq_id[0];
        		 $seq_id =~ s/\_//g;
        		 $seq_id = $seq_id.'_1';
        		 print TGT '>'.$seq_id,"\n";
        	}

        }
        else{
        	$seq = $seq.$_;
        }
    }
    # last records
    $seq =~ s/\s|\n|\r//g;
    print TGT $seq,"\n";

    close IN;
    close TGT;
    return $processed;
}


sub fastq2fasta {
    my $input_file = shift @_;
    my $out_file = shift @_;

    open IN,"<$input_file " or die "File $input_file  not found error here\n";
    open TGT,">$out_file" or die "File $out_file not found error here\n";
    my $c = 0;
    my @line = ();
    my $id ='';
    my $seq = '';
    my $processed = 0;
    my %read_hash = ();

    ##############################################################################################

    while(<IN>){
        chomp;
        $c++;
        if($c == 1){
            $processed++;
            #        print STDERR "$processed reads processed\r";
            @line = split();
            $id = $line[0];
            $id =~ s/\@//;
        }elsif($c == 2){
            $seq = $_;
        }elsif($c == 4){
            $c = 0;
            #print TGT ">$id\n$seq\n";
           # print TGT ">seq_$processed\n$seq\n";
             $read_hash{$seq}{$id} = 1;
            $id ='';
            @line =();
            $seq ='';
        }else{}
    }

    ## write TGT
    my $count = 0;
    for my $seq_out (sort  keys %read_hash){
    	$count++;
    	my @hits = keys %{$read_hash{$seq_out}};
    	my $num = @hits+0;
    	my $id_out = "R".$count.'_'.$num;
    	print TGT ">$id_out\n", $seq_out ,"\n";
    }

    close IN;
    close TGT;
    return $processed;
}


sub check_read_num{
	     my $file_in = shift @_;
	     # check read number

        open FAS_in,"$file_in" || die "can not read fasta file\n";
        my $seq_num = 0;
         while (<FAS_in>){
	         	if( m/^\>/){
	         		 $seq_num ++;
	         	}
	         	if( $seq_num>1){
	         		 last;
	         	}
         }

         return $seq_num;

}


sub check_file_type {
	 my $file_in = shift @_;
	 my $out = 'no';

	 open FAS_in, "$file_in" || die "can not read fasta file\n";

     while  (my $line = <FAS_in>){
     	 if($line =~ m/\w/){
     	 	if($line =~ m/^\>/){
     	 		$out = 'fa';
     	 		last;
     	 	}
     	 	if($line =~ m/^\@/){
     	 		$out = 'fq';
     	 		last;
     	 	}
     	 }
     }

     return $out;
}


sub rev_com {
   my $seq = shift @_;
   $seq = reverse($seq);
   $seq =~ tr/ACGT/TGCA/;
   return($seq);
}
author	lxue
date	Tue, 09 Jun 2015 12:44:58 -0400
parents	a9c5e846dd76
children