| 
0
 | 
     1 package align;
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 use strict;
 | 
| 
 | 
     4 use warnings;
 | 
| 
 | 
     5 use File::Basename;
 | 
| 
 | 
     6 use String::Random;
 | 
| 
 | 
     7 
 | 
| 
 | 
     8 use FindBin;
 | 
| 
 | 
     9 use lib $FindBin::Bin;
 | 
| 
 | 
    10 use Rcall qw ( histogram );
 | 
| 
 | 
    11 
 | 
| 
 | 
    12 use Exporter;
 | 
| 
 | 
    13 our @ISA     = qw( Exporter );
 | 
| 
 | 
    14 our @EXPORT  = qw( &BWA_call &to_build &get_unique &sam_sorted_bam &get_hash_alignment &sam_to_bam_bg &sam_count &sam_count_mis &rpms_rpkm &get_fastq_seq &extract_sam );
 | 
| 
 | 
    15 
 | 
| 
 | 
    16 sub to_build
 | 
| 
 | 
    17 {
 | 
| 
24
 | 
    18   my ( $toBuildTabP, $log, $newdir ) = @_;
 | 
| 
0
 | 
    19 
 | 
| 
24
 | 
    20   foreach my  $pairs ( @{ $toBuildTabP } ) 
 | 
| 
 | 
    21   {
 | 
| 
 | 
    22     if (  $pairs->[0] == 1 ) 
 | 
| 
 | 
    23     {   
 | 
| 
 | 
    24       my $sym = $newdir.basename(${$pairs->[1]}).'_symlink.fa';
 | 
| 
 | 
    25       symlink( ${$pairs->[1]}, $sym );
 | 
| 
 | 
    26       ${$pairs->[1]} = $sym;
 | 
| 
 | 
    27       build_index ( ${$pairs->[1]}, $log );
 | 
| 
 | 
    28     }   
 | 
| 
 | 
    29   }
 | 
| 
0
 | 
    30 }
 | 
| 
 | 
    31 
 | 
| 
 | 
    32 sub build_index
 | 
| 
 | 
    33 {
 | 
| 
 | 
    34 	my $to_index = shift;
 | 
| 
 | 
    35 	my $log = shift;
 | 
| 
 | 
    36 	my $index_log = $to_index.'_index.err';
 | 
| 
 | 
    37 	`bwa index $to_index 2> $index_log`;
 | 
| 
 | 
    38 	print $log "Creating index for $to_index\n";
 | 
| 
 | 
    39 }
 | 
| 
 | 
    40 
 | 
| 
 | 
    41 sub get_unique
 | 
| 
 | 
    42 {
 | 
| 
44
 | 
    43 	my ( $sam, $s_uni, $out_prefix, $col_prefix, $details, $report ) = @_;
 | 
| 
0
 | 
    44 
 | 
| 
52
 | 
    45 	my $fout = $col_prefix.'_all_mappers.fastq'; 
 | 
| 
 | 
    46 	my $funi = $col_prefix.'_unique_mappers.fastq';
 | 
| 
 | 
    47 	my $frej = $col_prefix.'_unmapped.fastq';
 | 
| 
0
 | 
    48 	
 | 
| 
46
 | 
    49   my $repartition = $out_prefix.'distribution.txt';
 | 
| 
 | 
    50 	my $png_rep = $out_prefix.'distribution.png';
 | 
| 
0
 | 
    51 	my ( %duplicates, %genome_hits) ;
 | 
| 
 | 
    52 
 | 
| 
 | 
    53 	#alignement to the first reference
 | 
| 
 | 
    54 	my @return = sam_parse( $sam, $fout, $funi, $frej, $s_uni, \%duplicates, \%genome_hits, $report );
 | 
| 
 | 
    55 	my $ref_fai = $return[4];
 | 
| 
 | 
    56 	my $mappers =  $return[5];
 | 
| 
 | 
    57 	my $mappers_uni = $return[6];
 | 
| 
 | 
    58 	my $size_mappedHashR = $return[7];
 | 
| 
 | 
    59 
 | 
| 
 | 
    60 	if ( $details == 1 )
 | 
| 
 | 
    61 	{
 | 
| 
 | 
    62 		#print number of duplicates and hits number
 | 
| 
 | 
    63 		my ($pourcentage, $total) =(0,0);
 | 
| 
 | 
    64 
 | 
| 
 | 
    65 		$total += $_ foreach values %{$size_mappedHashR};
 | 
| 
 | 
    66 		open (my $rep, '>'.$repartition) || die "cannot create $repartition $!\n";
 | 
| 
 | 
    67 		print $rep "size\tnumber\tpercentage\n";
 | 
| 
 | 
    68 		foreach my $k (sort{$a cmp $b} keys (%{$size_mappedHashR}))
 | 
| 
 | 
    69 		{
 | 
| 
 | 
    70 			$pourcentage = 0;
 | 
| 
 | 
    71 			$pourcentage = $size_mappedHashR->{$k} / $total * 100 unless $total ==0;
 | 
| 
 | 
    72 
 | 
| 
 | 
    73 			print $rep "$k\t$size_mappedHashR->{$k}\t";
 | 
| 
 | 
    74 			printf $rep "%.2f\n",$pourcentage;
 | 
| 
 | 
    75 		}
 | 
| 
 | 
    76 
 | 
| 
 | 
    77 		histogram($size_mappedHashR, $png_rep, $total);
 | 
| 
 | 
    78 
 | 
| 
 | 
    79 
 | 
| 
46
 | 
    80 		my $dup = $out_prefix.'dup_mapnum.txt';
 | 
| 
 | 
    81 		my $dup_u = $out_prefix .'dup_unique.txt';
 | 
| 
 | 
    82 		my $dup_r = $out_prefix .'dup_nonmapp.txt';
 | 
| 
0
 | 
    83 		open(my $tab,">".$dup) || die "cannot open output txt file\n";
 | 
| 
 | 
    84 		open(my $tab_r,">".$dup_r) || die "cannot open output txt file\n";
 | 
| 
 | 
    85 		open(my $tab_u,">".$dup_u) || die "cannot open output txt file\n";
 | 
| 
 | 
    86 		print $tab "sequence\tcount\tmapnum\n";
 | 
| 
 | 
    87 		print $tab_u "sequence\tcount\n";
 | 
| 
 | 
    88 		print $tab_r "sequence\tcount\n";
 | 
| 
 | 
    89 		foreach my $k (sort {$duplicates{$b} <=> $duplicates{$a}}keys %duplicates)
 | 
| 
 | 
    90 		{
 | 
| 
 | 
    91 			$duplicates{$k} = 0 unless exists($duplicates{$k});
 | 
| 
 | 
    92 			$genome_hits{$k} = 0 unless exists($genome_hits{$k});
 | 
| 
 | 
    93 			if ($genome_hits{$k} != 0) { print $tab $k."\t".$duplicates{$k}."\t".$genome_hits{$k}."\n"; }
 | 
| 
 | 
    94 			else {print $tab_r $k."\t".$duplicates{$k}."\n";}
 | 
| 
 | 
    95 			if ($genome_hits{$k} == 1) { print $tab_u $k."\t".$duplicates{$k}."\n"; }
 | 
| 
 | 
    96 		}
 | 
| 
 | 
    97 	close $dup; close $dup_r; close $dup_u;
 | 
| 
 | 
    98 	}
 | 
| 
 | 
    99 	return ( $ref_fai, $mappers, $mappers_uni );
 | 
| 
 | 
   100 }
 | 
| 
 | 
   101 
 | 
| 
 | 
   102 sub sam_parse
 | 
| 
 | 
   103 {
 | 
| 
 | 
   104 	my ( $sam, $fastq_accepted, $fastq_accepted_unique, $fastq_rejected, $sam_unique, $duplicate_hashR, $best_hit_number_hashR, $report ) = @_ ;
 | 
| 
 | 
   105 	my ($reads, $mappers, $mappersUnique, @garbage, %size_num, %size_num_spe, %number, %numberSens, %numberReverse, %unique_number, %numberNM, %numberM, %size);
 | 
| 
 | 
   106 	$mappers = $mappersUnique = $reads = 0;
 | 
| 
 | 
   107 
 | 
| 
 | 
   108 	open my $fic, '<', $sam || die "cannot open $sam $!\n";
 | 
| 
 | 
   109 	open my $accepted, '>', $fastq_accepted || die "cannot create $fastq_accepted $! \n";
 | 
| 
 | 
   110 	open my $unique, '>', $fastq_accepted_unique || die "cannot create $fastq_accepted_unique $! \n";
 | 
| 
 | 
   111 	open my $rejected, '>', $fastq_rejected || die "cannot create $fastq_rejected $! \n";
 | 
| 
 | 
   112 	open my $sam_uni, '>', $sam_unique || die "cannot create $sam_unique $! \n";
 | 
| 
 | 
   113 	my $sequence = '';
 | 
| 
 | 
   114 	while(<$fic>)
 | 
| 
 | 
   115 	{
 | 
| 
 | 
   116 		chomp $_;
 | 
| 
 | 
   117 		if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
 | 
| 
 | 
   118 		{
 | 
| 
 | 
   119 			if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
 | 
| 
 | 
   120 			{
 | 
| 
 | 
   121 				$size{$1} = $2;
 | 
| 
 | 
   122 				$unique_number{$1} = 0;
 | 
| 
 | 
   123 				$number{$1} = 0;
 | 
| 
 | 
   124 				$numberNM{$1} = 0;
 | 
| 
 | 
   125 				$numberM{$1} = 0;
 | 
| 
 | 
   126 			}
 | 
| 
 | 
   127 			print $sam_uni $_."\n";
 | 
| 
 | 
   128 			next;
 | 
| 
 | 
   129 		}
 | 
| 
 | 
   130 		$reads++;
 | 
| 
 | 
   131 		my @line = split (/\t/,$_);
 | 
| 
 | 
   132 		$sequence = $line[9];
 | 
| 
 | 
   133 		if ($line[1] & 16)
 | 
| 
 | 
   134 		{
 | 
| 
 | 
   135 			$sequence =reverse($sequence);
 | 
| 
 | 
   136 			$sequence =~ tr/atgcuATGCU/tacgaTACGA/;
 | 
| 
 | 
   137 		}
 | 
| 
 | 
   138 		if ($line[1] == 16 || $line[1] == 0)
 | 
| 
 | 
   139 		{
 | 
| 
 | 
   140 			my $len = length($sequence);
 | 
| 
 | 
   141 			$size_num{$len} ++;
 | 
| 
 | 
   142 			$size_num_spe{$line[2]}{$len}++;
 | 
| 
 | 
   143 			$mappers ++;
 | 
| 
 | 
   144 
 | 
| 
 | 
   145 			${$best_hit_number_hashR}{$sequence} = $1  if  ($line[13] =~ /X0:i:(\d*)/ ||  $line[14] =~/X0:i:(\d*)/ );
 | 
| 
 | 
   146 			${$duplicate_hashR}{$sequence}++;
 | 
| 
 | 
   147 			$number{$line[2]}++;
 | 
| 
 | 
   148 			$numberSens{$line[2]}++ if $line[1] == 0 ;
 | 
| 
 | 
   149 			$numberReverse{$line[2]}++ if $line[1] == 16 ;
 | 
| 
 | 
   150 			print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
 | 
| 
 | 
   151 
 | 
| 
 | 
   152 			if ($line[11] eq "XT:A:U")
 | 
| 
 | 
   153 			{
 | 
| 
 | 
   154 				$unique_number{$line[2]}++;
 | 
| 
 | 
   155 				$mappersUnique++;
 | 
| 
 | 
   156 				print $unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
 | 
| 
 | 
   157 				print $sam_uni $_."\n";
 | 
| 
 | 
   158 			}
 | 
| 
 | 
   159 			if ($_ =~ /.*XM:i:(\d+).*/)
 | 
| 
 | 
   160 			{
 | 
| 
 | 
   161 				if ($1 == 0){$numberNM{$line[2]}++;}else{$numberM{$line[2]}++;}
 | 
| 
 | 
   162 			}
 | 
| 
 | 
   163 		}
 | 
| 
 | 
   164 		else
 | 
| 
 | 
   165 		{
 | 
| 
 | 
   166 			${$best_hit_number_hashR}{$sequence} = 0;
 | 
| 
 | 
   167 			${$duplicate_hashR}{$sequence}++;
 | 
| 
 | 
   168 			print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
 | 
| 
 | 
   169 		}
 | 
| 
 | 
   170 	}
 | 
| 
 | 
   171 	close $fic; close $accepted; close $unique; close $rejected; close $sam_uni;
 | 
| 
 | 
   172 
 | 
| 
 | 
   173 	print $report "Parsing $sam file\n";
 | 
| 
 | 
   174   print $report "\treads: $reads\n";	
 | 
| 
 | 
   175 	print $report "\tmappers: $mappers\n";
 | 
| 
 | 
   176 	print $report "\tunique mappers: $mappersUnique\n";
 | 
| 
 | 
   177 	print $report "-----------------------------\n";
 | 
| 
 | 
   178 	return (\%number, \%unique_number, \%numberSens, \%numberReverse, \%size, $mappers, $mappersUnique, \%size_num, \%size_num_spe, \%numberNM, \%numberM );
 | 
| 
 | 
   179 }
 | 
| 
 | 
   180 
 | 
| 
 | 
   181 sub get_hash_alignment
 | 
| 
 | 
   182 {
 | 
| 
 | 
   183   my ($index, $mismatches, $accept, $reject, $outA, $outR, $fastq, $number_of_cpus, $name, $sam, $report, $fai_f) = @_ ;
 | 
| 
 | 
   184   my ($reads, $mappers, $unmapped) = (0,0,0);
 | 
| 
 | 
   185   my $accep_unique;
 | 
| 
 | 
   186   BWA_call ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report );
 | 
| 
 | 
   187   
 | 
| 
 | 
   188   open my $fic, '<', $sam || die "cannot open $sam $!\n";
 | 
| 
 | 
   189   open my $accepted, '>', $outA || die "cannot open $outA\n"  if $accept == 1;
 | 
| 
 | 
   190   open my $rejected, '>', $outR || die "cannot open $outR\n"  if $reject == 1; 
 | 
| 
 | 
   191   open my $fai, '>', $fai_f || die "cannot open $fai_f\n" if $fai_f;
 | 
| 
 | 
   192   #if ($name eq "snRNAs") {
 | 
| 
 | 
   193   #  open ( $accep_unique, ">".$1."-unique.fastq") if $outR =~ /(.*)\.fastq/;
 | 
| 
 | 
   194   #} 
 | 
| 
 | 
   195   my $sequence = '';
 | 
| 
 | 
   196   while(<$fic>)
 | 
| 
 | 
   197   {
 | 
| 
 | 
   198     chomp $_;
 | 
| 
 | 
   199     if( $_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
 | 
| 
 | 
   200     {
 | 
| 
 | 
   201     	if ($fai_f && $_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
 | 
| 
 | 
   202       {
 | 
| 
 | 
   203         print $fai $1."\t".$2."\n";
 | 
| 
 | 
   204       }
 | 
| 
 | 
   205       next;
 | 
| 
 | 
   206     }
 | 
| 
 | 
   207     $reads++;
 | 
| 
 | 
   208     my @line = split (/\t/,$_);
 | 
| 
 | 
   209     $sequence = $line[9];
 | 
| 
 | 
   210     if ($line[1] & 16)
 | 
| 
 | 
   211     {
 | 
| 
 | 
   212       $sequence =reverse($sequence);
 | 
| 
 | 
   213       $sequence =~ tr/atgcuATGCU/tacgaTACGA/;
 | 
| 
 | 
   214     }
 | 
| 
 | 
   215     if ($line[1] & 16 || $line[1] == 0)
 | 
| 
 | 
   216     {
 | 
| 
 | 
   217       $mappers ++;
 | 
| 
 | 
   218       if ($accept == 1 )
 | 
| 
 | 
   219       {
 | 
| 
 | 
   220         print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
 | 
| 
 | 
   221    #     print $accep_unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if ($name eq "snRNAs" && $line[11] eq "XT:A:U"); 
 | 
| 
 | 
   222       }
 | 
| 
 | 
   223     }
 | 
| 
 | 
   224     else
 | 
| 
 | 
   225     {
 | 
| 
 | 
   226       print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if $reject == 1;
 | 
| 
 | 
   227       $unmapped++;
 | 
| 
 | 
   228     }
 | 
| 
 | 
   229   }
 | 
| 
 | 
   230  # close $accep_unique if ($name eq "bonafide_reads");
 | 
| 
 | 
   231   close $fic;
 | 
| 
 | 
   232   close $accepted  if $accept == 1;
 | 
| 
 | 
   233   close $rejected if $reject ==1;
 | 
| 
 | 
   234   close $fai if $fai_f;
 | 
| 
 | 
   235   print $report "\treads: $reads\n";
 | 
| 
 | 
   236   print $report "\tmappers: $mappers\n";
 | 
| 
 | 
   237   print $report "\tunmapped: $unmapped\n";
 | 
| 
 | 
   238   print $report "-----------------------------\n";
 | 
| 
 | 
   239   return ($mappers, $unmapped);
 | 
| 
 | 
   240 }
 | 
| 
 | 
   241 
 | 
| 
 | 
   242 sub sam_count
 | 
| 
 | 
   243 {
 | 
| 
 | 
   244   my $sam = shift;
 | 
| 
 | 
   245   my ( %number, %size );
 | 
| 
 | 
   246   
 | 
| 
 | 
   247   open  my $fic, '<', $sam || die "cannot open $sam file $!\n"; 
 | 
| 
 | 
   248   while(<$fic>)
 | 
| 
 | 
   249   {
 | 
| 
 | 
   250     chomp $_;
 | 
| 
 | 
   251     if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
 | 
| 
 | 
   252     {
 | 
| 
 | 
   253       if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
 | 
| 
 | 
   254       {
 | 
| 
 | 
   255         $size{$1} = $2;
 | 
| 
 | 
   256         $number{$1} = 0;
 | 
| 
 | 
   257       }      
 | 
| 
 | 
   258     }
 | 
| 
 | 
   259     else
 | 
| 
 | 
   260     {
 | 
| 
 | 
   261   		my @line = split (/\t/,$_);
 | 
| 
 | 
   262  	  	if ( $line[1]  & 16 || $line[1] == 0 )
 | 
| 
 | 
   263  	  	{
 | 
| 
 | 
   264  	  		$number{$line[2]}++;
 | 
| 
 | 
   265  	  	}
 | 
| 
 | 
   266  	 	}
 | 
| 
 | 
   267 	}
 | 
| 
 | 
   268   close $fic;
 | 
| 
 | 
   269   return ( \%number, \%size );
 | 
| 
 | 
   270 }
 | 
| 
 | 
   271 
 | 
| 
 | 
   272 sub sam_count_mis 
 | 
| 
 | 
   273 {
 | 
| 
 | 
   274 
 | 
| 
 | 
   275 	my $sam = shift;
 | 
| 
 | 
   276 	my ( %number, %numberNM, %numberM, %size);
 | 
| 
 | 
   277 	
 | 
| 
 | 
   278 	open  my $fic, '<', $sam || die "cannot open $sam file $!\n";
 | 
| 
 | 
   279   while(<$fic>)
 | 
| 
 | 
   280   {
 | 
| 
 | 
   281 		chomp $_;
 | 
| 
 | 
   282     if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
 | 
| 
 | 
   283     {
 | 
| 
 | 
   284       if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
 | 
| 
 | 
   285       {
 | 
| 
 | 
   286 				$size{$1} = $2;
 | 
| 
 | 
   287 				$number{$1} = 0;
 | 
| 
 | 
   288 				$numberNM{$1} = 0;
 | 
| 
 | 
   289 				$numberM{$1} = 0;
 | 
| 
 | 
   290 			}
 | 
| 
 | 
   291 		}
 | 
| 
 | 
   292 		else
 | 
| 
 | 
   293 		{
 | 
| 
 | 
   294 			my @line = split (/\t/,$_);
 | 
| 
 | 
   295 			if ( $line[1]  & 16 || $line[1] == 0 )
 | 
| 
 | 
   296 			{
 | 
| 
 | 
   297 				$number{ $line[2] }++;
 | 
| 
 | 
   298     	 	if ($_ =~ /.*XM:i:(\d+).*/)
 | 
| 
 | 
   299     	 	{
 | 
| 
 | 
   300         	if ( $1 == 0 ){ $numberNM{$line[2]}++; } else { $numberM{$line[2]}++; } 
 | 
| 
 | 
   301       	}
 | 
| 
 | 
   302 			}
 | 
| 
 | 
   303 		}
 | 
| 
 | 
   304 	}
 | 
| 
 | 
   305 	return (\%number, \%size, \%numberNM, \%numberM );
 | 
| 
 | 
   306 }
 | 
| 
 | 
   307 
 | 
| 
 | 
   308 sub sam_to_bam_bg
 | 
| 
 | 
   309 {
 | 
| 
 | 
   310 	my ( $sam, $scale, $number_of_cpus ) = @_;
 | 
| 
19
 | 
   311 	my ( $bam_sorted, $bedgraphM, $bedgraphP, $view_err, $sort_err ) = ( '', '', '', '', '' );
 | 
| 
0
 | 
   312 	if ( $sam =~ /(.*?).sam$/ )
 | 
| 
 | 
   313 	{
 | 
| 
 | 
   314 		$bam_sorted = $1.'_sorted.bam';
 | 
| 
 | 
   315 		$bedgraphP= $1.'_plus.bedgraph';
 | 
| 
 | 
   316 		$bedgraphM = $1.'_minus.bedgraph';
 | 
| 
19
 | 
   317 		$view_err = $1.'_view.err';
 | 
| 
 | 
   318 		$sort_err = $1.'_sort.err';
 | 
| 
0
 | 
   319 	}	
 | 
| 
19
 | 
   320 	`samtools view -Shb   --threads $number_of_cpus $sam 2> $view_err | samtools sort  -O BAM --threads $number_of_cpus  /dev/stdin 2> $sort_err  > $bam_sorted`;
 | 
| 
10
 | 
   321 	`bedtools genomecov -scale $scale -strand + -bga -ibam $bam_sorted > $bedgraphP`;	
 | 
| 
 | 
   322 	`bedtools genomecov -scale $scale -strand - -bga -ibam $bam_sorted > $bedgraphM`;
 | 
| 
0
 | 
   323 }
 | 
| 
 | 
   324 
 | 
| 
 | 
   325 sub sam_sorted_bam
 | 
| 
 | 
   326 {
 | 
| 
 | 
   327 	my ( $sam, $number_of_cpus ) = @_;
 | 
| 
19
 | 
   328 	my ( $bam_sorted, $view_err, $sort_err ) = ( '', '', '' );
 | 
| 
0
 | 
   329 	if ( $sam =~ /(.*?).sam$/ )
 | 
| 
 | 
   330 	{
 | 
| 
 | 
   331 		$bam_sorted = $1.'_sorted.bam';
 | 
| 
19
 | 
   332 		$view_err = $1.'_view.err';
 | 
| 
 | 
   333 		$sort_err = $1.'_sort.err';
 | 
| 
 | 
   334 
 | 
| 
0
 | 
   335 	}
 | 
| 
19
 | 
   336 	`samtools view -Shb   --threads $number_of_cpus $sam 2> $view_err | samtools sort  -O BAM --threads $number_of_cpus  /dev/stdin  2> $sort_err  > $bam_sorted`;
 | 
| 
0
 | 
   337 }
 | 
| 
 | 
   338 
 | 
| 
 | 
   339 sub BWA_call
 | 
| 
 | 
   340 {
 | 
| 
 | 
   341 	my ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ) = @_;
 | 
| 
 | 
   342 	my ( $aln_err, $samse_err, $seq_num ) = ( $sam.'_aln.err', $sam.'_samse.err', 0 );
 | 
| 
 | 
   343 	print $report "-----------------------------\n";
 | 
| 
 | 
   344 	print $report "bwa aln -t $number_of_cpus -n $mismatches $index $fastq 2> $aln_err | bwa samse $index /dev/stdin $fastq 2> $samse_err > $sam\n";
 | 
| 
 | 
   345 	`bwa aln -t $number_of_cpus -n $mismatches $index $fastq 2> $aln_err | bwa samse $index /dev/stdin $fastq 2> $samse_err > $sam `;
 | 
| 
 | 
   346 }
 | 
| 
 | 
   347 
 | 
| 
 | 
   348 sub rpms_rpkm
 | 
| 
 | 
   349 {
 | 
| 
 | 
   350   my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_;
 | 
| 
 | 
   351   open(my $out, ">".$out_file) || die "cannot open normalized file $! \n";
 | 
| 
 | 
   352   print $out "ID\treads counts\tRPKM";
 | 
| 
 | 
   353   print $out "\tper million of piRNAs" if ($piRNA_number != 0);
 | 
| 
 | 
   354   print $out "\tper million of miRNAs" if ($miRNA_number != 0);
 | 
| 
 | 
   355   print $out "\tper million of bonafide reads" if ($bonafide_number != 0);
 | 
| 
 | 
   356   print $out "\n";
 | 
| 
 | 
   357   foreach my $k  ( sort keys %{$counthashR} )
 | 
| 
 | 
   358   {
 | 
| 
 | 
   359     my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0);
 | 
| 
 | 
   360     
 | 
| 
 | 
   361     $rpkm = ( $counthashR->{$k} * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ;
 | 
| 
 | 
   362     print $out $k."\t".$counthashR->{$k}."\t"; printf $out "%.2f",$rpkm;
 | 
| 
 | 
   363     
 | 
| 
 | 
   364     if ($piRNA_number != 0 )
 | 
| 
 | 
   365     {
 | 
| 
 | 
   366       $pirna = ( $counthashR->{$k}  * 1000000) / $piRNA_number;
 | 
| 
 | 
   367       printf $out "\t%.2f",$pirna;
 | 
| 
 | 
   368     }
 | 
| 
 | 
   369     if ($miRNA_number != 0 )
 | 
| 
 | 
   370     {
 | 
| 
 | 
   371       $mirna = ( $counthashR->{$k}  * 1000000) / $miRNA_number;
 | 
| 
 | 
   372       printf $out "\t%.2f",$mirna;
 | 
| 
 | 
   373     }
 | 
| 
 | 
   374     if ($bonafide_number != 0 )
 | 
| 
 | 
   375     {
 | 
| 
 | 
   376       $bonafide = ( $counthashR->{$k}  * 1000000) / $bonafide_number;
 | 
| 
 | 
   377       printf $out "\t%.2f",$bonafide;
 | 
| 
 | 
   378     }
 | 
| 
 | 
   379     print $out "\n";
 | 
| 
 | 
   380  	}
 | 
| 
 | 
   381   close $out;
 | 
| 
 | 
   382 }
 | 
| 
 | 
   383 
 | 
| 
 | 
   384 sub extract_sam
 | 
| 
 | 
   385 {
 | 
| 
 | 
   386   my ( $hashRef, $sam_in, $sam_out, $sam_uni_out, $fastq_out, $fastq_uni_out ) = @_;
 | 
| 
 | 
   387   
 | 
| 
 | 
   388 	open my $s_in, '<', $sam_in || die "cannot open $sam_in file $!\n";
 | 
| 
 | 
   389 
 | 
| 
 | 
   390   open my $f_out, '>', $fastq_out || die "cannot create $fastq_out $!\n"; 
 | 
| 
 | 
   391   open my $f_uni_out, '>', $fastq_uni_out || die "cannot create $fastq_uni_out $!\n"; 
 | 
| 
 | 
   392   
 | 
| 
 | 
   393   open my $s_out, '>', $sam_out || die "cannot create $sam_out file $!\n" if defined ($hashRef);
 | 
| 
 | 
   394   open my $s_uni_out, '>', $sam_uni_out || die "cannot create $sam_uni_out file $!\n";
 | 
| 
 | 
   395 
 | 
| 
 | 
   396   my $sequence = '';
 | 
| 
 | 
   397   while(<$s_in>)
 | 
| 
 | 
   398   {
 | 
| 
 | 
   399     if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
 | 
| 
 | 
   400     {
 | 
| 
 | 
   401       print $s_out $_ if defined ($hashRef);
 | 
| 
 | 
   402       print $s_uni_out $_;
 | 
| 
 | 
   403       next;
 | 
| 
 | 
   404     }
 | 
| 
 | 
   405     my @line = split (/\t/,$_);
 | 
| 
 | 
   406     $sequence = $line[0];
 | 
| 
 | 
   407     if ( (! defined ($hashRef) )|| (  exists $hashRef->{$sequence}  &&  $hashRef->{$sequence} == 1 ) )
 | 
| 
 | 
   408     {
 | 
| 
 | 
   409       my $arn  =  $line[9];
 | 
| 
 | 
   410       if ($line[1] & 16)
 | 
| 
 | 
   411       {
 | 
| 
 | 
   412         $arn =reverse($arn);
 | 
| 
 | 
   413         $arn =~ tr/atgcuATGCU/tacgaTACGA/;
 | 
| 
 | 
   414       }
 | 
| 
19
 | 
   415 
 | 
| 
0
 | 
   416       if  ( ( $line[1] == 16 || $line[1] == 0 ) )
 | 
| 
 | 
   417 			{
 | 
| 
 | 
   418       	print $f_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ;
 | 
| 
 | 
   419       	print $s_out $_ if defined ($hashRef);
 | 
| 
 | 
   420       	if ( $line[11] eq "XT:A:U" )
 | 
| 
 | 
   421       	{
 | 
| 
 | 
   422       		print $f_uni_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ;
 | 
| 
 | 
   423       		print $s_uni_out $_ ;
 | 
| 
 | 
   424       	}
 | 
| 
 | 
   425       }
 | 
| 
 | 
   426     } 
 | 
| 
 | 
   427   }
 | 
| 
 | 
   428   close $s_in; close $s_out if defined ($hashRef); 
 | 
| 
 | 
   429   close $s_uni_out; close $f_out; close $f_uni_out;
 | 
| 
 | 
   430 }
 | 
| 
 | 
   431 
 | 
| 
 | 
   432 sub get_fastq_seq{
 | 
| 
 | 
   433   my $fastq = shift;
 | 
| 
 | 
   434   my %hash; my $cmp = 0; 
 | 
| 
 | 
   435 
 | 
| 
 | 
   436   open my $fic, '<', $fastq || die "cannot open input file $! \n";
 | 
| 
 | 
   437   while(<$fic>)
 | 
| 
 | 
   438   {
 | 
| 
 | 
   439     chomp $_;
 | 
| 
 | 
   440     $cmp++;
 | 
| 
 | 
   441     if ($cmp % 4 == 1)
 | 
| 
 | 
   442     {
 | 
| 
 | 
   443       die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ );
 | 
| 
 | 
   444       if ($_ =~ /^\@(.*)\s.*/) { $hash{$1} = 1;}
 | 
| 
 | 
   445       elsif ($_ =~ /^\@(.*)/) { $hash{$1} = 1;}
 | 
| 
 | 
   446     }
 | 
| 
 | 
   447     elsif ($cmp % 4 == 3 )
 | 
| 
 | 
   448     {
 | 
| 
 | 
   449       die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/;
 | 
| 
 | 
   450     }
 | 
| 
 | 
   451   }
 | 
| 
 | 
   452   close $fic;
 | 
| 
 | 
   453   return \%hash;
 | 
| 
 | 
   454 }
 | 
| 
 | 
   455 
 | 
| 
 | 
   456 1;
 |