srnapipe: bin/align.pm comparison

comparison bin/align.pm @ 61:9185ca0a7b43 draft

Updated package according to recommendations.

author	pierre.pouchin
date	Wed, 16 Jan 2019 08:18:13 -0500
parents	9645d995fb3c
children

comparison

equal deleted inserted replaced

-:9645d995fb3c
+:9185ca0a7b43
 use FindBin;
 use lib $FindBin::Bin;
 use Rcall qw ( histogram );
 use Exporter;
-our @ISA     = qw( Exporter );
+our @ISA         = qw( Exporter );
-our @EXPORT  = qw( &rpms_rpkm_te &BWA_call &to_build &get_unique &sam_sorted_bam &get_hash_alignment &sam_to_bam_bg &sam_count &sam_count_mis &rpms_rpkm &get_fastq_seq &extract_sam );
+our @EXPORT    = qw( &rpms_rpkm_te &BWA_call &to_build &get_unique &sam_sorted_bam &get_hash_alignment &sam_to_bam_bg &sam_count &sam_count_mis &rpms_rpkm &get_fastq_seq &extract_sam );
 sub to_build
 {
 my ( $toBuildTabP, $log, $newdir ) = @_;
-foreach my  $pairs ( @{ $toBuildTabP } )
+foreach my    $pairs ( @{ $toBuildTabP } )
 {
-if (  $pairs->[0] == 1 )
+if (    $pairs->[0] == 1 )
 {
 my $sym = $newdir.basename(${$pairs->[1]}).'_symlink.fa';
 symlink( ${$pairs->[1]}, $sym );
 ${$pairs->[1]} = $sym;
 build_index ( ${$pairs->[1]}, $log );
 }
 }
 }
 sub build_index
 {
-	my $to_index = shift;
+my $to_index = shift;
-	my $log = shift;
+my $log = shift;
-	my $index_log = $to_index.'_index.err';
+my $index_log = $to_index.'_index.err';
-	`bwa index '$to_index' 2> '$index_log'`;
+`bwa index '$to_index' 2> '$index_log'`;
-	print $log "Creating index for $to_index\n";
+print $log "Creating index for $to_index\n";
 }
 sub get_unique
 {
-	my ( $sam, $s_uni, $out_prefix, $col_prefix, $details, $report ) = @_;
+my ( $sam, $s_uni, $out_prefix, $col_prefix, $details, $report ) = @_;
-	my $fout = $col_prefix.'_all_mappers.fastq';
+my $fout = $col_prefix.'_all_mappers.fastq';
-	my $funi = $col_prefix.'_unique_mappers.fastq';
+my $funi = $col_prefix.'_unique_mappers.fastq';
-	my $frej = $col_prefix.'_unmapped.fastq';
+my $frej = $col_prefix.'_unmapped.fastq';
 my $repartition = $out_prefix.'distribution.txt';
-	my $png_rep = $out_prefix.'distribution.png';
+my $png_rep = $out_prefix.'distribution.png';
-	my ( %duplicates, %genome_hits) ;
+my ( %duplicates, %genome_hits) ;
-	#alignement to the first reference
+#alignement to the first reference
-	my @return = sam_parse( $sam, $fout, $funi, $frej, $s_uni, \%duplicates, \%genome_hits, $report );
+my @return = sam_parse( $sam, $fout, $funi, $frej, $s_uni, \%duplicates, \%genome_hits, $report );
-	my $ref_fai = $return[4];
+my $ref_fai = $return[4];
-	my $mappers =  $return[5];
+my $mappers =    $return[5];
-	my $mappers_uni = $return[6];
+my $mappers_uni = $return[6];
-	my $size_mappedHashR = $return[7];
+my $size_mappedHashR = $return[7];
-	if ( $details == 1 )
+if ( $details == 1 )
-	{
+{
-		#print number of duplicates and hits number
+#print number of duplicates and hits number
-		my ($pourcentage, $total) =(0,0);
+my ($pourcentage, $total) =(0,0);
-		$total += $_ foreach values %{$size_mappedHashR};
+$total += $_ foreach values %{$size_mappedHashR};
-		open (my $rep, '>'.$repartition) || die "cannot create $repartition $!\n";
+open (my $rep, '>'.$repartition) || die "cannot create $repartition $!\n";
-		print $rep "size\tnumber\tpercentage\n";
+print $rep "size\tnumber\tpercentage\n";
-		foreach my $k (sort{$a cmp $b} keys (%{$size_mappedHashR}))
+foreach my $k (sort{$a cmp $b} keys (%{$size_mappedHashR}))
-		{
+{
-			$pourcentage = 0;
+$pourcentage = 0;
-			$pourcentage = $size_mappedHashR->{$k} / $total * 100 unless $total ==0;
+$pourcentage = $size_mappedHashR->{$k} / $total * 100 unless $total ==0;
-			print $rep "$k\t$size_mappedHashR->{$k}\t";
+print $rep "$k\t$size_mappedHashR->{$k}\t";
-			printf $rep "%.2f\n",$pourcentage;
+printf $rep "%.2f\n",$pourcentage;
-		}
+}
-		histogram($size_mappedHashR, $png_rep, $total);
+histogram($size_mappedHashR, $png_rep, $total);
-		my $dup = $out_prefix.'dup_mapnum.txt';
+my $dup = $out_prefix.'dup_mapnum.txt';
-		my $dup_u = $out_prefix .'dup_unique.txt';
+my $dup_u = $out_prefix .'dup_unique.txt';
-		my $dup_r = $out_prefix .'dup_nonmapp.txt';
+my $dup_r = $out_prefix .'dup_nonmapp.txt';
-		open(my $tab,">".$dup) || die "cannot open output txt file\n";
+open(my $tab,">".$dup) || die "cannot open output txt file\n";
-		open(my $tab_r,">".$dup_r) || die "cannot open output txt file\n";
+open(my $tab_r,">".$dup_r) || die "cannot open output txt file\n";
-		open(my $tab_u,">".$dup_u) || die "cannot open output txt file\n";
+open(my $tab_u,">".$dup_u) || die "cannot open output txt file\n";
-		print $tab "sequence\tcount\tmapnum\n";
+print $tab "sequence\tcount\tmapnum\n";
-		print $tab_u "sequence\tcount\n";
+print $tab_u "sequence\tcount\n";
-		print $tab_r "sequence\tcount\n";
+print $tab_r "sequence\tcount\n";
-		foreach my $k (sort {$duplicates{$b} <=> $duplicates{$a}}keys %duplicates)
+foreach my $k (sort {$duplicates{$b} <=> $duplicates{$a}}keys %duplicates)
-		{
+{
-			$duplicates{$k} = 0 unless exists($duplicates{$k});
+$duplicates{$k} = 0 unless exists($duplicates{$k});
-			$genome_hits{$k} = 0 unless exists($genome_hits{$k});
+$genome_hits{$k} = 0 unless exists($genome_hits{$k});
-			if ($genome_hits{$k} != 0) { print $tab $k."\t".$duplicates{$k}."\t".$genome_hits{$k}."\n"; }
+if ($genome_hits{$k} != 0) { print $tab $k."\t".$duplicates{$k}."\t".$genome_hits{$k}."\n"; }
-			else {print $tab_r $k."\t".$duplicates{$k}."\n";}
+else {print $tab_r $k."\t".$duplicates{$k}."\n";}
-			if ($genome_hits{$k} == 1) { print $tab_u $k."\t".$duplicates{$k}."\n"; }
+if ($genome_hits{$k} == 1) { print $tab_u $k."\t".$duplicates{$k}."\n"; }
-		}
+}
-	close $dup; close $dup_r; close $dup_u;
+close $dup; close $dup_r; close $dup_u;
-	}
+}
-	return ( $ref_fai, $mappers, $mappers_uni );
+return ( $ref_fai, $mappers, $mappers_uni );
 }
 sub sam_parse
 {
-	my ( $sam, $fastq_accepted, $fastq_accepted_unique, $fastq_rejected, $sam_unique, $duplicate_hashR, $best_hit_number_hashR, $report ) = @_ ;
+my ( $sam, $fastq_accepted, $fastq_accepted_unique, $fastq_rejected, $sam_unique, $duplicate_hashR, $best_hit_number_hashR, $report ) = @_ ;
-	my ($reads, $mappers, $mappersUnique, @garbage, %size_num, %size_num_spe, %number, %numberSens, %numberReverse, %unique_number, %numberNM, %numberM, %size);
+my ($reads, $mappers, $mappersUnique, @garbage, %size_num, %size_num_spe, %number, %numberSens, %numberReverse, %unique_number, %numberNM, %numberM, %size);
-	$mappers = $mappersUnique = $reads = 0;
+$mappers = $mappersUnique = $reads = 0;
-	open my $fic, '<', $sam || die "cannot open $sam $!\n";
+open my $fic, '<', $sam || die "cannot open $sam $!\n";
-	open my $accepted, '>', $fastq_accepted || die "cannot create $fastq_accepted $! \n";
+open my $accepted, '>', $fastq_accepted || die "cannot create $fastq_accepted $! \n";
-	open my $unique, '>', $fastq_accepted_unique || die "cannot create $fastq_accepted_unique $! \n";
+open my $unique, '>', $fastq_accepted_unique || die "cannot create $fastq_accepted_unique $! \n";
-	open my $rejected, '>', $fastq_rejected || die "cannot create $fastq_rejected $! \n";
+open my $rejected, '>', $fastq_rejected || die "cannot create $fastq_rejected $! \n";
-	open my $sam_uni, '>', $sam_unique || die "cannot create $sam_unique $! \n";
+open my $sam_uni, '>', $sam_unique || die "cannot create $sam_unique $! \n";
-	my $sequence = '';
+my $sequence = '';
-	while(<$fic>)
+while(<$fic>)
-	{
+{
-		chomp $_;
+chomp $_;
-		if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
+if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
-		{
+{
-			if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
+if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
-			{
+{
-				$size{$1} = $2;
+$size{$1} = $2;
-				$unique_number{$1} = 0;
+$unique_number{$1} = 0;
-				$number{$1} = 0;
+$number{$1} = 0;
-				$numberNM{$1} = 0;
+$numberNM{$1} = 0;
-				$numberM{$1} = 0;
+$numberM{$1} = 0;
-			}
+}
-			print $sam_uni $_."\n";
+print $sam_uni $_."\n";
-			next;
+next;
-		}
+}
-		$reads++;
+$reads++;
-		my @line = split (/\t/,$_);
+my @line = split (/\t/,$_);
-		$sequence = $line[9];
+$sequence = $line[9];
-		if ($line[1] & 16)
+if ($line[1] & 16)
-		{
+{
-			$sequence =reverse($sequence);
+$sequence =reverse($sequence);
-			$sequence =~ tr/atgcuATGCU/tacgaTACGA/;
+$sequence =~ tr/atgcuATGCU/tacgaTACGA/;
-		}
+}
-		if ($line[1] == 16 || $line[1] == 0)
+if ($line[1] == 16 || $line[1] == 0)
-		{
+{
-			my $len = length($sequence);
+my $len = length($sequence);
-			$size_num{$len} ++;
+$size_num{$len} ++;
-			$size_num_spe{$line[2]}{$len}++;
+$size_num_spe{$line[2]}{$len}++;
-			$mappers ++;
+$mappers ++;
-			${$best_hit_number_hashR}{$sequence} = $1  if  ($line[13] =~ /X0:i:(\d*)/ ||  $line[14] =~/X0:i:(\d*)/ );
+${$best_hit_number_hashR}{$sequence} = $1    if    ($line[13] =~ /X0:i:(\d*)/ ||    $line[14] =~/X0:i:(\d*)/ );
-			${$duplicate_hashR}{$sequence}++;
+${$duplicate_hashR}{$sequence}++;
-			$number{$line[2]}++;
+$number{$line[2]}++;
-			$numberSens{$line[2]}++ if $line[1] == 0 ;
+$numberSens{$line[2]}++ if $line[1] == 0 ;
-			$numberReverse{$line[2]}++ if $line[1] == 16 ;
+$numberReverse{$line[2]}++ if $line[1] == 16 ;
-			print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
+print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
-			if ($line[11] eq "XT:A:U")
+if ($line[11] eq "XT:A:U")
-			{
+{
-				$unique_number{$line[2]}++;
+$unique_number{$line[2]}++;
-				$mappersUnique++;
+$mappersUnique++;
-				print $unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
+print $unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
-				print $sam_uni $_."\n";
+print $sam_uni $_."\n";
-			}
+}
-			if ($_ =~ /.*XM:i:(\d+).*/)
+if ($_ =~ /.*XM:i:(\d+).*/)
-			{
+{
-				if ($1 == 0){$numberNM{$line[2]}++;}else{$numberM{$line[2]}++;}
+if ($1 == 0){$numberNM{$line[2]}++;}else{$numberM{$line[2]}++;}
-			}
+}
-		}
+}
-		else
+else
-		{
+{
-			${$best_hit_number_hashR}{$sequence} = 0;
+${$best_hit_number_hashR}{$sequence} = 0;
-			${$duplicate_hashR}{$sequence}++;
+${$duplicate_hashR}{$sequence}++;
-			print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
+print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
-		}
+}
-	}
+}
-	close $fic; close $accepted; close $unique; close $rejected; close $sam_uni;
+close $fic; close $accepted; close $unique; close $rejected; close $sam_uni;
-	print $report "Parsing $sam file\n";
+print $report "Parsing $sam file\n";
 print $report "\treads: $reads\n";
-	print $report "\tmappers: $mappers\n";
+print $report "\tmappers: $mappers\n";
-	print $report "\tunique mappers: $mappersUnique\n";
+print $report "\tunique mappers: $mappersUnique\n";
-	print $report "-----------------------------\n";
+print $report "-----------------------------\n";
-	return (\%number, \%unique_number, \%numberSens, \%numberReverse, \%size, $mappers, $mappersUnique, \%size_num, \%size_num_spe, \%numberNM, \%numberM );
+return (\%number, \%unique_number, \%numberSens, \%numberReverse, \%size, $mappers, $mappersUnique, \%size_num, \%size_num_spe, \%numberNM, \%numberM );
 }
 sub get_hash_alignment
 {
 my ($index, $mismatches, $accept, $reject, $outA, $outR, $fastq, $number_of_cpus, $name, $sam, $report, $fai_f) = @_ ;
 my ($reads, $mappers, $unmapped) = (0,0,0);
 my $accep_unique;
 BWA_call ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report );
 open my $fic, '<', $sam || die "cannot open $sam $!\n";
-open my $accepted, '>', $outA || die "cannot open $outA\n"  if $accept == 1;
+open my $accepted, '>', $outA || die "cannot open $outA\n"    if $accept == 1;
-open my $rejected, '>', $outR || die "cannot open $outR\n"  if $reject == 1;
+open my $rejected, '>', $outR || die "cannot open $outR\n"    if $reject == 1;
 open my $fai, '>', $fai_f || die "cannot open $fai_f\n" if $fai_f;
 #if ($name eq "snRNAs") {
-#  open ( $accep_unique, ">".$1."-unique.fastq") if $outR =~ /(.*)\.fastq/;
+#    open ( $accep_unique, ">".$1."-unique.fastq") if $outR =~ /(.*)\.fastq/;
 #}
 my $sequence = '';
 while(<$fic>)
 {
 chomp $_;
 if( $_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
 {
-	if ($fai_f && $_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
+if ($fai_f && $_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
 {
 print $fai $1."\t".$2."\n";
 }
 next;
 }
 $reads++;
 my @line = split (/\t/,$_);
 $sequence = $line[9];
 if ($line[1] & 16)
 {
 $sequence =reverse($sequence);
 $sequence =~ tr/atgcuATGCU/tacgaTACGA/;
 }
 if ($line[1] & 16 || $line[1] == 0)
 {
 $mappers ++;
 if ($accept == 1 )
 {
 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
-#     print $accep_unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if ($name eq "snRNAs" && $line[11] eq "XT:A:U");
+#         print $accep_unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if ($name eq "snRNAs" && $line[11] eq "XT:A:U");
 }
 }
 else
 {
 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if $reject == 1;
 $unmapped++;
 }
 }
 # close $accep_unique if ($name eq "bonafide_reads");
 close $fic;
-close $accepted  if $accept == 1;
+close $accepted    if $accept == 1;
 close $rejected if $reject ==1;
 close $fai if $fai_f;
 print $report "\treads: $reads\n";
 print $report "\tmappers: $mappers\n";
 print $report "\tunmapped: $unmapped\n";
 print $report "-----------------------------\n";
 return ($mappers, $unmapped);
 }
 sub sam_count
 {
 my $sam = shift;
 my ( %number, %size );
-open  my $fic, '<', $sam || die "cannot open $sam file $!\n";
+open    my $fic, '<', $sam || die "cannot open $sam file $!\n";
 while(<$fic>)
 {
 chomp $_;
 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
 {
 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
 {
 $size{$1} = $2;
 $number{$1} = 0;
 }
 }
 else
 {
-		my @line = split (/\t/,$_);
+my @line = split (/\t/,$_);
-	  	if ( $line[1]  & 16 || $line[1] == 0 )
+if ( $line[1]    & 16 || $line[1] == 0 )
-	  	{
+{
-	  		$number{$line[2]}++;
+$number{$line[2]}++;
-	  	}
+}
-	 	}
-	}
-close $fic;
-return ( \%number, \%size );
-}
-sub sam_count_mis
-{
-	my $sam = shift;
-	my ( %number, %numberNM, %numberM, %size);
-	open  my $fic, '<', $sam || die "cannot open $sam file $!\n";
-while(<$fic>)
-{
-		chomp $_;
-if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
-{
-if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
-{
-				$size{$1} = $2;
-				$number{$1} = [0,0,0,0,0,0,0];
-				$numberNM{$1} = [0,0,0,0,0,0,0];
-				$numberM{$1} = [0,0,0,0,0,0,0];
-			}
-		}
-		else
-		{
-			my @line = split (/\t/,$_);
-my @seq = split //, $line[9];
-			if ( $line[1]  == 16 || $line[1] == 0 )
-			{
-				$number{ $line[2] }->[0]++;
-if ($line[1]  == 0)
-{
-$number{$line[2]}->[1]++;
-$number{$line[2]}->[3]++ if $seq[0] eq 'T';
-$number{$line[2]}->[5]++ if $seq[9] eq 'A';
-}
-else
-{
-$number{$line[2]}->[2]++;
-$number{$line[2]}->[4]++ if $seq[9] eq 'A';
-$number{$line[2]}->[6]++ if $seq[0] eq 'T';
-}
-	 	if ($_ =~ /.*XM:i:(\d+).*/)
-	 	{
-	if ( $1 == 0 )
-{
-$numberNM{$line[2]}->[0]++;
-if ($line[1]  == 0)
-{
-$numberNM{$line[2]}->[1]++;
-$numberNM{$line[2]}->[3]++ if $seq[0] eq 'T';
-$numberNM{$line[2]}->[5]++ if $seq[9] eq 'A';
-}
-else
-{
-$numberNM{$line[2]}->[2]++;
-$numberNM{$line[2]}->[4]++ if $seq[9] eq 'A';
-$numberNM{$line[2]}->[6]++ if $seq[0] eq 'T';
-}
 }
-else
+}
-{
+close $fic;
-$numberM{$line[2]}->[0]++;
+return ( \%number, \%size );
-if ($line[1]  == 0)
+}
-{
-$numberM{$line[2]}->[1]++;
+sub sam_count_mis
-$numberM{$line[2]}->[3]++ if $seq[0] eq 'T';
+{
-$numberM{$line[2]}->[5]++ if $seq[9] eq 'A';
-}
+my $sam = shift;
-else
+my ( %number, %numberNM, %numberM, %size);
-{
-$numberM{$line[2]}->[2]++;
+open    my $fic, '<', $sam || die "cannot open $sam file $!\n";
-$numberM{$line[2]}->[4]++ if $seq[9] eq 'A';
+while(<$fic>)
-$numberM{$line[2]}->[6]++ if $seq[0] eq 'T';
+{
-}
+chomp $_;
-}
+if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
-	}
+{
-			}
+if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
-		}
+{
-	}
+$size{$1} = $2;
-	return (\%number, \%size, \%numberNM, \%numberM );
+$number{$1} = [0,0,0,0,0,0,0];
+$numberNM{$1} = [0,0,0,0,0,0,0];
+$numberM{$1} = [0,0,0,0,0,0,0];
+}
+}
+else
+{
+my @line = split (/\t/,$_);
+my @seq = split //, $line[9];
+if ( $line[1]    == 16 || $line[1] == 0 )
+{
+$number{ $line[2] }->[0]++;
+if ($line[1]    == 0)
+{
+$number{$line[2]}->[1]++;
+$number{$line[2]}->[3]++ if $seq[0] eq 'T';
+$number{$line[2]}->[5]++ if $seq[9] eq 'A';
+}
+else
+{
+$number{$line[2]}->[2]++;
+$number{$line[2]}->[4]++ if $seq[9] eq 'A';
+$number{$line[2]}->[6]++ if $seq[0] eq 'T';
+}
+if ($_ =~ /.*XM:i:(\d+).*/)
+{
+if ( $1 == 0 )
+{
+$numberNM{$line[2]}->[0]++;
+if ($line[1]    == 0)
+{
+$numberNM{$line[2]}->[1]++;
+$numberNM{$line[2]}->[3]++ if $seq[0] eq 'T';
+$numberNM{$line[2]}->[5]++ if $seq[9] eq 'A';
+}
+else
+{
+$numberNM{$line[2]}->[2]++;
+$numberNM{$line[2]}->[4]++ if $seq[9] eq 'A';
+$numberNM{$line[2]}->[6]++ if $seq[0] eq 'T';
+}
+}
+else
+{
+$numberM{$line[2]}->[0]++;
+if ($line[1]    == 0)
+{
+$numberM{$line[2]}->[1]++;
+$numberM{$line[2]}->[3]++ if $seq[0] eq 'T';
+$numberM{$line[2]}->[5]++ if $seq[9] eq 'A';
+}
+else
+{
+$numberM{$line[2]}->[2]++;
+$numberM{$line[2]}->[4]++ if $seq[9] eq 'A';
+$numberM{$line[2]}->[6]++ if $seq[0] eq 'T';
+}
+}
+}
+}
+}
+}
+return (\%number, \%size, \%numberNM, \%numberM );
 }
 sub rpms_rpkm_te
 {
 my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_;
 open(my $out, ">".$out_file) || die "cannot open normalized file $! \n";
 print $out "ID\treads counts\tRPKM";
 print $out "\tper million of piRNAs" if ($piRNA_number != 0);
 print $out "\tper million of miRNAs" if ($miRNA_number != 0);
 print $out "\tper million of bonafide reads" if ($bonafide_number != 0);
 print $out "\tsense reads counts\treverse reads counts";
 print $out "\t% of sense 1U\t% of sense 10A\t% of reverse 1U\t% of reverse 10A\n";
-foreach my $k  ( sort keys %{$counthashR} )
+foreach my $k    ( sort keys %{$counthashR} )
 {
 my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0);
+$rpkm = ( $counthashR->{$k}->[0] * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ;
+print $out $k."\t".$counthashR->{$k}->[0]."\t"; printf $out "%.2f",$rpkm;
+if ($piRNA_number != 0 )
+{
+$pirna = ( $counthashR->{$k}->[0]    * 1000000) / $piRNA_number;
+printf $out "\t%.2f",$pirna;
+}
+if ($miRNA_number != 0 )
+{
+$mirna = ( $counthashR->{$k}->[0]    * 1000000) / $miRNA_number;
+printf $out "\t%.2f",$mirna;
+}
+if ($bonafide_number != 0 )
+{
+$bonafide = ( $counthashR->{$k}->[0]    * 1000000) / $bonafide_number;
+printf $out "\t%.2f",$bonafide;
+}
+print $out "\t".$counthashR->{$k}->[1]."\t".$counthashR->{$k}->[2] ;
+my $S1U = 0;
+$S1U = $counthashR->{$k}->[3] / $counthashR->{$k}->[1] * 100 if $counthashR->{$k}->[1] != 0;
+my $R1U = 0;
+$R1U = $counthashR->{$k}->[6] / $counthashR->{$k}->[2] * 100 if $counthashR->{$k}->[2] != 0;
+my $S10A = 0;
+$S10A = $counthashR->{$k}->[5] / $counthashR->{$k}->[1] * 100 if $counthashR->{$k}->[1] != 0;
+my $R10A = 0;
+$R10A = $counthashR->{$k}->[4] / $counthashR->{$k}->[2] * 100 if $counthashR->{$k}->[2] != 0;
+print $out "\t".$S1U."\t".$S10A."\t".$R1U."\t".$R10A;
+print $out "\n";
+}
+close $out;
+}
+sub sam_to_bam_bg
+{
+my ( $sam, $scale, $number_of_cpus ) = @_;
+my ( $bam_sorted, $bedgraphM, $bedgraphP, $view_err, $sort_err ) = ( '', '', '', '', '' );
+if ( $sam =~ /(.*?).sam$/ )
+{
+$bam_sorted = $1.'_sorted.bam';
+$bedgraphP= $1.'_plus.bedgraph';
+$bedgraphM = $1.'_minus.bedgraph';
+$view_err = $1.'_view.err';
+$sort_err = $1.'_sort.err';
+}
+`samtools view -Shb     --threads $number_of_cpus '$sam' 2> '$view_err' | samtools sort    -O BAM --threads $number_of_cpus    /dev/stdin 2> '$sort_err'    > '$bam_sorted'`;
+`bedtools genomecov -scale $scale -strand + -bga -ibam '$bam_sorted' > '$bedgraphP'`;
+`bedtools genomecov -scale $scale -strand - -bga -ibam '$bam_sorted' > '$bedgraphM'`;
+}
+sub sam_sorted_bam
+{
+my ( $sam, $number_of_cpus ) = @_;
+my ( $bam_sorted, $view_err, $sort_err ) = ( '', '', '' );
+if ( $sam =~ /(.*?).sam$/ )
+{
+$bam_sorted = $1.'_sorted.bam';
+$view_err = $1.'_view.err';
+$sort_err = $1.'_sort.err';
+}
+`samtools view -Shb     --threads $number_of_cpus '$sam' 2> '$view_err' | samtools sort    -O BAM --threads $number_of_cpus    /dev/stdin    2> '$sort_err'    > '$bam_sorted'`;
+}
+sub BWA_call
+{
+my ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ) = @_;
+my ( $aln_err, $samse_err, $seq_num ) = ( $sam.'_aln.err', $sam.'_samse.err', 0 );
+print $report "-----------------------------\n";
+print $report "bwa aln -t $number_of_cpus -n $mismatches '$index' '$fastq' 2> '$aln_err' | bwa samse $index /dev/stdin '$fastq' 2> '$samse_err' > '$sam'\n";
+`bwa aln -t $number_of_cpus -n $mismatches '$index' '$fastq' 2> '$aln_err' | bwa samse $index /dev/stdin '$fastq' 2> '$samse_err' > '$sam' `;
+}
+sub rpms_rpkm
+{
+my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_;
+open(my $out, ">".$out_file) || die "cannot open normalized file $! \n";
+print $out "ID\treads counts\tRPKM";
+print $out "\tper million of piRNAs" if ($piRNA_number != 0);
+print $out "\tper million of miRNAs" if ($miRNA_number != 0);
+print $out "\tper million of bonafide reads" if ($bonafide_number != 0);
+print $out "\n";
+foreach my $k    ( sort keys %{$counthashR} )
+{
+my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0);
+$rpkm = ( $counthashR->{$k} * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ;
+print $out $k."\t".$counthashR->{$k}."\t"; printf $out "%.2f",$rpkm;
+if ($piRNA_number != 0 )
+{
+$pirna = ( $counthashR->{$k}    * 1000000) / $piRNA_number;
+printf $out "\t%.2f",$pirna;
+}
+if ($miRNA_number != 0 )
+{
+$mirna = ( $counthashR->{$k}    * 1000000) / $miRNA_number;
+printf $out "\t%.2f",$mirna;
+}
+if ($bonafide_number != 0 )
+{
+$bonafide = ( $counthashR->{$k}    * 1000000) / $bonafide_number;
+printf $out "\t%.2f",$bonafide;
+}
+print $out "\n";
+}
+close $out;
+}
+sub extract_sam
+{
+my ( $hashRef, $sam_in, $sam_out, $sam_uni_out, $fastq_out, $fastq_uni_out ) = @_;
-$rpkm = ( $counthashR->{$k}->[0] * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ;
+open my $s_in, '<', $sam_in || die "cannot open $sam_in file $!\n";
-print $out $k."\t".$counthashR->{$k}->[0]."\t"; printf $out "%.2f",$rpkm;
+open my $f_out, '>', $fastq_out || die "cannot create $fastq_out $!\n";
-if ($piRNA_number != 0 )
+open my $f_uni_out, '>', $fastq_uni_out || die "cannot create $fastq_uni_out $!\n";
-{
-$pirna = ( $counthashR->{$k}->[0]  * 1000000) / $piRNA_number;
-printf $out "\t%.2f",$pirna;
-}
-if ($miRNA_number != 0 )
-{
-$mirna = ( $counthashR->{$k}->[0]  * 1000000) / $miRNA_number;
-printf $out "\t%.2f",$mirna;
-}
-if ($bonafide_number != 0 )
-{
-$bonafide = ( $counthashR->{$k}->[0]  * 1000000) / $bonafide_number;
-printf $out "\t%.2f",$bonafide;
-}
-print $out "\t".$counthashR->{$k}->[1]."\t".$counthashR->{$k}->[2] ;
-my $S1U = 0;
-$S1U = $counthashR->{$k}->[3] / $counthashR->{$k}->[1] * 100 if $counthashR->{$k}->[1] != 0;
-my $R1U = 0;
-$R1U = $counthashR->{$k}->[6] / $counthashR->{$k}->[2] * 100 if $counthashR->{$k}->[2] != 0;
-my $S10A = 0;
-$S10A = $counthashR->{$k}->[5] / $counthashR->{$k}->[1] * 100 if $counthashR->{$k}->[1] != 0;
-my $R10A = 0;
-$R10A = $counthashR->{$k}->[4] / $counthashR->{$k}->[2] * 100 if $counthashR->{$k}->[2] != 0;
-print $out "\t".$S1U."\t".$S10A."\t".$R1U."\t".$R10A;
-print $out "\n";
-}
-close $out;
-}
-sub sam_to_bam_bg
-{
-	my ( $sam, $scale, $number_of_cpus ) = @_;
-	my ( $bam_sorted, $bedgraphM, $bedgraphP, $view_err, $sort_err ) = ( '', '', '', '', '' );
-	if ( $sam =~ /(.*?).sam$/ )
-	{
-		$bam_sorted = $1.'_sorted.bam';
-		$bedgraphP= $1.'_plus.bedgraph';
-		$bedgraphM = $1.'_minus.bedgraph';
-		$view_err = $1.'_view.err';
-		$sort_err = $1.'_sort.err';
-	}
-	`samtools view -Shb   --threads $number_of_cpus '$sam' 2> '$view_err' | samtools sort  -O BAM --threads $number_of_cpus  /dev/stdin 2> '$sort_err'  > '$bam_sorted'`;
-	`bedtools genomecov -scale $scale -strand + -bga -ibam '$bam_sorted' > '$bedgraphP'`;
-	`bedtools genomecov -scale $scale -strand - -bga -ibam '$bam_sorted' > '$bedgraphM'`;
-}
-sub sam_sorted_bam
-{
-	my ( $sam, $number_of_cpus ) = @_;
-	my ( $bam_sorted, $view_err, $sort_err ) = ( '', '', '' );
-	if ( $sam =~ /(.*?).sam$/ )
-	{
-		$bam_sorted = $1.'_sorted.bam';
-		$view_err = $1.'_view.err';
-		$sort_err = $1.'_sort.err';
-	}
-	`samtools view -Shb   --threads $number_of_cpus '$sam' 2> '$view_err' | samtools sort  -O BAM --threads $number_of_cpus  /dev/stdin  2> '$sort_err'  > '$bam_sorted'`;
-}
-sub BWA_call
-{
-	my ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ) = @_;
-	my ( $aln_err, $samse_err, $seq_num ) = ( $sam.'_aln.err', $sam.'_samse.err', 0 );
-	print $report "-----------------------------\n";
-	print $report "bwa aln -t $number_of_cpus -n $mismatches '$index' '$fastq' 2> '$aln_err' | bwa samse $index /dev/stdin '$fastq' 2> '$samse_err' > '$sam'\n";
-	`bwa aln -t $number_of_cpus -n $mismatches '$index' '$fastq' 2> '$aln_err' | bwa samse $index /dev/stdin '$fastq' 2> '$samse_err' > '$sam' `;
-}
-sub rpms_rpkm
-{
-my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_;
-open(my $out, ">".$out_file) || die "cannot open normalized file $! \n";
-print $out "ID\treads counts\tRPKM";
-print $out "\tper million of piRNAs" if ($piRNA_number != 0);
-print $out "\tper million of miRNAs" if ($miRNA_number != 0);
-print $out "\tper million of bonafide reads" if ($bonafide_number != 0);
-print $out "\n";
-foreach my $k  ( sort keys %{$counthashR} )
-{
-my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0);
-$rpkm = ( $counthashR->{$k} * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ;
+open my $s_out, '>', $sam_out || die "cannot create $sam_out file $!\n" if defined ($hashRef);
-print $out $k."\t".$counthashR->{$k}."\t"; printf $out "%.2f",$rpkm;
+open my $s_uni_out, '>', $sam_uni_out || die "cannot create $sam_uni_out file $!\n";
-if ($piRNA_number != 0 )
+my $sequence = '';
-{
+while(<$s_in>)
-$pirna = ( $counthashR->{$k}  * 1000000) / $piRNA_number;
+{
-printf $out "\t%.2f",$pirna;
+if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
-}
+{
-if ($miRNA_number != 0 )
+print $s_out $_ if defined ($hashRef);
-{
+print $s_uni_out $_;
-$mirna = ( $counthashR->{$k}  * 1000000) / $miRNA_number;
+next;
-printf $out "\t%.2f",$mirna;
+}
-}
+my @line = split (/\t/,$_);
-if ($bonafide_number != 0 )
+$sequence = $line[0];
-{
+if ( (! defined ($hashRef) )|| (    exists $hashRef->{$sequence}    &&    $hashRef->{$sequence} == 1 ) )
-$bonafide = ( $counthashR->{$k}  * 1000000) / $bonafide_number;
+{
-printf $out "\t%.2f",$bonafide;
+my $arn    =    $line[9];
-}
+if ($line[1] & 16)
-print $out "\n";
+{
-	}
+$arn =reverse($arn);
-close $out;
+$arn =~ tr/atgcuATGCU/tacgaTACGA/;
 }
-sub extract_sam
+if    ( ( $line[1] == 16 || $line[1] == 0 ) )
 {
-my ( $hashRef, $sam_in, $sam_out, $sam_uni_out, $fastq_out, $fastq_uni_out ) = @_;
+print $f_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ;
+print $s_out $_ if defined ($hashRef);
-	open my $s_in, '<', $sam_in || die "cannot open $sam_in file $!\n";
+if ( $line[11] eq "XT:A:U" )
+{
-open my $f_out, '>', $fastq_out || die "cannot create $fastq_out $!\n";
+print $f_uni_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ;
-open my $f_uni_out, '>', $fastq_uni_out || die "cannot create $fastq_uni_out $!\n";
+print $s_uni_out $_ ;
+}
-open my $s_out, '>', $sam_out || die "cannot create $sam_out file $!\n" if defined ($hashRef);
+}
-open my $s_uni_out, '>', $sam_uni_out || die "cannot create $sam_uni_out file $!\n";
+}
+}
-my $sequence = '';
+close $s_in; close $s_out if defined ($hashRef);
-while(<$s_in>)
+close $s_uni_out; close $f_out; close $f_uni_out;
-{
-if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
-{
-print $s_out $_ if defined ($hashRef);
-print $s_uni_out $_;
-next;
-}
-my @line = split (/\t/,$_);
-$sequence = $line[0];
-if ( (! defined ($hashRef) )|| (  exists $hashRef->{$sequence}  &&  $hashRef->{$sequence} == 1 ) )
-{
-my $arn  =  $line[9];
-if ($line[1] & 16)
-{
-$arn =reverse($arn);
-$arn =~ tr/atgcuATGCU/tacgaTACGA/;
-}
-if  ( ( $line[1] == 16 || $line[1] == 0 ) )
-			{
-	print $f_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ;
-	print $s_out $_ if defined ($hashRef);
-	if ( $line[11] eq "XT:A:U" )
-	{
-		print $f_uni_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ;
-		print $s_uni_out $_ ;
-	}
-}
-}
-}
-close $s_in; close $s_out if defined ($hashRef);
-close $s_uni_out; close $f_out; close $f_uni_out;
 }
 sub get_fastq_seq
 {
 my $fastq = shift;
 my %hash; my $cmp = 0;
 open my $fic, '<', $fastq || die "cannot open input file $! \n";
 while(<$fic>)
 {
 chomp $_;
 $cmp++;
 if ($cmp % 4 == 1)
 {
 die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ );
 if ($_ =~ /^\@(.*)\s.*/) { $hash{$1} = 1;}
 elsif ($_ =~ /^\@(.*)/) { $hash{$1} = 1;}
 }
 elsif ($cmp % 4 == 3 )
 {
 die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/;
 }
 }
 close $fic;
 return \%hash;
 }
 1;

Mercurial > repos > brasset_jensen > srnapipe

comparison bin/align.pm @ 61:9185ca0a7b43 draft