comparison bin/align.pm @ 61:9185ca0a7b43 draft

Updated package according to recommendations.
author pierre.pouchin
date Wed, 16 Jan 2019 08:18:13 -0500
parents 9645d995fb3c
children
comparison
equal deleted inserted replaced
60:9645d995fb3c 61:9185ca0a7b43
8 use FindBin; 8 use FindBin;
9 use lib $FindBin::Bin; 9 use lib $FindBin::Bin;
10 use Rcall qw ( histogram ); 10 use Rcall qw ( histogram );
11 11
12 use Exporter; 12 use Exporter;
13 our @ISA = qw( Exporter ); 13 our @ISA = qw( Exporter );
14 our @EXPORT = qw( &rpms_rpkm_te &BWA_call &to_build &get_unique &sam_sorted_bam &get_hash_alignment &sam_to_bam_bg &sam_count &sam_count_mis &rpms_rpkm &get_fastq_seq &extract_sam ); 14 our @EXPORT = qw( &rpms_rpkm_te &BWA_call &to_build &get_unique &sam_sorted_bam &get_hash_alignment &sam_to_bam_bg &sam_count &sam_count_mis &rpms_rpkm &get_fastq_seq &extract_sam );
15 15
16 sub to_build 16 sub to_build
17 { 17 {
18 my ( $toBuildTabP, $log, $newdir ) = @_; 18 my ( $toBuildTabP, $log, $newdir ) = @_;
19 19
20 foreach my $pairs ( @{ $toBuildTabP } ) 20 foreach my $pairs ( @{ $toBuildTabP } )
21 { 21 {
22 if ( $pairs->[0] == 1 ) 22 if ( $pairs->[0] == 1 )
23 { 23 {
24 my $sym = $newdir.basename(${$pairs->[1]}).'_symlink.fa'; 24 my $sym = $newdir.basename(${$pairs->[1]}).'_symlink.fa';
25 symlink( ${$pairs->[1]}, $sym ); 25 symlink( ${$pairs->[1]}, $sym );
26 ${$pairs->[1]} = $sym; 26 ${$pairs->[1]} = $sym;
27 build_index ( ${$pairs->[1]}, $log ); 27 build_index ( ${$pairs->[1]}, $log );
28 } 28 }
29 } 29 }
30 } 30 }
31 31
32 sub build_index 32 sub build_index
33 { 33 {
34 my $to_index = shift; 34 my $to_index = shift;
35 my $log = shift; 35 my $log = shift;
36 my $index_log = $to_index.'_index.err'; 36 my $index_log = $to_index.'_index.err';
37 `bwa index '$to_index' 2> '$index_log'`; 37 `bwa index '$to_index' 2> '$index_log'`;
38 print $log "Creating index for $to_index\n"; 38 print $log "Creating index for $to_index\n";
39 } 39 }
40 40
41 sub get_unique 41 sub get_unique
42 { 42 {
43 my ( $sam, $s_uni, $out_prefix, $col_prefix, $details, $report ) = @_; 43 my ( $sam, $s_uni, $out_prefix, $col_prefix, $details, $report ) = @_;
44 44
45 my $fout = $col_prefix.'_all_mappers.fastq'; 45 my $fout = $col_prefix.'_all_mappers.fastq';
46 my $funi = $col_prefix.'_unique_mappers.fastq'; 46 my $funi = $col_prefix.'_unique_mappers.fastq';
47 my $frej = $col_prefix.'_unmapped.fastq'; 47 my $frej = $col_prefix.'_unmapped.fastq';
48 48
49 my $repartition = $out_prefix.'distribution.txt'; 49 my $repartition = $out_prefix.'distribution.txt';
50 my $png_rep = $out_prefix.'distribution.png'; 50 my $png_rep = $out_prefix.'distribution.png';
51 my ( %duplicates, %genome_hits) ; 51 my ( %duplicates, %genome_hits) ;
52 52
53 #alignement to the first reference 53 #alignement to the first reference
54 my @return = sam_parse( $sam, $fout, $funi, $frej, $s_uni, \%duplicates, \%genome_hits, $report ); 54 my @return = sam_parse( $sam, $fout, $funi, $frej, $s_uni, \%duplicates, \%genome_hits, $report );
55 my $ref_fai = $return[4]; 55 my $ref_fai = $return[4];
56 my $mappers = $return[5]; 56 my $mappers = $return[5];
57 my $mappers_uni = $return[6]; 57 my $mappers_uni = $return[6];
58 my $size_mappedHashR = $return[7]; 58 my $size_mappedHashR = $return[7];
59 59
60 if ( $details == 1 ) 60 if ( $details == 1 )
61 { 61 {
62 #print number of duplicates and hits number 62 #print number of duplicates and hits number
63 my ($pourcentage, $total) =(0,0); 63 my ($pourcentage, $total) =(0,0);
64 64
65 $total += $_ foreach values %{$size_mappedHashR}; 65 $total += $_ foreach values %{$size_mappedHashR};
66 open (my $rep, '>'.$repartition) || die "cannot create $repartition $!\n"; 66 open (my $rep, '>'.$repartition) || die "cannot create $repartition $!\n";
67 print $rep "size\tnumber\tpercentage\n"; 67 print $rep "size\tnumber\tpercentage\n";
68 foreach my $k (sort{$a cmp $b} keys (%{$size_mappedHashR})) 68 foreach my $k (sort{$a cmp $b} keys (%{$size_mappedHashR}))
69 { 69 {
70 $pourcentage = 0; 70 $pourcentage = 0;
71 $pourcentage = $size_mappedHashR->{$k} / $total * 100 unless $total ==0; 71 $pourcentage = $size_mappedHashR->{$k} / $total * 100 unless $total ==0;
72 72
73 print $rep "$k\t$size_mappedHashR->{$k}\t"; 73 print $rep "$k\t$size_mappedHashR->{$k}\t";
74 printf $rep "%.2f\n",$pourcentage; 74 printf $rep "%.2f\n",$pourcentage;
75 } 75 }
76 76
77 histogram($size_mappedHashR, $png_rep, $total); 77 histogram($size_mappedHashR, $png_rep, $total);
78 78
79 79
80 my $dup = $out_prefix.'dup_mapnum.txt'; 80 my $dup = $out_prefix.'dup_mapnum.txt';
81 my $dup_u = $out_prefix .'dup_unique.txt'; 81 my $dup_u = $out_prefix .'dup_unique.txt';
82 my $dup_r = $out_prefix .'dup_nonmapp.txt'; 82 my $dup_r = $out_prefix .'dup_nonmapp.txt';
83 open(my $tab,">".$dup) || die "cannot open output txt file\n"; 83 open(my $tab,">".$dup) || die "cannot open output txt file\n";
84 open(my $tab_r,">".$dup_r) || die "cannot open output txt file\n"; 84 open(my $tab_r,">".$dup_r) || die "cannot open output txt file\n";
85 open(my $tab_u,">".$dup_u) || die "cannot open output txt file\n"; 85 open(my $tab_u,">".$dup_u) || die "cannot open output txt file\n";
86 print $tab "sequence\tcount\tmapnum\n"; 86 print $tab "sequence\tcount\tmapnum\n";
87 print $tab_u "sequence\tcount\n"; 87 print $tab_u "sequence\tcount\n";
88 print $tab_r "sequence\tcount\n"; 88 print $tab_r "sequence\tcount\n";
89 foreach my $k (sort {$duplicates{$b} <=> $duplicates{$a}}keys %duplicates) 89 foreach my $k (sort {$duplicates{$b} <=> $duplicates{$a}}keys %duplicates)
90 { 90 {
91 $duplicates{$k} = 0 unless exists($duplicates{$k}); 91 $duplicates{$k} = 0 unless exists($duplicates{$k});
92 $genome_hits{$k} = 0 unless exists($genome_hits{$k}); 92 $genome_hits{$k} = 0 unless exists($genome_hits{$k});
93 if ($genome_hits{$k} != 0) { print $tab $k."\t".$duplicates{$k}."\t".$genome_hits{$k}."\n"; } 93 if ($genome_hits{$k} != 0) { print $tab $k."\t".$duplicates{$k}."\t".$genome_hits{$k}."\n"; }
94 else {print $tab_r $k."\t".$duplicates{$k}."\n";} 94 else {print $tab_r $k."\t".$duplicates{$k}."\n";}
95 if ($genome_hits{$k} == 1) { print $tab_u $k."\t".$duplicates{$k}."\n"; } 95 if ($genome_hits{$k} == 1) { print $tab_u $k."\t".$duplicates{$k}."\n"; }
96 } 96 }
97 close $dup; close $dup_r; close $dup_u; 97 close $dup; close $dup_r; close $dup_u;
98 } 98 }
99 return ( $ref_fai, $mappers, $mappers_uni ); 99 return ( $ref_fai, $mappers, $mappers_uni );
100 } 100 }
101 101
102 sub sam_parse 102 sub sam_parse
103 { 103 {
104 my ( $sam, $fastq_accepted, $fastq_accepted_unique, $fastq_rejected, $sam_unique, $duplicate_hashR, $best_hit_number_hashR, $report ) = @_ ; 104 my ( $sam, $fastq_accepted, $fastq_accepted_unique, $fastq_rejected, $sam_unique, $duplicate_hashR, $best_hit_number_hashR, $report ) = @_ ;
105 my ($reads, $mappers, $mappersUnique, @garbage, %size_num, %size_num_spe, %number, %numberSens, %numberReverse, %unique_number, %numberNM, %numberM, %size); 105 my ($reads, $mappers, $mappersUnique, @garbage, %size_num, %size_num_spe, %number, %numberSens, %numberReverse, %unique_number, %numberNM, %numberM, %size);
106 $mappers = $mappersUnique = $reads = 0; 106 $mappers = $mappersUnique = $reads = 0;
107 107
108 open my $fic, '<', $sam || die "cannot open $sam $!\n"; 108 open my $fic, '<', $sam || die "cannot open $sam $!\n";
109 open my $accepted, '>', $fastq_accepted || die "cannot create $fastq_accepted $! \n"; 109 open my $accepted, '>', $fastq_accepted || die "cannot create $fastq_accepted $! \n";
110 open my $unique, '>', $fastq_accepted_unique || die "cannot create $fastq_accepted_unique $! \n"; 110 open my $unique, '>', $fastq_accepted_unique || die "cannot create $fastq_accepted_unique $! \n";
111 open my $rejected, '>', $fastq_rejected || die "cannot create $fastq_rejected $! \n"; 111 open my $rejected, '>', $fastq_rejected || die "cannot create $fastq_rejected $! \n";
112 open my $sam_uni, '>', $sam_unique || die "cannot create $sam_unique $! \n"; 112 open my $sam_uni, '>', $sam_unique || die "cannot create $sam_unique $! \n";
113 my $sequence = ''; 113 my $sequence = '';
114 while(<$fic>) 114 while(<$fic>)
115 { 115 {
116 chomp $_; 116 chomp $_;
117 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) 117 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
118 { 118 {
119 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) 119 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
120 { 120 {
121 $size{$1} = $2; 121 $size{$1} = $2;
122 $unique_number{$1} = 0; 122 $unique_number{$1} = 0;
123 $number{$1} = 0; 123 $number{$1} = 0;
124 $numberNM{$1} = 0; 124 $numberNM{$1} = 0;
125 $numberM{$1} = 0; 125 $numberM{$1} = 0;
126 } 126 }
127 print $sam_uni $_."\n"; 127 print $sam_uni $_."\n";
128 next; 128 next;
129 } 129 }
130 $reads++; 130 $reads++;
131 my @line = split (/\t/,$_); 131 my @line = split (/\t/,$_);
132 $sequence = $line[9]; 132 $sequence = $line[9];
133 if ($line[1] & 16) 133 if ($line[1] & 16)
134 { 134 {
135 $sequence =reverse($sequence); 135 $sequence =reverse($sequence);
136 $sequence =~ tr/atgcuATGCU/tacgaTACGA/; 136 $sequence =~ tr/atgcuATGCU/tacgaTACGA/;
137 } 137 }
138 if ($line[1] == 16 || $line[1] == 0) 138 if ($line[1] == 16 || $line[1] == 0)
139 { 139 {
140 my $len = length($sequence); 140 my $len = length($sequence);
141 $size_num{$len} ++; 141 $size_num{$len} ++;
142 $size_num_spe{$line[2]}{$len}++; 142 $size_num_spe{$line[2]}{$len}++;
143 $mappers ++; 143 $mappers ++;
144 144
145 ${$best_hit_number_hashR}{$sequence} = $1 if ($line[13] =~ /X0:i:(\d*)/ || $line[14] =~/X0:i:(\d*)/ ); 145 ${$best_hit_number_hashR}{$sequence} = $1 if ($line[13] =~ /X0:i:(\d*)/ || $line[14] =~/X0:i:(\d*)/ );
146 ${$duplicate_hashR}{$sequence}++; 146 ${$duplicate_hashR}{$sequence}++;
147 $number{$line[2]}++; 147 $number{$line[2]}++;
148 $numberSens{$line[2]}++ if $line[1] == 0 ; 148 $numberSens{$line[2]}++ if $line[1] == 0 ;
149 $numberReverse{$line[2]}++ if $line[1] == 16 ; 149 $numberReverse{$line[2]}++ if $line[1] == 16 ;
150 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; 150 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
151 151
152 if ($line[11] eq "XT:A:U") 152 if ($line[11] eq "XT:A:U")
153 { 153 {
154 $unique_number{$line[2]}++; 154 $unique_number{$line[2]}++;
155 $mappersUnique++; 155 $mappersUnique++;
156 print $unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; 156 print $unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
157 print $sam_uni $_."\n"; 157 print $sam_uni $_."\n";
158 } 158 }
159 if ($_ =~ /.*XM:i:(\d+).*/) 159 if ($_ =~ /.*XM:i:(\d+).*/)
160 { 160 {
161 if ($1 == 0){$numberNM{$line[2]}++;}else{$numberM{$line[2]}++;} 161 if ($1 == 0){$numberNM{$line[2]}++;}else{$numberM{$line[2]}++;}
162 } 162 }
163 } 163 }
164 else 164 else
165 { 165 {
166 ${$best_hit_number_hashR}{$sequence} = 0; 166 ${$best_hit_number_hashR}{$sequence} = 0;
167 ${$duplicate_hashR}{$sequence}++; 167 ${$duplicate_hashR}{$sequence}++;
168 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; 168 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
169 } 169 }
170 } 170 }
171 close $fic; close $accepted; close $unique; close $rejected; close $sam_uni; 171 close $fic; close $accepted; close $unique; close $rejected; close $sam_uni;
172 172
173 print $report "Parsing $sam file\n"; 173 print $report "Parsing $sam file\n";
174 print $report "\treads: $reads\n"; 174 print $report "\treads: $reads\n";
175 print $report "\tmappers: $mappers\n"; 175 print $report "\tmappers: $mappers\n";
176 print $report "\tunique mappers: $mappersUnique\n"; 176 print $report "\tunique mappers: $mappersUnique\n";
177 print $report "-----------------------------\n"; 177 print $report "-----------------------------\n";
178 return (\%number, \%unique_number, \%numberSens, \%numberReverse, \%size, $mappers, $mappersUnique, \%size_num, \%size_num_spe, \%numberNM, \%numberM ); 178 return (\%number, \%unique_number, \%numberSens, \%numberReverse, \%size, $mappers, $mappersUnique, \%size_num, \%size_num_spe, \%numberNM, \%numberM );
179 } 179 }
180 180
181 sub get_hash_alignment 181 sub get_hash_alignment
182 { 182 {
183 my ($index, $mismatches, $accept, $reject, $outA, $outR, $fastq, $number_of_cpus, $name, $sam, $report, $fai_f) = @_ ; 183 my ($index, $mismatches, $accept, $reject, $outA, $outR, $fastq, $number_of_cpus, $name, $sam, $report, $fai_f) = @_ ;
184 my ($reads, $mappers, $unmapped) = (0,0,0); 184 my ($reads, $mappers, $unmapped) = (0,0,0);
185 my $accep_unique; 185 my $accep_unique;
186 BWA_call ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ); 186 BWA_call ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report );
187 187
188 open my $fic, '<', $sam || die "cannot open $sam $!\n"; 188 open my $fic, '<', $sam || die "cannot open $sam $!\n";
189 open my $accepted, '>', $outA || die "cannot open $outA\n" if $accept == 1; 189 open my $accepted, '>', $outA || die "cannot open $outA\n" if $accept == 1;
190 open my $rejected, '>', $outR || die "cannot open $outR\n" if $reject == 1; 190 open my $rejected, '>', $outR || die "cannot open $outR\n" if $reject == 1;
191 open my $fai, '>', $fai_f || die "cannot open $fai_f\n" if $fai_f; 191 open my $fai, '>', $fai_f || die "cannot open $fai_f\n" if $fai_f;
192 #if ($name eq "snRNAs") { 192 #if ($name eq "snRNAs") {
193 # open ( $accep_unique, ">".$1."-unique.fastq") if $outR =~ /(.*)\.fastq/; 193 # open ( $accep_unique, ">".$1."-unique.fastq") if $outR =~ /(.*)\.fastq/;
194 #} 194 #}
195 my $sequence = ''; 195 my $sequence = '';
196 while(<$fic>) 196 while(<$fic>)
197 { 197 {
198 chomp $_; 198 chomp $_;
199 if( $_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) 199 if( $_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
200 { 200 {
201 if ($fai_f && $_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) 201 if ($fai_f && $_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
202 { 202 {
203 print $fai $1."\t".$2."\n"; 203 print $fai $1."\t".$2."\n";
204 } 204 }
205 next; 205 next;
206 } 206 }
207 $reads++; 207 $reads++;
208 my @line = split (/\t/,$_); 208 my @line = split (/\t/,$_);
209 $sequence = $line[9]; 209 $sequence = $line[9];
210 if ($line[1] & 16) 210 if ($line[1] & 16)
211 { 211 {
212 $sequence =reverse($sequence); 212 $sequence =reverse($sequence);
213 $sequence =~ tr/atgcuATGCU/tacgaTACGA/; 213 $sequence =~ tr/atgcuATGCU/tacgaTACGA/;
214 } 214 }
215 if ($line[1] & 16 || $line[1] == 0) 215 if ($line[1] & 16 || $line[1] == 0)
216 { 216 {
217 $mappers ++; 217 $mappers ++;
218 if ($accept == 1 ) 218 if ($accept == 1 )
219 { 219 {
220 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; 220 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
221 # print $accep_unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if ($name eq "snRNAs" && $line[11] eq "XT:A:U"); 221 # print $accep_unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if ($name eq "snRNAs" && $line[11] eq "XT:A:U");
222 } 222 }
223 } 223 }
224 else 224 else
225 { 225 {
226 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if $reject == 1; 226 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if $reject == 1;
227 $unmapped++; 227 $unmapped++;
228 } 228 }
229 } 229 }
230 # close $accep_unique if ($name eq "bonafide_reads"); 230 # close $accep_unique if ($name eq "bonafide_reads");
231 close $fic; 231 close $fic;
232 close $accepted if $accept == 1; 232 close $accepted if $accept == 1;
233 close $rejected if $reject ==1; 233 close $rejected if $reject ==1;
234 close $fai if $fai_f; 234 close $fai if $fai_f;
235 print $report "\treads: $reads\n"; 235 print $report "\treads: $reads\n";
236 print $report "\tmappers: $mappers\n"; 236 print $report "\tmappers: $mappers\n";
237 print $report "\tunmapped: $unmapped\n"; 237 print $report "\tunmapped: $unmapped\n";
238 print $report "-----------------------------\n"; 238 print $report "-----------------------------\n";
239 return ($mappers, $unmapped); 239 return ($mappers, $unmapped);
240 } 240 }
241 241
242 sub sam_count 242 sub sam_count
243 { 243 {
244 my $sam = shift; 244 my $sam = shift;
245 my ( %number, %size ); 245 my ( %number, %size );
246 246
247 open my $fic, '<', $sam || die "cannot open $sam file $!\n"; 247 open my $fic, '<', $sam || die "cannot open $sam file $!\n";
248 while(<$fic>) 248 while(<$fic>)
249 { 249 {
250 chomp $_; 250 chomp $_;
251 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) 251 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
252 { 252 {
253 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) 253 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
254 { 254 {
255 $size{$1} = $2; 255 $size{$1} = $2;
256 $number{$1} = 0; 256 $number{$1} = 0;
257 } 257 }
258 } 258 }
259 else 259 else
260 { 260 {
261 my @line = split (/\t/,$_); 261 my @line = split (/\t/,$_);
262 if ( $line[1] & 16 || $line[1] == 0 ) 262 if ( $line[1] & 16 || $line[1] == 0 )
263 { 263 {
264 $number{$line[2]}++; 264 $number{$line[2]}++;
265 } 265 }
266 }
267 }
268 close $fic;
269 return ( \%number, \%size );
270 }
271
272 sub sam_count_mis
273 {
274
275 my $sam = shift;
276 my ( %number, %numberNM, %numberM, %size);
277
278 open my $fic, '<', $sam || die "cannot open $sam file $!\n";
279 while(<$fic>)
280 {
281 chomp $_;
282 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
283 {
284 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
285 {
286 $size{$1} = $2;
287 $number{$1} = [0,0,0,0,0,0,0];
288 $numberNM{$1} = [0,0,0,0,0,0,0];
289 $numberM{$1} = [0,0,0,0,0,0,0];
290 }
291 }
292 else
293 {
294 my @line = split (/\t/,$_);
295 my @seq = split //, $line[9];
296 if ( $line[1] == 16 || $line[1] == 0 )
297 {
298 $number{ $line[2] }->[0]++;
299 if ($line[1] == 0)
300 {
301 $number{$line[2]}->[1]++;
302 $number{$line[2]}->[3]++ if $seq[0] eq 'T';
303 $number{$line[2]}->[5]++ if $seq[9] eq 'A';
304 }
305 else
306 {
307 $number{$line[2]}->[2]++;
308 $number{$line[2]}->[4]++ if $seq[9] eq 'A';
309 $number{$line[2]}->[6]++ if $seq[0] eq 'T';
310 }
311 if ($_ =~ /.*XM:i:(\d+).*/)
312 {
313 if ( $1 == 0 )
314 {
315 $numberNM{$line[2]}->[0]++;
316 if ($line[1] == 0)
317 {
318 $numberNM{$line[2]}->[1]++;
319 $numberNM{$line[2]}->[3]++ if $seq[0] eq 'T';
320 $numberNM{$line[2]}->[5]++ if $seq[9] eq 'A';
321 }
322 else
323 {
324 $numberNM{$line[2]}->[2]++;
325 $numberNM{$line[2]}->[4]++ if $seq[9] eq 'A';
326 $numberNM{$line[2]}->[6]++ if $seq[0] eq 'T';
327 }
328 } 266 }
329 else 267 }
330 { 268 close $fic;
331 $numberM{$line[2]}->[0]++; 269 return ( \%number, \%size );
332 if ($line[1] == 0) 270 }
333 { 271
334 $numberM{$line[2]}->[1]++; 272 sub sam_count_mis
335 $numberM{$line[2]}->[3]++ if $seq[0] eq 'T'; 273 {
336 $numberM{$line[2]}->[5]++ if $seq[9] eq 'A'; 274
337 } 275 my $sam = shift;
338 else 276 my ( %number, %numberNM, %numberM, %size);
339 { 277
340 $numberM{$line[2]}->[2]++; 278 open my $fic, '<', $sam || die "cannot open $sam file $!\n";
341 $numberM{$line[2]}->[4]++ if $seq[9] eq 'A'; 279 while(<$fic>)
342 $numberM{$line[2]}->[6]++ if $seq[0] eq 'T'; 280 {
343 } 281 chomp $_;
344 } 282 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
345 } 283 {
346 } 284 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
347 } 285 {
348 } 286 $size{$1} = $2;
349 return (\%number, \%size, \%numberNM, \%numberM ); 287 $number{$1} = [0,0,0,0,0,0,0];
288 $numberNM{$1} = [0,0,0,0,0,0,0];
289 $numberM{$1} = [0,0,0,0,0,0,0];
290 }
291 }
292 else
293 {
294 my @line = split (/\t/,$_);
295 my @seq = split //, $line[9];
296 if ( $line[1] == 16 || $line[1] == 0 )
297 {
298 $number{ $line[2] }->[0]++;
299 if ($line[1] == 0)
300 {
301 $number{$line[2]}->[1]++;
302 $number{$line[2]}->[3]++ if $seq[0] eq 'T';
303 $number{$line[2]}->[5]++ if $seq[9] eq 'A';
304 }
305 else
306 {
307 $number{$line[2]}->[2]++;
308 $number{$line[2]}->[4]++ if $seq[9] eq 'A';
309 $number{$line[2]}->[6]++ if $seq[0] eq 'T';
310 }
311 if ($_ =~ /.*XM:i:(\d+).*/)
312 {
313 if ( $1 == 0 )
314 {
315 $numberNM{$line[2]}->[0]++;
316 if ($line[1] == 0)
317 {
318 $numberNM{$line[2]}->[1]++;
319 $numberNM{$line[2]}->[3]++ if $seq[0] eq 'T';
320 $numberNM{$line[2]}->[5]++ if $seq[9] eq 'A';
321 }
322 else
323 {
324 $numberNM{$line[2]}->[2]++;
325 $numberNM{$line[2]}->[4]++ if $seq[9] eq 'A';
326 $numberNM{$line[2]}->[6]++ if $seq[0] eq 'T';
327 }
328 }
329 else
330 {
331 $numberM{$line[2]}->[0]++;
332 if ($line[1] == 0)
333 {
334 $numberM{$line[2]}->[1]++;
335 $numberM{$line[2]}->[3]++ if $seq[0] eq 'T';
336 $numberM{$line[2]}->[5]++ if $seq[9] eq 'A';
337 }
338 else
339 {
340 $numberM{$line[2]}->[2]++;
341 $numberM{$line[2]}->[4]++ if $seq[9] eq 'A';
342 $numberM{$line[2]}->[6]++ if $seq[0] eq 'T';
343 }
344 }
345 }
346 }
347 }
348 }
349 return (\%number, \%size, \%numberNM, \%numberM );
350 } 350 }
351 351
352 sub rpms_rpkm_te 352 sub rpms_rpkm_te
353 { 353 {
354 my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_; 354 my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_;
355 open(my $out, ">".$out_file) || die "cannot open normalized file $! \n"; 355 open(my $out, ">".$out_file) || die "cannot open normalized file $! \n";
356 print $out "ID\treads counts\tRPKM"; 356 print $out "ID\treads counts\tRPKM";
357 print $out "\tper million of piRNAs" if ($piRNA_number != 0); 357 print $out "\tper million of piRNAs" if ($piRNA_number != 0);
358 print $out "\tper million of miRNAs" if ($miRNA_number != 0); 358 print $out "\tper million of miRNAs" if ($miRNA_number != 0);
359 print $out "\tper million of bonafide reads" if ($bonafide_number != 0); 359 print $out "\tper million of bonafide reads" if ($bonafide_number != 0);
360 print $out "\tsense reads counts\treverse reads counts"; 360 print $out "\tsense reads counts\treverse reads counts";
361 print $out "\t% of sense 1U\t% of sense 10A\t% of reverse 1U\t% of reverse 10A\n"; 361 print $out "\t% of sense 1U\t% of sense 10A\t% of reverse 1U\t% of reverse 10A\n";
362 foreach my $k ( sort keys %{$counthashR} ) 362 foreach my $k ( sort keys %{$counthashR} )
363 { 363 {
364 my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0); 364 my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0);
365
366 $rpkm = ( $counthashR->{$k}->[0] * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ;
367 print $out $k."\t".$counthashR->{$k}->[0]."\t"; printf $out "%.2f",$rpkm;
368
369 if ($piRNA_number != 0 )
370 {
371 $pirna = ( $counthashR->{$k}->[0] * 1000000) / $piRNA_number;
372 printf $out "\t%.2f",$pirna;
373 }
374 if ($miRNA_number != 0 )
375 {
376 $mirna = ( $counthashR->{$k}->[0] * 1000000) / $miRNA_number;
377 printf $out "\t%.2f",$mirna;
378 }
379 if ($bonafide_number != 0 )
380 {
381 $bonafide = ( $counthashR->{$k}->[0] * 1000000) / $bonafide_number;
382 printf $out "\t%.2f",$bonafide;
383 }
384
385 print $out "\t".$counthashR->{$k}->[1]."\t".$counthashR->{$k}->[2] ;
386 my $S1U = 0;
387 $S1U = $counthashR->{$k}->[3] / $counthashR->{$k}->[1] * 100 if $counthashR->{$k}->[1] != 0;
388 my $R1U = 0;
389 $R1U = $counthashR->{$k}->[6] / $counthashR->{$k}->[2] * 100 if $counthashR->{$k}->[2] != 0;
390 my $S10A = 0;
391 $S10A = $counthashR->{$k}->[5] / $counthashR->{$k}->[1] * 100 if $counthashR->{$k}->[1] != 0;
392 my $R10A = 0;
393 $R10A = $counthashR->{$k}->[4] / $counthashR->{$k}->[2] * 100 if $counthashR->{$k}->[2] != 0;
394 print $out "\t".$S1U."\t".$S10A."\t".$R1U."\t".$R10A;
395
396 print $out "\n";
397 }
398 close $out;
399 }
400
401
402 sub sam_to_bam_bg
403 {
404 my ( $sam, $scale, $number_of_cpus ) = @_;
405 my ( $bam_sorted, $bedgraphM, $bedgraphP, $view_err, $sort_err ) = ( '', '', '', '', '' );
406 if ( $sam =~ /(.*?).sam$/ )
407 {
408 $bam_sorted = $1.'_sorted.bam';
409 $bedgraphP= $1.'_plus.bedgraph';
410 $bedgraphM = $1.'_minus.bedgraph';
411 $view_err = $1.'_view.err';
412 $sort_err = $1.'_sort.err';
413 }
414 `samtools view -Shb --threads $number_of_cpus '$sam' 2> '$view_err' | samtools sort -O BAM --threads $number_of_cpus /dev/stdin 2> '$sort_err' > '$bam_sorted'`;
415 `bedtools genomecov -scale $scale -strand + -bga -ibam '$bam_sorted' > '$bedgraphP'`;
416 `bedtools genomecov -scale $scale -strand - -bga -ibam '$bam_sorted' > '$bedgraphM'`;
417 }
418
419 sub sam_sorted_bam
420 {
421 my ( $sam, $number_of_cpus ) = @_;
422 my ( $bam_sorted, $view_err, $sort_err ) = ( '', '', '' );
423 if ( $sam =~ /(.*?).sam$/ )
424 {
425 $bam_sorted = $1.'_sorted.bam';
426 $view_err = $1.'_view.err';
427 $sort_err = $1.'_sort.err';
428
429 }
430 `samtools view -Shb --threads $number_of_cpus '$sam' 2> '$view_err' | samtools sort -O BAM --threads $number_of_cpus /dev/stdin 2> '$sort_err' > '$bam_sorted'`;
431 }
432
433 sub BWA_call
434 {
435 my ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ) = @_;
436 my ( $aln_err, $samse_err, $seq_num ) = ( $sam.'_aln.err', $sam.'_samse.err', 0 );
437 print $report "-----------------------------\n";
438 print $report "bwa aln -t $number_of_cpus -n $mismatches '$index' '$fastq' 2> '$aln_err' | bwa samse $index /dev/stdin '$fastq' 2> '$samse_err' > '$sam'\n";
439 `bwa aln -t $number_of_cpus -n $mismatches '$index' '$fastq' 2> '$aln_err' | bwa samse $index /dev/stdin '$fastq' 2> '$samse_err' > '$sam' `;
440 }
441
442 sub rpms_rpkm
443 {
444 my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_;
445 open(my $out, ">".$out_file) || die "cannot open normalized file $! \n";
446 print $out "ID\treads counts\tRPKM";
447 print $out "\tper million of piRNAs" if ($piRNA_number != 0);
448 print $out "\tper million of miRNAs" if ($miRNA_number != 0);
449 print $out "\tper million of bonafide reads" if ($bonafide_number != 0);
450 print $out "\n";
451 foreach my $k ( sort keys %{$counthashR} )
452 {
453 my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0);
454
455 $rpkm = ( $counthashR->{$k} * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ;
456 print $out $k."\t".$counthashR->{$k}."\t"; printf $out "%.2f",$rpkm;
457
458 if ($piRNA_number != 0 )
459 {
460 $pirna = ( $counthashR->{$k} * 1000000) / $piRNA_number;
461 printf $out "\t%.2f",$pirna;
462 }
463 if ($miRNA_number != 0 )
464 {
465 $mirna = ( $counthashR->{$k} * 1000000) / $miRNA_number;
466 printf $out "\t%.2f",$mirna;
467 }
468 if ($bonafide_number != 0 )
469 {
470 $bonafide = ( $counthashR->{$k} * 1000000) / $bonafide_number;
471 printf $out "\t%.2f",$bonafide;
472 }
473 print $out "\n";
474 }
475 close $out;
476 }
477
478 sub extract_sam
479 {
480 my ( $hashRef, $sam_in, $sam_out, $sam_uni_out, $fastq_out, $fastq_uni_out ) = @_;
365 481
366 $rpkm = ( $counthashR->{$k}->[0] * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ; 482 open my $s_in, '<', $sam_in || die "cannot open $sam_in file $!\n";
367 print $out $k."\t".$counthashR->{$k}->[0]."\t"; printf $out "%.2f",$rpkm; 483
368 484 open my $f_out, '>', $fastq_out || die "cannot create $fastq_out $!\n";
369 if ($piRNA_number != 0 ) 485 open my $f_uni_out, '>', $fastq_uni_out || die "cannot create $fastq_uni_out $!\n";
370 {
371 $pirna = ( $counthashR->{$k}->[0] * 1000000) / $piRNA_number;
372 printf $out "\t%.2f",$pirna;
373 }
374 if ($miRNA_number != 0 )
375 {
376 $mirna = ( $counthashR->{$k}->[0] * 1000000) / $miRNA_number;
377 printf $out "\t%.2f",$mirna;
378 }
379 if ($bonafide_number != 0 )
380 {
381 $bonafide = ( $counthashR->{$k}->[0] * 1000000) / $bonafide_number;
382 printf $out "\t%.2f",$bonafide;
383 }
384
385 print $out "\t".$counthashR->{$k}->[1]."\t".$counthashR->{$k}->[2] ;
386 my $S1U = 0;
387 $S1U = $counthashR->{$k}->[3] / $counthashR->{$k}->[1] * 100 if $counthashR->{$k}->[1] != 0;
388 my $R1U = 0;
389 $R1U = $counthashR->{$k}->[6] / $counthashR->{$k}->[2] * 100 if $counthashR->{$k}->[2] != 0;
390 my $S10A = 0;
391 $S10A = $counthashR->{$k}->[5] / $counthashR->{$k}->[1] * 100 if $counthashR->{$k}->[1] != 0;
392 my $R10A = 0;
393 $R10A = $counthashR->{$k}->[4] / $counthashR->{$k}->[2] * 100 if $counthashR->{$k}->[2] != 0;
394
395 print $out "\t".$S1U."\t".$S10A."\t".$R1U."\t".$R10A;
396
397 print $out "\n";
398 }
399 close $out;
400 }
401
402
403 sub sam_to_bam_bg
404 {
405 my ( $sam, $scale, $number_of_cpus ) = @_;
406 my ( $bam_sorted, $bedgraphM, $bedgraphP, $view_err, $sort_err ) = ( '', '', '', '', '' );
407 if ( $sam =~ /(.*?).sam$/ )
408 {
409 $bam_sorted = $1.'_sorted.bam';
410 $bedgraphP= $1.'_plus.bedgraph';
411 $bedgraphM = $1.'_minus.bedgraph';
412 $view_err = $1.'_view.err';
413 $sort_err = $1.'_sort.err';
414 }
415 `samtools view -Shb --threads $number_of_cpus '$sam' 2> '$view_err' | samtools sort -O BAM --threads $number_of_cpus /dev/stdin 2> '$sort_err' > '$bam_sorted'`;
416 `bedtools genomecov -scale $scale -strand + -bga -ibam '$bam_sorted' > '$bedgraphP'`;
417 `bedtools genomecov -scale $scale -strand - -bga -ibam '$bam_sorted' > '$bedgraphM'`;
418 }
419
420 sub sam_sorted_bam
421 {
422 my ( $sam, $number_of_cpus ) = @_;
423 my ( $bam_sorted, $view_err, $sort_err ) = ( '', '', '' );
424 if ( $sam =~ /(.*?).sam$/ )
425 {
426 $bam_sorted = $1.'_sorted.bam';
427 $view_err = $1.'_view.err';
428 $sort_err = $1.'_sort.err';
429
430 }
431 `samtools view -Shb --threads $number_of_cpus '$sam' 2> '$view_err' | samtools sort -O BAM --threads $number_of_cpus /dev/stdin 2> '$sort_err' > '$bam_sorted'`;
432 }
433
434 sub BWA_call
435 {
436 my ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ) = @_;
437 my ( $aln_err, $samse_err, $seq_num ) = ( $sam.'_aln.err', $sam.'_samse.err', 0 );
438 print $report "-----------------------------\n";
439 print $report "bwa aln -t $number_of_cpus -n $mismatches '$index' '$fastq' 2> '$aln_err' | bwa samse $index /dev/stdin '$fastq' 2> '$samse_err' > '$sam'\n";
440 `bwa aln -t $number_of_cpus -n $mismatches '$index' '$fastq' 2> '$aln_err' | bwa samse $index /dev/stdin '$fastq' 2> '$samse_err' > '$sam' `;
441 }
442
443 sub rpms_rpkm
444 {
445 my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_;
446 open(my $out, ">".$out_file) || die "cannot open normalized file $! \n";
447 print $out "ID\treads counts\tRPKM";
448 print $out "\tper million of piRNAs" if ($piRNA_number != 0);
449 print $out "\tper million of miRNAs" if ($miRNA_number != 0);
450 print $out "\tper million of bonafide reads" if ($bonafide_number != 0);
451 print $out "\n";
452 foreach my $k ( sort keys %{$counthashR} )
453 {
454 my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0);
455 486
456 $rpkm = ( $counthashR->{$k} * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ; 487 open my $s_out, '>', $sam_out || die "cannot create $sam_out file $!\n" if defined ($hashRef);
457 print $out $k."\t".$counthashR->{$k}."\t"; printf $out "%.2f",$rpkm; 488 open my $s_uni_out, '>', $sam_uni_out || die "cannot create $sam_uni_out file $!\n";
458 489
459 if ($piRNA_number != 0 ) 490 my $sequence = '';
460 { 491 while(<$s_in>)
461 $pirna = ( $counthashR->{$k} * 1000000) / $piRNA_number; 492 {
462 printf $out "\t%.2f",$pirna; 493 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
463 } 494 {
464 if ($miRNA_number != 0 ) 495 print $s_out $_ if defined ($hashRef);
465 { 496 print $s_uni_out $_;
466 $mirna = ( $counthashR->{$k} * 1000000) / $miRNA_number; 497 next;
467 printf $out "\t%.2f",$mirna; 498 }
468 } 499 my @line = split (/\t/,$_);
469 if ($bonafide_number != 0 ) 500 $sequence = $line[0];
470 { 501 if ( (! defined ($hashRef) )|| ( exists $hashRef->{$sequence} && $hashRef->{$sequence} == 1 ) )
471 $bonafide = ( $counthashR->{$k} * 1000000) / $bonafide_number; 502 {
472 printf $out "\t%.2f",$bonafide; 503 my $arn = $line[9];
473 } 504 if ($line[1] & 16)
474 print $out "\n"; 505 {
475 } 506 $arn =reverse($arn);
476 close $out; 507 $arn =~ tr/atgcuATGCU/tacgaTACGA/;
477 } 508 }
478 509
479 sub extract_sam 510 if ( ( $line[1] == 16 || $line[1] == 0 ) )
480 { 511 {
481 my ( $hashRef, $sam_in, $sam_out, $sam_uni_out, $fastq_out, $fastq_uni_out ) = @_; 512 print $f_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ;
482 513 print $s_out $_ if defined ($hashRef);
483 open my $s_in, '<', $sam_in || die "cannot open $sam_in file $!\n"; 514 if ( $line[11] eq "XT:A:U" )
484 515 {
485 open my $f_out, '>', $fastq_out || die "cannot create $fastq_out $!\n"; 516 print $f_uni_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ;
486 open my $f_uni_out, '>', $fastq_uni_out || die "cannot create $fastq_uni_out $!\n"; 517 print $s_uni_out $_ ;
487 518 }
488 open my $s_out, '>', $sam_out || die "cannot create $sam_out file $!\n" if defined ($hashRef); 519 }
489 open my $s_uni_out, '>', $sam_uni_out || die "cannot create $sam_uni_out file $!\n"; 520 }
490 521 }
491 my $sequence = ''; 522 close $s_in; close $s_out if defined ($hashRef);
492 while(<$s_in>) 523 close $s_uni_out; close $f_out; close $f_uni_out;
493 {
494 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
495 {
496 print $s_out $_ if defined ($hashRef);
497 print $s_uni_out $_;
498 next;
499 }
500 my @line = split (/\t/,$_);
501 $sequence = $line[0];
502 if ( (! defined ($hashRef) )|| ( exists $hashRef->{$sequence} && $hashRef->{$sequence} == 1 ) )
503 {
504 my $arn = $line[9];
505 if ($line[1] & 16)
506 {
507 $arn =reverse($arn);
508 $arn =~ tr/atgcuATGCU/tacgaTACGA/;
509 }
510
511 if ( ( $line[1] == 16 || $line[1] == 0 ) )
512 {
513 print $f_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ;
514 print $s_out $_ if defined ($hashRef);
515 if ( $line[11] eq "XT:A:U" )
516 {
517 print $f_uni_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ;
518 print $s_uni_out $_ ;
519 }
520 }
521 }
522 }
523 close $s_in; close $s_out if defined ($hashRef);
524 close $s_uni_out; close $f_out; close $f_uni_out;
525 } 524 }
526 525
527 sub get_fastq_seq 526 sub get_fastq_seq
528 { 527 {
529 my $fastq = shift; 528 my $fastq = shift;
530 my %hash; my $cmp = 0; 529 my %hash; my $cmp = 0;
531 530
532 open my $fic, '<', $fastq || die "cannot open input file $! \n"; 531 open my $fic, '<', $fastq || die "cannot open input file $! \n";
533 while(<$fic>) 532 while(<$fic>)
534 { 533 {
535 chomp $_; 534 chomp $_;
536 $cmp++; 535 $cmp++;
537 if ($cmp % 4 == 1) 536 if ($cmp % 4 == 1)
538 { 537 {
539 die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ ); 538 die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ );
540 if ($_ =~ /^\@(.*)\s.*/) { $hash{$1} = 1;} 539 if ($_ =~ /^\@(.*)\s.*/) { $hash{$1} = 1;}
541 elsif ($_ =~ /^\@(.*)/) { $hash{$1} = 1;} 540 elsif ($_ =~ /^\@(.*)/) { $hash{$1} = 1;}
542 } 541 }
543 elsif ($cmp % 4 == 3 ) 542 elsif ($cmp % 4 == 3 )
544 { 543 {
545 die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/; 544 die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/;
546 } 545 }
547 } 546 }
548 close $fic; 547 close $fic;
549 return \%hash; 548 return \%hash;
550 } 549 }
551 550
552 1; 551 1;