Mercurial > repos > brasset_jensen > srnapipe
comparison bin/align.pm @ 1:1df6aaac800e draft
Deleted selected files
author | brasset_jensen |
---|---|
date | Wed, 13 Dec 2017 10:40:50 -0500 |
parents | |
children | 8ea13dab3435 |
comparison
equal
deleted
inserted
replaced
0:e4e71401c577 | 1:1df6aaac800e |
---|---|
1 package align; | |
2 | |
3 use strict; | |
4 use warnings; | |
5 use File::Basename; | |
6 use String::Random; | |
7 | |
8 use FindBin; | |
9 use lib $FindBin::Bin; | |
10 use Rcall qw ( histogram ); | |
11 | |
12 use Exporter; | |
13 our @ISA = qw( Exporter ); | |
14 our @EXPORT = qw( &BWA_call &to_build &get_unique &sam_sorted_bam &get_hash_alignment &sam_to_bam_bg &sam_count &sam_count_mis &rpms_rpkm &get_fastq_seq &extract_sam ); | |
15 | |
16 sub to_build | |
17 { | |
18 my ( $toBuildTabP, $log, $newdir ) = @_; | |
19 | |
20 foreach my $pairs ( @{ $toBuildTabP } ) | |
21 { | |
22 if ( $pairs->[0] == 1 ) | |
23 { | |
24 my $sym = $newdir.basename(${$pairs->[1]}).'_symlink.fa'; | |
25 symlink( ${$pairs->[1]}, $sym ); | |
26 ${$pairs->[1]} = $sym; | |
27 build_index ( ${$pairs->[1]}, $log ); | |
28 } | |
29 } | |
30 } | |
31 | |
32 sub build_index | |
33 { | |
34 my $to_index = shift; | |
35 my $log = shift; | |
36 my $index_log = $to_index.'_index.err'; | |
37 `bwa index $to_index 2> $index_log`; | |
38 print $log "Creating index for $to_index\n"; | |
39 } | |
40 | |
41 sub get_unique | |
42 { | |
43 my ( $sam, $s_uni, $out_prefix, $col_prefix, $details, $report ) = @_; | |
44 | |
45 my $fout = $col_prefix.'_all_mappers.fastq'; | |
46 my $funi = $col_prefix.'_unique_mappers.fastq'; | |
47 my $frej = $col_prefix.'_unmapped.fastq'; | |
48 | |
49 my $repartition = $out_prefix.'distribution.txt'; | |
50 my $png_rep = $out_prefix.'distribution.png'; | |
51 my ( %duplicates, %genome_hits) ; | |
52 | |
53 #alignement to the first reference | |
54 my @return = sam_parse( $sam, $fout, $funi, $frej, $s_uni, \%duplicates, \%genome_hits, $report ); | |
55 my $ref_fai = $return[4]; | |
56 my $mappers = $return[5]; | |
57 my $mappers_uni = $return[6]; | |
58 my $size_mappedHashR = $return[7]; | |
59 | |
60 if ( $details == 1 ) | |
61 { | |
62 #print number of duplicates and hits number | |
63 my ($pourcentage, $total) =(0,0); | |
64 | |
65 $total += $_ foreach values %{$size_mappedHashR}; | |
66 open (my $rep, '>'.$repartition) || die "cannot create $repartition $!\n"; | |
67 print $rep "size\tnumber\tpercentage\n"; | |
68 foreach my $k (sort{$a cmp $b} keys (%{$size_mappedHashR})) | |
69 { | |
70 $pourcentage = 0; | |
71 $pourcentage = $size_mappedHashR->{$k} / $total * 100 unless $total ==0; | |
72 | |
73 print $rep "$k\t$size_mappedHashR->{$k}\t"; | |
74 printf $rep "%.2f\n",$pourcentage; | |
75 } | |
76 | |
77 histogram($size_mappedHashR, $png_rep, $total); | |
78 | |
79 | |
80 my $dup = $out_prefix.'dup_mapnum.txt'; | |
81 my $dup_u = $out_prefix .'dup_unique.txt'; | |
82 my $dup_r = $out_prefix .'dup_nonmapp.txt'; | |
83 open(my $tab,">".$dup) || die "cannot open output txt file\n"; | |
84 open(my $tab_r,">".$dup_r) || die "cannot open output txt file\n"; | |
85 open(my $tab_u,">".$dup_u) || die "cannot open output txt file\n"; | |
86 print $tab "sequence\tcount\tmapnum\n"; | |
87 print $tab_u "sequence\tcount\n"; | |
88 print $tab_r "sequence\tcount\n"; | |
89 foreach my $k (sort {$duplicates{$b} <=> $duplicates{$a}}keys %duplicates) | |
90 { | |
91 $duplicates{$k} = 0 unless exists($duplicates{$k}); | |
92 $genome_hits{$k} = 0 unless exists($genome_hits{$k}); | |
93 if ($genome_hits{$k} != 0) { print $tab $k."\t".$duplicates{$k}."\t".$genome_hits{$k}."\n"; } | |
94 else {print $tab_r $k."\t".$duplicates{$k}."\n";} | |
95 if ($genome_hits{$k} == 1) { print $tab_u $k."\t".$duplicates{$k}."\n"; } | |
96 } | |
97 close $dup; close $dup_r; close $dup_u; | |
98 } | |
99 return ( $ref_fai, $mappers, $mappers_uni ); | |
100 } | |
101 | |
102 sub sam_parse | |
103 { | |
104 my ( $sam, $fastq_accepted, $fastq_accepted_unique, $fastq_rejected, $sam_unique, $duplicate_hashR, $best_hit_number_hashR, $report ) = @_ ; | |
105 my ($reads, $mappers, $mappersUnique, @garbage, %size_num, %size_num_spe, %number, %numberSens, %numberReverse, %unique_number, %numberNM, %numberM, %size); | |
106 $mappers = $mappersUnique = $reads = 0; | |
107 | |
108 open my $fic, '<', $sam || die "cannot open $sam $!\n"; | |
109 open my $accepted, '>', $fastq_accepted || die "cannot create $fastq_accepted $! \n"; | |
110 open my $unique, '>', $fastq_accepted_unique || die "cannot create $fastq_accepted_unique $! \n"; | |
111 open my $rejected, '>', $fastq_rejected || die "cannot create $fastq_rejected $! \n"; | |
112 open my $sam_uni, '>', $sam_unique || die "cannot create $sam_unique $! \n"; | |
113 my $sequence = ''; | |
114 while(<$fic>) | |
115 { | |
116 chomp $_; | |
117 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | |
118 { | |
119 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) | |
120 { | |
121 $size{$1} = $2; | |
122 $unique_number{$1} = 0; | |
123 $number{$1} = 0; | |
124 $numberNM{$1} = 0; | |
125 $numberM{$1} = 0; | |
126 } | |
127 print $sam_uni $_."\n"; | |
128 next; | |
129 } | |
130 $reads++; | |
131 my @line = split (/\t/,$_); | |
132 $sequence = $line[9]; | |
133 if ($line[1] & 16) | |
134 { | |
135 $sequence =reverse($sequence); | |
136 $sequence =~ tr/atgcuATGCU/tacgaTACGA/; | |
137 } | |
138 if ($line[1] == 16 || $line[1] == 0) | |
139 { | |
140 my $len = length($sequence); | |
141 $size_num{$len} ++; | |
142 $size_num_spe{$line[2]}{$len}++; | |
143 $mappers ++; | |
144 | |
145 ${$best_hit_number_hashR}{$sequence} = $1 if ($line[13] =~ /X0:i:(\d*)/ || $line[14] =~/X0:i:(\d*)/ ); | |
146 ${$duplicate_hashR}{$sequence}++; | |
147 $number{$line[2]}++; | |
148 $numberSens{$line[2]}++ if $line[1] == 0 ; | |
149 $numberReverse{$line[2]}++ if $line[1] == 16 ; | |
150 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; | |
151 | |
152 if ($line[11] eq "XT:A:U") | |
153 { | |
154 $unique_number{$line[2]}++; | |
155 $mappersUnique++; | |
156 print $unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; | |
157 print $sam_uni $_."\n"; | |
158 } | |
159 if ($_ =~ /.*XM:i:(\d+).*/) | |
160 { | |
161 if ($1 == 0){$numberNM{$line[2]}++;}else{$numberM{$line[2]}++;} | |
162 } | |
163 } | |
164 else | |
165 { | |
166 ${$best_hit_number_hashR}{$sequence} = 0; | |
167 ${$duplicate_hashR}{$sequence}++; | |
168 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; | |
169 } | |
170 } | |
171 close $fic; close $accepted; close $unique; close $rejected; close $sam_uni; | |
172 | |
173 print $report "Parsing $sam file\n"; | |
174 print $report "\treads: $reads\n"; | |
175 print $report "\tmappers: $mappers\n"; | |
176 print $report "\tunique mappers: $mappersUnique\n"; | |
177 print $report "-----------------------------\n"; | |
178 return (\%number, \%unique_number, \%numberSens, \%numberReverse, \%size, $mappers, $mappersUnique, \%size_num, \%size_num_spe, \%numberNM, \%numberM ); | |
179 } | |
180 | |
181 sub get_hash_alignment | |
182 { | |
183 my ($index, $mismatches, $accept, $reject, $outA, $outR, $fastq, $number_of_cpus, $name, $sam, $report, $fai_f) = @_ ; | |
184 my ($reads, $mappers, $unmapped) = (0,0,0); | |
185 my $accep_unique; | |
186 BWA_call ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ); | |
187 | |
188 open my $fic, '<', $sam || die "cannot open $sam $!\n"; | |
189 open my $accepted, '>', $outA || die "cannot open $outA\n" if $accept == 1; | |
190 open my $rejected, '>', $outR || die "cannot open $outR\n" if $reject == 1; | |
191 open my $fai, '>', $fai_f || die "cannot open $fai_f\n" if $fai_f; | |
192 #if ($name eq "snRNAs") { | |
193 # open ( $accep_unique, ">".$1."-unique.fastq") if $outR =~ /(.*)\.fastq/; | |
194 #} | |
195 my $sequence = ''; | |
196 while(<$fic>) | |
197 { | |
198 chomp $_; | |
199 if( $_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | |
200 { | |
201 if ($fai_f && $_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) | |
202 { | |
203 print $fai $1."\t".$2."\n"; | |
204 } | |
205 next; | |
206 } | |
207 $reads++; | |
208 my @line = split (/\t/,$_); | |
209 $sequence = $line[9]; | |
210 if ($line[1] & 16) | |
211 { | |
212 $sequence =reverse($sequence); | |
213 $sequence =~ tr/atgcuATGCU/tacgaTACGA/; | |
214 } | |
215 if ($line[1] & 16 || $line[1] == 0) | |
216 { | |
217 $mappers ++; | |
218 if ($accept == 1 ) | |
219 { | |
220 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; | |
221 # print $accep_unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if ($name eq "snRNAs" && $line[11] eq "XT:A:U"); | |
222 } | |
223 } | |
224 else | |
225 { | |
226 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if $reject == 1; | |
227 $unmapped++; | |
228 } | |
229 } | |
230 # close $accep_unique if ($name eq "bonafide_reads"); | |
231 close $fic; | |
232 close $accepted if $accept == 1; | |
233 close $rejected if $reject ==1; | |
234 close $fai if $fai_f; | |
235 print $report "\treads: $reads\n"; | |
236 print $report "\tmappers: $mappers\n"; | |
237 print $report "\tunmapped: $unmapped\n"; | |
238 print $report "-----------------------------\n"; | |
239 return ($mappers, $unmapped); | |
240 } | |
241 | |
242 sub sam_count | |
243 { | |
244 my $sam = shift; | |
245 my ( %number, %size ); | |
246 | |
247 open my $fic, '<', $sam || die "cannot open $sam file $!\n"; | |
248 while(<$fic>) | |
249 { | |
250 chomp $_; | |
251 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | |
252 { | |
253 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) | |
254 { | |
255 $size{$1} = $2; | |
256 $number{$1} = 0; | |
257 } | |
258 } | |
259 else | |
260 { | |
261 my @line = split (/\t/,$_); | |
262 if ( $line[1] & 16 || $line[1] == 0 ) | |
263 { | |
264 $number{$line[2]}++; | |
265 } | |
266 } | |
267 } | |
268 close $fic; | |
269 return ( \%number, \%size ); | |
270 } | |
271 | |
272 sub sam_count_mis | |
273 { | |
274 | |
275 my $sam = shift; | |
276 my ( %number, %numberNM, %numberM, %size); | |
277 | |
278 open my $fic, '<', $sam || die "cannot open $sam file $!\n"; | |
279 while(<$fic>) | |
280 { | |
281 chomp $_; | |
282 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | |
283 { | |
284 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) | |
285 { | |
286 $size{$1} = $2; | |
287 $number{$1} = 0; | |
288 $numberNM{$1} = 0; | |
289 $numberM{$1} = 0; | |
290 } | |
291 } | |
292 else | |
293 { | |
294 my @line = split (/\t/,$_); | |
295 if ( $line[1] & 16 || $line[1] == 0 ) | |
296 { | |
297 $number{ $line[2] }++; | |
298 if ($_ =~ /.*XM:i:(\d+).*/) | |
299 { | |
300 if ( $1 == 0 ){ $numberNM{$line[2]}++; } else { $numberM{$line[2]}++; } | |
301 } | |
302 } | |
303 } | |
304 } | |
305 return (\%number, \%size, \%numberNM, \%numberM ); | |
306 } | |
307 | |
308 sub sam_to_bam_bg | |
309 { | |
310 my ( $sam, $scale, $number_of_cpus ) = @_; | |
311 my ( $bam_sorted, $bedgraphM, $bedgraphP, $view_err, $sort_err ) = ( '', '', '', '', '' ); | |
312 if ( $sam =~ /(.*?).sam$/ ) | |
313 { | |
314 $bam_sorted = $1.'_sorted.bam'; | |
315 $bedgraphP= $1.'_plus.bedgraph'; | |
316 $bedgraphM = $1.'_minus.bedgraph'; | |
317 $view_err = $1.'_view.err'; | |
318 $sort_err = $1.'_sort.err'; | |
319 } | |
320 `samtools view -Shb --threads $number_of_cpus $sam 2> $view_err | samtools sort -O BAM --threads $number_of_cpus /dev/stdin 2> $sort_err > $bam_sorted`; | |
321 `bedtools genomecov -scale $scale -strand + -bga -ibam $bam_sorted > $bedgraphP`; | |
322 `bedtools genomecov -scale $scale -strand - -bga -ibam $bam_sorted > $bedgraphM`; | |
323 } | |
324 | |
325 sub sam_sorted_bam | |
326 { | |
327 my ( $sam, $number_of_cpus ) = @_; | |
328 my ( $bam_sorted, $view_err, $sort_err ) = ( '', '', '' ); | |
329 if ( $sam =~ /(.*?).sam$/ ) | |
330 { | |
331 $bam_sorted = $1.'_sorted.bam'; | |
332 $view_err = $1.'_view.err'; | |
333 $sort_err = $1.'_sort.err'; | |
334 | |
335 } | |
336 `samtools view -Shb --threads $number_of_cpus $sam 2> $view_err | samtools sort -O BAM --threads $number_of_cpus /dev/stdin 2> $sort_err > $bam_sorted`; | |
337 } | |
338 | |
339 sub BWA_call | |
340 { | |
341 my ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ) = @_; | |
342 my ( $aln_err, $samse_err, $seq_num ) = ( $sam.'_aln.err', $sam.'_samse.err', 0 ); | |
343 print $report "-----------------------------\n"; | |
344 print $report "bwa aln -t $number_of_cpus -n $mismatches $index $fastq 2> $aln_err | bwa samse $index /dev/stdin $fastq 2> $samse_err > $sam\n"; | |
345 `bwa aln -t $number_of_cpus -n $mismatches $index $fastq 2> $aln_err | bwa samse $index /dev/stdin $fastq 2> $samse_err > $sam `; | |
346 } | |
347 | |
348 sub rpms_rpkm | |
349 { | |
350 my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_; | |
351 open(my $out, ">".$out_file) || die "cannot open normalized file $! \n"; | |
352 print $out "ID\treads counts\tRPKM"; | |
353 print $out "\tper million of piRNAs" if ($piRNA_number != 0); | |
354 print $out "\tper million of miRNAs" if ($miRNA_number != 0); | |
355 print $out "\tper million of bonafide reads" if ($bonafide_number != 0); | |
356 print $out "\n"; | |
357 foreach my $k ( sort keys %{$counthashR} ) | |
358 { | |
359 my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0); | |
360 | |
361 $rpkm = ( $counthashR->{$k} * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ; | |
362 print $out $k."\t".$counthashR->{$k}."\t"; printf $out "%.2f",$rpkm; | |
363 | |
364 if ($piRNA_number != 0 ) | |
365 { | |
366 $pirna = ( $counthashR->{$k} * 1000000) / $piRNA_number; | |
367 printf $out "\t%.2f",$pirna; | |
368 } | |
369 if ($miRNA_number != 0 ) | |
370 { | |
371 $mirna = ( $counthashR->{$k} * 1000000) / $miRNA_number; | |
372 printf $out "\t%.2f",$mirna; | |
373 } | |
374 if ($bonafide_number != 0 ) | |
375 { | |
376 $bonafide = ( $counthashR->{$k} * 1000000) / $bonafide_number; | |
377 printf $out "\t%.2f",$bonafide; | |
378 } | |
379 print $out "\n"; | |
380 } | |
381 close $out; | |
382 } | |
383 | |
384 sub extract_sam | |
385 { | |
386 my ( $hashRef, $sam_in, $sam_out, $sam_uni_out, $fastq_out, $fastq_uni_out ) = @_; | |
387 | |
388 open my $s_in, '<', $sam_in || die "cannot open $sam_in file $!\n"; | |
389 | |
390 open my $f_out, '>', $fastq_out || die "cannot create $fastq_out $!\n"; | |
391 open my $f_uni_out, '>', $fastq_uni_out || die "cannot create $fastq_uni_out $!\n"; | |
392 | |
393 open my $s_out, '>', $sam_out || die "cannot create $sam_out file $!\n" if defined ($hashRef); | |
394 open my $s_uni_out, '>', $sam_uni_out || die "cannot create $sam_uni_out file $!\n"; | |
395 | |
396 my $sequence = ''; | |
397 while(<$s_in>) | |
398 { | |
399 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | |
400 { | |
401 print $s_out $_ if defined ($hashRef); | |
402 print $s_uni_out $_; | |
403 next; | |
404 } | |
405 my @line = split (/\t/,$_); | |
406 $sequence = $line[0]; | |
407 if ( (! defined ($hashRef) )|| ( exists $hashRef->{$sequence} && $hashRef->{$sequence} == 1 ) ) | |
408 { | |
409 my $arn = $line[9]; | |
410 if ($line[1] & 16) | |
411 { | |
412 $arn =reverse($arn); | |
413 $arn =~ tr/atgcuATGCU/tacgaTACGA/; | |
414 } | |
415 | |
416 if ( ( $line[1] == 16 || $line[1] == 0 ) ) | |
417 { | |
418 print $f_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ; | |
419 print $s_out $_ if defined ($hashRef); | |
420 if ( $line[11] eq "XT:A:U" ) | |
421 { | |
422 print $f_uni_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ; | |
423 print $s_uni_out $_ ; | |
424 } | |
425 } | |
426 } | |
427 } | |
428 close $s_in; close $s_out if defined ($hashRef); | |
429 close $s_uni_out; close $f_out; close $f_uni_out; | |
430 } | |
431 | |
432 sub get_fastq_seq | |
433 { | |
434 my $fastq = shift; | |
435 my %hash; my $cmp = 0; | |
436 | |
437 open my $fic, '<', $fastq || die "cannot open input file $! \n"; | |
438 while(<$fic>) | |
439 { | |
440 chomp $_; | |
441 $cmp++; | |
442 if ($cmp % 4 == 1) | |
443 { | |
444 die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ ); | |
445 if ($_ =~ /^\@(.*)\s.*/) { $hash{$1} = 1;} | |
446 elsif ($_ =~ /^\@(.*)/) { $hash{$1} = 1;} | |
447 } | |
448 elsif ($cmp % 4 == 3 ) | |
449 { | |
450 die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/; | |
451 } | |
452 } | |
453 close $fic; | |
454 return \%hash; | |
455 } | |
456 | |
457 1; |