1
|
1 package align;
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5 use File::Basename;
|
|
6 use String::Random;
|
|
7
|
|
8 use FindBin;
|
|
9 use lib $FindBin::Bin;
|
|
10 use Rcall qw ( histogram );
|
|
11
|
|
12 use Exporter;
|
|
13 our @ISA = qw( Exporter );
|
20
|
14 our @EXPORT = qw( &rpms_rpkm_te &BWA_call &to_build &get_unique &sam_sorted_bam &get_hash_alignment &sam_to_bam_bg &sam_count &sam_count_mis &rpms_rpkm &get_fastq_seq &extract_sam );
|
1
|
15
|
|
16 sub to_build
|
|
17 {
|
|
18 my ( $toBuildTabP, $log, $newdir ) = @_;
|
|
19
|
|
20 foreach my $pairs ( @{ $toBuildTabP } )
|
|
21 {
|
|
22 if ( $pairs->[0] == 1 )
|
|
23 {
|
|
24 my $sym = $newdir.basename(${$pairs->[1]}).'_symlink.fa';
|
|
25 symlink( ${$pairs->[1]}, $sym );
|
|
26 ${$pairs->[1]} = $sym;
|
|
27 build_index ( ${$pairs->[1]}, $log );
|
|
28 }
|
|
29 }
|
|
30 }
|
|
31
|
|
32 sub build_index
|
|
33 {
|
|
34 my $to_index = shift;
|
|
35 my $log = shift;
|
|
36 my $index_log = $to_index.'_index.err';
|
|
37 `bwa index $to_index 2> $index_log`;
|
|
38 print $log "Creating index for $to_index\n";
|
|
39 }
|
|
40
|
|
41 sub get_unique
|
|
42 {
|
|
43 my ( $sam, $s_uni, $out_prefix, $col_prefix, $details, $report ) = @_;
|
|
44
|
|
45 my $fout = $col_prefix.'_all_mappers.fastq';
|
|
46 my $funi = $col_prefix.'_unique_mappers.fastq';
|
|
47 my $frej = $col_prefix.'_unmapped.fastq';
|
|
48
|
|
49 my $repartition = $out_prefix.'distribution.txt';
|
|
50 my $png_rep = $out_prefix.'distribution.png';
|
|
51 my ( %duplicates, %genome_hits) ;
|
|
52
|
|
53 #alignement to the first reference
|
|
54 my @return = sam_parse( $sam, $fout, $funi, $frej, $s_uni, \%duplicates, \%genome_hits, $report );
|
|
55 my $ref_fai = $return[4];
|
|
56 my $mappers = $return[5];
|
|
57 my $mappers_uni = $return[6];
|
|
58 my $size_mappedHashR = $return[7];
|
|
59
|
|
60 if ( $details == 1 )
|
|
61 {
|
|
62 #print number of duplicates and hits number
|
|
63 my ($pourcentage, $total) =(0,0);
|
|
64
|
|
65 $total += $_ foreach values %{$size_mappedHashR};
|
|
66 open (my $rep, '>'.$repartition) || die "cannot create $repartition $!\n";
|
|
67 print $rep "size\tnumber\tpercentage\n";
|
|
68 foreach my $k (sort{$a cmp $b} keys (%{$size_mappedHashR}))
|
|
69 {
|
|
70 $pourcentage = 0;
|
|
71 $pourcentage = $size_mappedHashR->{$k} / $total * 100 unless $total ==0;
|
|
72
|
|
73 print $rep "$k\t$size_mappedHashR->{$k}\t";
|
|
74 printf $rep "%.2f\n",$pourcentage;
|
|
75 }
|
|
76
|
|
77 histogram($size_mappedHashR, $png_rep, $total);
|
|
78
|
|
79
|
|
80 my $dup = $out_prefix.'dup_mapnum.txt';
|
|
81 my $dup_u = $out_prefix .'dup_unique.txt';
|
|
82 my $dup_r = $out_prefix .'dup_nonmapp.txt';
|
|
83 open(my $tab,">".$dup) || die "cannot open output txt file\n";
|
|
84 open(my $tab_r,">".$dup_r) || die "cannot open output txt file\n";
|
|
85 open(my $tab_u,">".$dup_u) || die "cannot open output txt file\n";
|
|
86 print $tab "sequence\tcount\tmapnum\n";
|
|
87 print $tab_u "sequence\tcount\n";
|
|
88 print $tab_r "sequence\tcount\n";
|
|
89 foreach my $k (sort {$duplicates{$b} <=> $duplicates{$a}}keys %duplicates)
|
|
90 {
|
|
91 $duplicates{$k} = 0 unless exists($duplicates{$k});
|
|
92 $genome_hits{$k} = 0 unless exists($genome_hits{$k});
|
|
93 if ($genome_hits{$k} != 0) { print $tab $k."\t".$duplicates{$k}."\t".$genome_hits{$k}."\n"; }
|
|
94 else {print $tab_r $k."\t".$duplicates{$k}."\n";}
|
|
95 if ($genome_hits{$k} == 1) { print $tab_u $k."\t".$duplicates{$k}."\n"; }
|
|
96 }
|
|
97 close $dup; close $dup_r; close $dup_u;
|
|
98 }
|
|
99 return ( $ref_fai, $mappers, $mappers_uni );
|
|
100 }
|
|
101
|
|
102 sub sam_parse
|
|
103 {
|
|
104 my ( $sam, $fastq_accepted, $fastq_accepted_unique, $fastq_rejected, $sam_unique, $duplicate_hashR, $best_hit_number_hashR, $report ) = @_ ;
|
|
105 my ($reads, $mappers, $mappersUnique, @garbage, %size_num, %size_num_spe, %number, %numberSens, %numberReverse, %unique_number, %numberNM, %numberM, %size);
|
|
106 $mappers = $mappersUnique = $reads = 0;
|
|
107
|
|
108 open my $fic, '<', $sam || die "cannot open $sam $!\n";
|
|
109 open my $accepted, '>', $fastq_accepted || die "cannot create $fastq_accepted $! \n";
|
|
110 open my $unique, '>', $fastq_accepted_unique || die "cannot create $fastq_accepted_unique $! \n";
|
|
111 open my $rejected, '>', $fastq_rejected || die "cannot create $fastq_rejected $! \n";
|
|
112 open my $sam_uni, '>', $sam_unique || die "cannot create $sam_unique $! \n";
|
|
113 my $sequence = '';
|
|
114 while(<$fic>)
|
|
115 {
|
|
116 chomp $_;
|
|
117 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
|
|
118 {
|
|
119 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
|
|
120 {
|
|
121 $size{$1} = $2;
|
|
122 $unique_number{$1} = 0;
|
|
123 $number{$1} = 0;
|
|
124 $numberNM{$1} = 0;
|
|
125 $numberM{$1} = 0;
|
|
126 }
|
|
127 print $sam_uni $_."\n";
|
|
128 next;
|
|
129 }
|
|
130 $reads++;
|
|
131 my @line = split (/\t/,$_);
|
|
132 $sequence = $line[9];
|
|
133 if ($line[1] & 16)
|
|
134 {
|
|
135 $sequence =reverse($sequence);
|
|
136 $sequence =~ tr/atgcuATGCU/tacgaTACGA/;
|
|
137 }
|
|
138 if ($line[1] == 16 || $line[1] == 0)
|
|
139 {
|
|
140 my $len = length($sequence);
|
|
141 $size_num{$len} ++;
|
|
142 $size_num_spe{$line[2]}{$len}++;
|
|
143 $mappers ++;
|
|
144
|
|
145 ${$best_hit_number_hashR}{$sequence} = $1 if ($line[13] =~ /X0:i:(\d*)/ || $line[14] =~/X0:i:(\d*)/ );
|
|
146 ${$duplicate_hashR}{$sequence}++;
|
|
147 $number{$line[2]}++;
|
|
148 $numberSens{$line[2]}++ if $line[1] == 0 ;
|
|
149 $numberReverse{$line[2]}++ if $line[1] == 16 ;
|
|
150 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
|
|
151
|
|
152 if ($line[11] eq "XT:A:U")
|
|
153 {
|
|
154 $unique_number{$line[2]}++;
|
|
155 $mappersUnique++;
|
|
156 print $unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
|
|
157 print $sam_uni $_."\n";
|
|
158 }
|
|
159 if ($_ =~ /.*XM:i:(\d+).*/)
|
|
160 {
|
|
161 if ($1 == 0){$numberNM{$line[2]}++;}else{$numberM{$line[2]}++;}
|
|
162 }
|
|
163 }
|
|
164 else
|
|
165 {
|
|
166 ${$best_hit_number_hashR}{$sequence} = 0;
|
|
167 ${$duplicate_hashR}{$sequence}++;
|
|
168 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
|
|
169 }
|
|
170 }
|
|
171 close $fic; close $accepted; close $unique; close $rejected; close $sam_uni;
|
|
172
|
|
173 print $report "Parsing $sam file\n";
|
|
174 print $report "\treads: $reads\n";
|
|
175 print $report "\tmappers: $mappers\n";
|
|
176 print $report "\tunique mappers: $mappersUnique\n";
|
|
177 print $report "-----------------------------\n";
|
|
178 return (\%number, \%unique_number, \%numberSens, \%numberReverse, \%size, $mappers, $mappersUnique, \%size_num, \%size_num_spe, \%numberNM, \%numberM );
|
|
179 }
|
|
180
|
|
181 sub get_hash_alignment
|
|
182 {
|
|
183 my ($index, $mismatches, $accept, $reject, $outA, $outR, $fastq, $number_of_cpus, $name, $sam, $report, $fai_f) = @_ ;
|
|
184 my ($reads, $mappers, $unmapped) = (0,0,0);
|
|
185 my $accep_unique;
|
|
186 BWA_call ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report );
|
|
187
|
|
188 open my $fic, '<', $sam || die "cannot open $sam $!\n";
|
|
189 open my $accepted, '>', $outA || die "cannot open $outA\n" if $accept == 1;
|
|
190 open my $rejected, '>', $outR || die "cannot open $outR\n" if $reject == 1;
|
|
191 open my $fai, '>', $fai_f || die "cannot open $fai_f\n" if $fai_f;
|
|
192 #if ($name eq "snRNAs") {
|
|
193 # open ( $accep_unique, ">".$1."-unique.fastq") if $outR =~ /(.*)\.fastq/;
|
|
194 #}
|
|
195 my $sequence = '';
|
|
196 while(<$fic>)
|
|
197 {
|
|
198 chomp $_;
|
|
199 if( $_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
|
|
200 {
|
|
201 if ($fai_f && $_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
|
|
202 {
|
|
203 print $fai $1."\t".$2."\n";
|
|
204 }
|
|
205 next;
|
|
206 }
|
|
207 $reads++;
|
|
208 my @line = split (/\t/,$_);
|
|
209 $sequence = $line[9];
|
|
210 if ($line[1] & 16)
|
|
211 {
|
|
212 $sequence =reverse($sequence);
|
|
213 $sequence =~ tr/atgcuATGCU/tacgaTACGA/;
|
|
214 }
|
|
215 if ($line[1] & 16 || $line[1] == 0)
|
|
216 {
|
|
217 $mappers ++;
|
|
218 if ($accept == 1 )
|
|
219 {
|
|
220 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n";
|
|
221 # print $accep_unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if ($name eq "snRNAs" && $line[11] eq "XT:A:U");
|
|
222 }
|
|
223 }
|
|
224 else
|
|
225 {
|
|
226 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if $reject == 1;
|
|
227 $unmapped++;
|
|
228 }
|
|
229 }
|
|
230 # close $accep_unique if ($name eq "bonafide_reads");
|
|
231 close $fic;
|
|
232 close $accepted if $accept == 1;
|
|
233 close $rejected if $reject ==1;
|
|
234 close $fai if $fai_f;
|
|
235 print $report "\treads: $reads\n";
|
|
236 print $report "\tmappers: $mappers\n";
|
|
237 print $report "\tunmapped: $unmapped\n";
|
|
238 print $report "-----------------------------\n";
|
|
239 return ($mappers, $unmapped);
|
|
240 }
|
|
241
|
|
242 sub sam_count
|
|
243 {
|
|
244 my $sam = shift;
|
|
245 my ( %number, %size );
|
|
246
|
|
247 open my $fic, '<', $sam || die "cannot open $sam file $!\n";
|
|
248 while(<$fic>)
|
|
249 {
|
|
250 chomp $_;
|
|
251 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
|
|
252 {
|
|
253 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
|
|
254 {
|
|
255 $size{$1} = $2;
|
|
256 $number{$1} = 0;
|
|
257 }
|
|
258 }
|
|
259 else
|
|
260 {
|
|
261 my @line = split (/\t/,$_);
|
|
262 if ( $line[1] & 16 || $line[1] == 0 )
|
|
263 {
|
|
264 $number{$line[2]}++;
|
|
265 }
|
|
266 }
|
|
267 }
|
|
268 close $fic;
|
|
269 return ( \%number, \%size );
|
|
270 }
|
|
271
|
|
272 sub sam_count_mis
|
|
273 {
|
|
274
|
|
275 my $sam = shift;
|
|
276 my ( %number, %numberNM, %numberM, %size);
|
|
277
|
|
278 open my $fic, '<', $sam || die "cannot open $sam file $!\n";
|
|
279 while(<$fic>)
|
|
280 {
|
|
281 chomp $_;
|
|
282 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
|
|
283 {
|
|
284 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/)
|
|
285 {
|
|
286 $size{$1} = $2;
|
20
|
287 $number{$1} = [0,0,0,0,0];
|
|
288 $numberNM{$1} = [0,0,0,0,0];
|
|
289 $numberM{$1} = [0,0,0,0,0];
|
1
|
290 }
|
|
291 }
|
|
292 else
|
|
293 {
|
|
294 my @line = split (/\t/,$_);
|
20
|
295 my @seq = split //, $line[9];
|
|
296 if ( $line[1] == 16 || $line[1] == 0 )
|
1
|
297 {
|
20
|
298 $number{ $line[2] }->[0]++;
|
|
299 if ($line[1] == 0)
|
|
300 {
|
|
301 $number{$line[2]}->[1]++;
|
|
302 $number{$line[2]}->[3]++ if $seq[0] eq 'T';
|
|
303 }
|
|
304 else
|
|
305 {
|
|
306 $number{$line[2]}->[2]++ if $seq[9] eq 'A';
|
|
307 }
|
1
|
308 if ($_ =~ /.*XM:i:(\d+).*/)
|
|
309 {
|
20
|
310 if ( $1 == 0 )
|
|
311 {
|
|
312 $numberNM{$line[2]}->[0]++;
|
|
313 if ($line[1] == 0)
|
|
314 {
|
|
315 $numberNM{$line[2]}->[1]++;
|
|
316 $numberNM{$line[2]}->[3]++ if $seq[0] eq 'T';
|
|
317 }
|
|
318 else
|
|
319 {
|
|
320 $numberNM{$line[2]}->[2]++;
|
|
321 $numberNM{$line[2]}->[4]++ if $seq[9] eq 'A';
|
|
322 }
|
|
323 }
|
|
324 else
|
|
325 {
|
|
326 $numberM{$line[2]}->[0]++;
|
|
327 if ($line[1] == 0)
|
|
328 {
|
|
329 $numberM{$line[2]}->[1]++;
|
|
330 $numberM{$line[2]}->[3]++ if $seq[0] eq 'T';
|
|
331 }
|
|
332 else
|
|
333 {
|
|
334 $numberM{$line[2]}->[2]++;
|
22
|
335 $numberM{$line[2]}->[4]++ if $seq[9] eq 'A';
|
20
|
336 }
|
|
337 }
|
1
|
338 }
|
|
339 }
|
|
340 }
|
|
341 }
|
|
342 return (\%number, \%size, \%numberNM, \%numberM );
|
|
343 }
|
|
344
|
20
|
345 sub rpms_rpkm_te
|
|
346 {
|
|
347 my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_;
|
|
348 open(my $out, ">".$out_file) || die "cannot open normalized file $! \n";
|
|
349 print $out "ID\treads counts\tRPKM\tsens reads counts\treverse reads counts\t% of sens 1U\t% of reverse 10A";
|
|
350 print $out "\tper million of piRNAs" if ($piRNA_number != 0);
|
|
351 print $out "\tper million of miRNAs" if ($miRNA_number != 0);
|
|
352 print $out "\tper million of bonafide reads" if ($bonafide_number != 0);
|
|
353 print $out "\n";
|
|
354 foreach my $k ( sort keys %{$counthashR} )
|
|
355 {
|
|
356 my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0);
|
|
357
|
|
358 $rpkm = ( $counthashR->{$k}->[0] * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ;
|
|
359 print $out $k."\t".$counthashR->{$k}->[0]."\t"; printf $out "%.2f",$rpkm;
|
|
360 print $out "\t".$counthashR->{$k}->[1]."\t".$counthashR->{$k}->[2] ;
|
|
361 my $U1 = 0;
|
|
362 $U1 = $counthashR->{$k}->[3] / $counthashR->{$k}->[1] * 100 if $counthashR->{$k}->[1] != 0;
|
|
363 my $A10 = 0;
|
|
364 $A10 = $counthashR->{$k}->[4] / $counthashR->{$k}->[2] * 100 if $counthashR->{$k}->[2] != 0;
|
|
365
|
|
366 print $out "\t".$U1."\t".$A10;
|
|
367
|
|
368 if ($piRNA_number != 0 )
|
|
369 {
|
|
370 $pirna = ( $counthashR->{$k}->[0] * 1000000) / $piRNA_number;
|
|
371 printf $out "\t%.2f",$pirna;
|
|
372 }
|
|
373 if ($miRNA_number != 0 )
|
|
374 {
|
|
375 $mirna = ( $counthashR->{$k}->[0] * 1000000) / $miRNA_number;
|
|
376 printf $out "\t%.2f",$mirna;
|
|
377 }
|
|
378 if ($bonafide_number != 0 )
|
|
379 {
|
|
380 $bonafide = ( $counthashR->{$k}->[0] * 1000000) / $bonafide_number;
|
|
381 printf $out "\t%.2f",$bonafide;
|
|
382 }
|
|
383 print $out "\n";
|
|
384 }
|
|
385 close $out;
|
|
386 }
|
|
387
|
|
388
|
1
|
389 sub sam_to_bam_bg
|
|
390 {
|
|
391 my ( $sam, $scale, $number_of_cpus ) = @_;
|
|
392 my ( $bam_sorted, $bedgraphM, $bedgraphP, $view_err, $sort_err ) = ( '', '', '', '', '' );
|
|
393 if ( $sam =~ /(.*?).sam$/ )
|
|
394 {
|
|
395 $bam_sorted = $1.'_sorted.bam';
|
|
396 $bedgraphP= $1.'_plus.bedgraph';
|
|
397 $bedgraphM = $1.'_minus.bedgraph';
|
|
398 $view_err = $1.'_view.err';
|
|
399 $sort_err = $1.'_sort.err';
|
|
400 }
|
|
401 `samtools view -Shb --threads $number_of_cpus $sam 2> $view_err | samtools sort -O BAM --threads $number_of_cpus /dev/stdin 2> $sort_err > $bam_sorted`;
|
|
402 `bedtools genomecov -scale $scale -strand + -bga -ibam $bam_sorted > $bedgraphP`;
|
|
403 `bedtools genomecov -scale $scale -strand - -bga -ibam $bam_sorted > $bedgraphM`;
|
|
404 }
|
|
405
|
|
406 sub sam_sorted_bam
|
|
407 {
|
|
408 my ( $sam, $number_of_cpus ) = @_;
|
|
409 my ( $bam_sorted, $view_err, $sort_err ) = ( '', '', '' );
|
|
410 if ( $sam =~ /(.*?).sam$/ )
|
|
411 {
|
|
412 $bam_sorted = $1.'_sorted.bam';
|
|
413 $view_err = $1.'_view.err';
|
|
414 $sort_err = $1.'_sort.err';
|
|
415
|
|
416 }
|
|
417 `samtools view -Shb --threads $number_of_cpus $sam 2> $view_err | samtools sort -O BAM --threads $number_of_cpus /dev/stdin 2> $sort_err > $bam_sorted`;
|
|
418 }
|
|
419
|
|
420 sub BWA_call
|
|
421 {
|
|
422 my ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ) = @_;
|
|
423 my ( $aln_err, $samse_err, $seq_num ) = ( $sam.'_aln.err', $sam.'_samse.err', 0 );
|
|
424 print $report "-----------------------------\n";
|
|
425 print $report "bwa aln -t $number_of_cpus -n $mismatches $index $fastq 2> $aln_err | bwa samse $index /dev/stdin $fastq 2> $samse_err > $sam\n";
|
|
426 `bwa aln -t $number_of_cpus -n $mismatches $index $fastq 2> $aln_err | bwa samse $index /dev/stdin $fastq 2> $samse_err > $sam `;
|
|
427 }
|
|
428
|
|
429 sub rpms_rpkm
|
|
430 {
|
|
431 my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_;
|
|
432 open(my $out, ">".$out_file) || die "cannot open normalized file $! \n";
|
|
433 print $out "ID\treads counts\tRPKM";
|
|
434 print $out "\tper million of piRNAs" if ($piRNA_number != 0);
|
|
435 print $out "\tper million of miRNAs" if ($miRNA_number != 0);
|
|
436 print $out "\tper million of bonafide reads" if ($bonafide_number != 0);
|
|
437 print $out "\n";
|
|
438 foreach my $k ( sort keys %{$counthashR} )
|
|
439 {
|
|
440 my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0);
|
|
441
|
|
442 $rpkm = ( $counthashR->{$k} * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ;
|
|
443 print $out $k."\t".$counthashR->{$k}."\t"; printf $out "%.2f",$rpkm;
|
|
444
|
|
445 if ($piRNA_number != 0 )
|
|
446 {
|
|
447 $pirna = ( $counthashR->{$k} * 1000000) / $piRNA_number;
|
|
448 printf $out "\t%.2f",$pirna;
|
|
449 }
|
|
450 if ($miRNA_number != 0 )
|
|
451 {
|
|
452 $mirna = ( $counthashR->{$k} * 1000000) / $miRNA_number;
|
|
453 printf $out "\t%.2f",$mirna;
|
|
454 }
|
|
455 if ($bonafide_number != 0 )
|
|
456 {
|
|
457 $bonafide = ( $counthashR->{$k} * 1000000) / $bonafide_number;
|
|
458 printf $out "\t%.2f",$bonafide;
|
|
459 }
|
|
460 print $out "\n";
|
|
461 }
|
|
462 close $out;
|
|
463 }
|
|
464
|
|
465 sub extract_sam
|
|
466 {
|
|
467 my ( $hashRef, $sam_in, $sam_out, $sam_uni_out, $fastq_out, $fastq_uni_out ) = @_;
|
|
468
|
|
469 open my $s_in, '<', $sam_in || die "cannot open $sam_in file $!\n";
|
|
470
|
|
471 open my $f_out, '>', $fastq_out || die "cannot create $fastq_out $!\n";
|
|
472 open my $f_uni_out, '>', $fastq_uni_out || die "cannot create $fastq_uni_out $!\n";
|
|
473
|
|
474 open my $s_out, '>', $sam_out || die "cannot create $sam_out file $!\n" if defined ($hashRef);
|
|
475 open my $s_uni_out, '>', $sam_uni_out || die "cannot create $sam_uni_out file $!\n";
|
|
476
|
|
477 my $sequence = '';
|
|
478 while(<$s_in>)
|
|
479 {
|
|
480 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
|
|
481 {
|
|
482 print $s_out $_ if defined ($hashRef);
|
|
483 print $s_uni_out $_;
|
|
484 next;
|
|
485 }
|
|
486 my @line = split (/\t/,$_);
|
|
487 $sequence = $line[0];
|
|
488 if ( (! defined ($hashRef) )|| ( exists $hashRef->{$sequence} && $hashRef->{$sequence} == 1 ) )
|
|
489 {
|
|
490 my $arn = $line[9];
|
|
491 if ($line[1] & 16)
|
|
492 {
|
|
493 $arn =reverse($arn);
|
|
494 $arn =~ tr/atgcuATGCU/tacgaTACGA/;
|
|
495 }
|
|
496
|
|
497 if ( ( $line[1] == 16 || $line[1] == 0 ) )
|
|
498 {
|
|
499 print $f_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ;
|
|
500 print $s_out $_ if defined ($hashRef);
|
|
501 if ( $line[11] eq "XT:A:U" )
|
|
502 {
|
|
503 print $f_uni_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ;
|
|
504 print $s_uni_out $_ ;
|
|
505 }
|
|
506 }
|
|
507 }
|
|
508 }
|
|
509 close $s_in; close $s_out if defined ($hashRef);
|
|
510 close $s_uni_out; close $f_out; close $f_uni_out;
|
|
511 }
|
|
512
|
|
513 sub get_fastq_seq
|
|
514 {
|
|
515 my $fastq = shift;
|
|
516 my %hash; my $cmp = 0;
|
|
517
|
|
518 open my $fic, '<', $fastq || die "cannot open input file $! \n";
|
|
519 while(<$fic>)
|
|
520 {
|
|
521 chomp $_;
|
|
522 $cmp++;
|
|
523 if ($cmp % 4 == 1)
|
|
524 {
|
|
525 die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ );
|
|
526 if ($_ =~ /^\@(.*)\s.*/) { $hash{$1} = 1;}
|
|
527 elsif ($_ =~ /^\@(.*)/) { $hash{$1} = 1;}
|
|
528 }
|
|
529 elsif ($cmp % 4 == 3 )
|
|
530 {
|
|
531 die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/;
|
|
532 }
|
|
533 }
|
|
534 close $fic;
|
|
535 return \%hash;
|
|
536 }
|
|
537
|
|
538 1;
|