Mercurial > repos > brasset_jensen > srnapipe
comparison bin/align.pm @ 61:9185ca0a7b43 draft
Updated package according to recommendations.
author | pierre.pouchin |
---|---|
date | Wed, 16 Jan 2019 08:18:13 -0500 |
parents | 9645d995fb3c |
children |
comparison
equal
deleted
inserted
replaced
60:9645d995fb3c | 61:9185ca0a7b43 |
---|---|
8 use FindBin; | 8 use FindBin; |
9 use lib $FindBin::Bin; | 9 use lib $FindBin::Bin; |
10 use Rcall qw ( histogram ); | 10 use Rcall qw ( histogram ); |
11 | 11 |
12 use Exporter; | 12 use Exporter; |
13 our @ISA = qw( Exporter ); | 13 our @ISA = qw( Exporter ); |
14 our @EXPORT = qw( &rpms_rpkm_te &BWA_call &to_build &get_unique &sam_sorted_bam &get_hash_alignment &sam_to_bam_bg &sam_count &sam_count_mis &rpms_rpkm &get_fastq_seq &extract_sam ); | 14 our @EXPORT = qw( &rpms_rpkm_te &BWA_call &to_build &get_unique &sam_sorted_bam &get_hash_alignment &sam_to_bam_bg &sam_count &sam_count_mis &rpms_rpkm &get_fastq_seq &extract_sam ); |
15 | 15 |
16 sub to_build | 16 sub to_build |
17 { | 17 { |
18 my ( $toBuildTabP, $log, $newdir ) = @_; | 18 my ( $toBuildTabP, $log, $newdir ) = @_; |
19 | 19 |
20 foreach my $pairs ( @{ $toBuildTabP } ) | 20 foreach my $pairs ( @{ $toBuildTabP } ) |
21 { | 21 { |
22 if ( $pairs->[0] == 1 ) | 22 if ( $pairs->[0] == 1 ) |
23 { | 23 { |
24 my $sym = $newdir.basename(${$pairs->[1]}).'_symlink.fa'; | 24 my $sym = $newdir.basename(${$pairs->[1]}).'_symlink.fa'; |
25 symlink( ${$pairs->[1]}, $sym ); | 25 symlink( ${$pairs->[1]}, $sym ); |
26 ${$pairs->[1]} = $sym; | 26 ${$pairs->[1]} = $sym; |
27 build_index ( ${$pairs->[1]}, $log ); | 27 build_index ( ${$pairs->[1]}, $log ); |
28 } | 28 } |
29 } | 29 } |
30 } | 30 } |
31 | 31 |
32 sub build_index | 32 sub build_index |
33 { | 33 { |
34 my $to_index = shift; | 34 my $to_index = shift; |
35 my $log = shift; | 35 my $log = shift; |
36 my $index_log = $to_index.'_index.err'; | 36 my $index_log = $to_index.'_index.err'; |
37 `bwa index '$to_index' 2> '$index_log'`; | 37 `bwa index '$to_index' 2> '$index_log'`; |
38 print $log "Creating index for $to_index\n"; | 38 print $log "Creating index for $to_index\n"; |
39 } | 39 } |
40 | 40 |
41 sub get_unique | 41 sub get_unique |
42 { | 42 { |
43 my ( $sam, $s_uni, $out_prefix, $col_prefix, $details, $report ) = @_; | 43 my ( $sam, $s_uni, $out_prefix, $col_prefix, $details, $report ) = @_; |
44 | 44 |
45 my $fout = $col_prefix.'_all_mappers.fastq'; | 45 my $fout = $col_prefix.'_all_mappers.fastq'; |
46 my $funi = $col_prefix.'_unique_mappers.fastq'; | 46 my $funi = $col_prefix.'_unique_mappers.fastq'; |
47 my $frej = $col_prefix.'_unmapped.fastq'; | 47 my $frej = $col_prefix.'_unmapped.fastq'; |
48 | 48 |
49 my $repartition = $out_prefix.'distribution.txt'; | 49 my $repartition = $out_prefix.'distribution.txt'; |
50 my $png_rep = $out_prefix.'distribution.png'; | 50 my $png_rep = $out_prefix.'distribution.png'; |
51 my ( %duplicates, %genome_hits) ; | 51 my ( %duplicates, %genome_hits) ; |
52 | 52 |
53 #alignement to the first reference | 53 #alignement to the first reference |
54 my @return = sam_parse( $sam, $fout, $funi, $frej, $s_uni, \%duplicates, \%genome_hits, $report ); | 54 my @return = sam_parse( $sam, $fout, $funi, $frej, $s_uni, \%duplicates, \%genome_hits, $report ); |
55 my $ref_fai = $return[4]; | 55 my $ref_fai = $return[4]; |
56 my $mappers = $return[5]; | 56 my $mappers = $return[5]; |
57 my $mappers_uni = $return[6]; | 57 my $mappers_uni = $return[6]; |
58 my $size_mappedHashR = $return[7]; | 58 my $size_mappedHashR = $return[7]; |
59 | 59 |
60 if ( $details == 1 ) | 60 if ( $details == 1 ) |
61 { | 61 { |
62 #print number of duplicates and hits number | 62 #print number of duplicates and hits number |
63 my ($pourcentage, $total) =(0,0); | 63 my ($pourcentage, $total) =(0,0); |
64 | 64 |
65 $total += $_ foreach values %{$size_mappedHashR}; | 65 $total += $_ foreach values %{$size_mappedHashR}; |
66 open (my $rep, '>'.$repartition) || die "cannot create $repartition $!\n"; | 66 open (my $rep, '>'.$repartition) || die "cannot create $repartition $!\n"; |
67 print $rep "size\tnumber\tpercentage\n"; | 67 print $rep "size\tnumber\tpercentage\n"; |
68 foreach my $k (sort{$a cmp $b} keys (%{$size_mappedHashR})) | 68 foreach my $k (sort{$a cmp $b} keys (%{$size_mappedHashR})) |
69 { | 69 { |
70 $pourcentage = 0; | 70 $pourcentage = 0; |
71 $pourcentage = $size_mappedHashR->{$k} / $total * 100 unless $total ==0; | 71 $pourcentage = $size_mappedHashR->{$k} / $total * 100 unless $total ==0; |
72 | 72 |
73 print $rep "$k\t$size_mappedHashR->{$k}\t"; | 73 print $rep "$k\t$size_mappedHashR->{$k}\t"; |
74 printf $rep "%.2f\n",$pourcentage; | 74 printf $rep "%.2f\n",$pourcentage; |
75 } | 75 } |
76 | 76 |
77 histogram($size_mappedHashR, $png_rep, $total); | 77 histogram($size_mappedHashR, $png_rep, $total); |
78 | 78 |
79 | 79 |
80 my $dup = $out_prefix.'dup_mapnum.txt'; | 80 my $dup = $out_prefix.'dup_mapnum.txt'; |
81 my $dup_u = $out_prefix .'dup_unique.txt'; | 81 my $dup_u = $out_prefix .'dup_unique.txt'; |
82 my $dup_r = $out_prefix .'dup_nonmapp.txt'; | 82 my $dup_r = $out_prefix .'dup_nonmapp.txt'; |
83 open(my $tab,">".$dup) || die "cannot open output txt file\n"; | 83 open(my $tab,">".$dup) || die "cannot open output txt file\n"; |
84 open(my $tab_r,">".$dup_r) || die "cannot open output txt file\n"; | 84 open(my $tab_r,">".$dup_r) || die "cannot open output txt file\n"; |
85 open(my $tab_u,">".$dup_u) || die "cannot open output txt file\n"; | 85 open(my $tab_u,">".$dup_u) || die "cannot open output txt file\n"; |
86 print $tab "sequence\tcount\tmapnum\n"; | 86 print $tab "sequence\tcount\tmapnum\n"; |
87 print $tab_u "sequence\tcount\n"; | 87 print $tab_u "sequence\tcount\n"; |
88 print $tab_r "sequence\tcount\n"; | 88 print $tab_r "sequence\tcount\n"; |
89 foreach my $k (sort {$duplicates{$b} <=> $duplicates{$a}}keys %duplicates) | 89 foreach my $k (sort {$duplicates{$b} <=> $duplicates{$a}}keys %duplicates) |
90 { | 90 { |
91 $duplicates{$k} = 0 unless exists($duplicates{$k}); | 91 $duplicates{$k} = 0 unless exists($duplicates{$k}); |
92 $genome_hits{$k} = 0 unless exists($genome_hits{$k}); | 92 $genome_hits{$k} = 0 unless exists($genome_hits{$k}); |
93 if ($genome_hits{$k} != 0) { print $tab $k."\t".$duplicates{$k}."\t".$genome_hits{$k}."\n"; } | 93 if ($genome_hits{$k} != 0) { print $tab $k."\t".$duplicates{$k}."\t".$genome_hits{$k}."\n"; } |
94 else {print $tab_r $k."\t".$duplicates{$k}."\n";} | 94 else {print $tab_r $k."\t".$duplicates{$k}."\n";} |
95 if ($genome_hits{$k} == 1) { print $tab_u $k."\t".$duplicates{$k}."\n"; } | 95 if ($genome_hits{$k} == 1) { print $tab_u $k."\t".$duplicates{$k}."\n"; } |
96 } | 96 } |
97 close $dup; close $dup_r; close $dup_u; | 97 close $dup; close $dup_r; close $dup_u; |
98 } | 98 } |
99 return ( $ref_fai, $mappers, $mappers_uni ); | 99 return ( $ref_fai, $mappers, $mappers_uni ); |
100 } | 100 } |
101 | 101 |
102 sub sam_parse | 102 sub sam_parse |
103 { | 103 { |
104 my ( $sam, $fastq_accepted, $fastq_accepted_unique, $fastq_rejected, $sam_unique, $duplicate_hashR, $best_hit_number_hashR, $report ) = @_ ; | 104 my ( $sam, $fastq_accepted, $fastq_accepted_unique, $fastq_rejected, $sam_unique, $duplicate_hashR, $best_hit_number_hashR, $report ) = @_ ; |
105 my ($reads, $mappers, $mappersUnique, @garbage, %size_num, %size_num_spe, %number, %numberSens, %numberReverse, %unique_number, %numberNM, %numberM, %size); | 105 my ($reads, $mappers, $mappersUnique, @garbage, %size_num, %size_num_spe, %number, %numberSens, %numberReverse, %unique_number, %numberNM, %numberM, %size); |
106 $mappers = $mappersUnique = $reads = 0; | 106 $mappers = $mappersUnique = $reads = 0; |
107 | 107 |
108 open my $fic, '<', $sam || die "cannot open $sam $!\n"; | 108 open my $fic, '<', $sam || die "cannot open $sam $!\n"; |
109 open my $accepted, '>', $fastq_accepted || die "cannot create $fastq_accepted $! \n"; | 109 open my $accepted, '>', $fastq_accepted || die "cannot create $fastq_accepted $! \n"; |
110 open my $unique, '>', $fastq_accepted_unique || die "cannot create $fastq_accepted_unique $! \n"; | 110 open my $unique, '>', $fastq_accepted_unique || die "cannot create $fastq_accepted_unique $! \n"; |
111 open my $rejected, '>', $fastq_rejected || die "cannot create $fastq_rejected $! \n"; | 111 open my $rejected, '>', $fastq_rejected || die "cannot create $fastq_rejected $! \n"; |
112 open my $sam_uni, '>', $sam_unique || die "cannot create $sam_unique $! \n"; | 112 open my $sam_uni, '>', $sam_unique || die "cannot create $sam_unique $! \n"; |
113 my $sequence = ''; | 113 my $sequence = ''; |
114 while(<$fic>) | 114 while(<$fic>) |
115 { | 115 { |
116 chomp $_; | 116 chomp $_; |
117 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | 117 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) |
118 { | 118 { |
119 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) | 119 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) |
120 { | 120 { |
121 $size{$1} = $2; | 121 $size{$1} = $2; |
122 $unique_number{$1} = 0; | 122 $unique_number{$1} = 0; |
123 $number{$1} = 0; | 123 $number{$1} = 0; |
124 $numberNM{$1} = 0; | 124 $numberNM{$1} = 0; |
125 $numberM{$1} = 0; | 125 $numberM{$1} = 0; |
126 } | 126 } |
127 print $sam_uni $_."\n"; | 127 print $sam_uni $_."\n"; |
128 next; | 128 next; |
129 } | 129 } |
130 $reads++; | 130 $reads++; |
131 my @line = split (/\t/,$_); | 131 my @line = split (/\t/,$_); |
132 $sequence = $line[9]; | 132 $sequence = $line[9]; |
133 if ($line[1] & 16) | 133 if ($line[1] & 16) |
134 { | 134 { |
135 $sequence =reverse($sequence); | 135 $sequence =reverse($sequence); |
136 $sequence =~ tr/atgcuATGCU/tacgaTACGA/; | 136 $sequence =~ tr/atgcuATGCU/tacgaTACGA/; |
137 } | 137 } |
138 if ($line[1] == 16 || $line[1] == 0) | 138 if ($line[1] == 16 || $line[1] == 0) |
139 { | 139 { |
140 my $len = length($sequence); | 140 my $len = length($sequence); |
141 $size_num{$len} ++; | 141 $size_num{$len} ++; |
142 $size_num_spe{$line[2]}{$len}++; | 142 $size_num_spe{$line[2]}{$len}++; |
143 $mappers ++; | 143 $mappers ++; |
144 | 144 |
145 ${$best_hit_number_hashR}{$sequence} = $1 if ($line[13] =~ /X0:i:(\d*)/ || $line[14] =~/X0:i:(\d*)/ ); | 145 ${$best_hit_number_hashR}{$sequence} = $1 if ($line[13] =~ /X0:i:(\d*)/ || $line[14] =~/X0:i:(\d*)/ ); |
146 ${$duplicate_hashR}{$sequence}++; | 146 ${$duplicate_hashR}{$sequence}++; |
147 $number{$line[2]}++; | 147 $number{$line[2]}++; |
148 $numberSens{$line[2]}++ if $line[1] == 0 ; | 148 $numberSens{$line[2]}++ if $line[1] == 0 ; |
149 $numberReverse{$line[2]}++ if $line[1] == 16 ; | 149 $numberReverse{$line[2]}++ if $line[1] == 16 ; |
150 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; | 150 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; |
151 | 151 |
152 if ($line[11] eq "XT:A:U") | 152 if ($line[11] eq "XT:A:U") |
153 { | 153 { |
154 $unique_number{$line[2]}++; | 154 $unique_number{$line[2]}++; |
155 $mappersUnique++; | 155 $mappersUnique++; |
156 print $unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; | 156 print $unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; |
157 print $sam_uni $_."\n"; | 157 print $sam_uni $_."\n"; |
158 } | 158 } |
159 if ($_ =~ /.*XM:i:(\d+).*/) | 159 if ($_ =~ /.*XM:i:(\d+).*/) |
160 { | 160 { |
161 if ($1 == 0){$numberNM{$line[2]}++;}else{$numberM{$line[2]}++;} | 161 if ($1 == 0){$numberNM{$line[2]}++;}else{$numberM{$line[2]}++;} |
162 } | 162 } |
163 } | 163 } |
164 else | 164 else |
165 { | 165 { |
166 ${$best_hit_number_hashR}{$sequence} = 0; | 166 ${$best_hit_number_hashR}{$sequence} = 0; |
167 ${$duplicate_hashR}{$sequence}++; | 167 ${$duplicate_hashR}{$sequence}++; |
168 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; | 168 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; |
169 } | 169 } |
170 } | 170 } |
171 close $fic; close $accepted; close $unique; close $rejected; close $sam_uni; | 171 close $fic; close $accepted; close $unique; close $rejected; close $sam_uni; |
172 | 172 |
173 print $report "Parsing $sam file\n"; | 173 print $report "Parsing $sam file\n"; |
174 print $report "\treads: $reads\n"; | 174 print $report "\treads: $reads\n"; |
175 print $report "\tmappers: $mappers\n"; | 175 print $report "\tmappers: $mappers\n"; |
176 print $report "\tunique mappers: $mappersUnique\n"; | 176 print $report "\tunique mappers: $mappersUnique\n"; |
177 print $report "-----------------------------\n"; | 177 print $report "-----------------------------\n"; |
178 return (\%number, \%unique_number, \%numberSens, \%numberReverse, \%size, $mappers, $mappersUnique, \%size_num, \%size_num_spe, \%numberNM, \%numberM ); | 178 return (\%number, \%unique_number, \%numberSens, \%numberReverse, \%size, $mappers, $mappersUnique, \%size_num, \%size_num_spe, \%numberNM, \%numberM ); |
179 } | 179 } |
180 | 180 |
181 sub get_hash_alignment | 181 sub get_hash_alignment |
182 { | 182 { |
183 my ($index, $mismatches, $accept, $reject, $outA, $outR, $fastq, $number_of_cpus, $name, $sam, $report, $fai_f) = @_ ; | 183 my ($index, $mismatches, $accept, $reject, $outA, $outR, $fastq, $number_of_cpus, $name, $sam, $report, $fai_f) = @_ ; |
184 my ($reads, $mappers, $unmapped) = (0,0,0); | 184 my ($reads, $mappers, $unmapped) = (0,0,0); |
185 my $accep_unique; | 185 my $accep_unique; |
186 BWA_call ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ); | 186 BWA_call ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ); |
187 | 187 |
188 open my $fic, '<', $sam || die "cannot open $sam $!\n"; | 188 open my $fic, '<', $sam || die "cannot open $sam $!\n"; |
189 open my $accepted, '>', $outA || die "cannot open $outA\n" if $accept == 1; | 189 open my $accepted, '>', $outA || die "cannot open $outA\n" if $accept == 1; |
190 open my $rejected, '>', $outR || die "cannot open $outR\n" if $reject == 1; | 190 open my $rejected, '>', $outR || die "cannot open $outR\n" if $reject == 1; |
191 open my $fai, '>', $fai_f || die "cannot open $fai_f\n" if $fai_f; | 191 open my $fai, '>', $fai_f || die "cannot open $fai_f\n" if $fai_f; |
192 #if ($name eq "snRNAs") { | 192 #if ($name eq "snRNAs") { |
193 # open ( $accep_unique, ">".$1."-unique.fastq") if $outR =~ /(.*)\.fastq/; | 193 # open ( $accep_unique, ">".$1."-unique.fastq") if $outR =~ /(.*)\.fastq/; |
194 #} | 194 #} |
195 my $sequence = ''; | 195 my $sequence = ''; |
196 while(<$fic>) | 196 while(<$fic>) |
197 { | 197 { |
198 chomp $_; | 198 chomp $_; |
199 if( $_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | 199 if( $_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) |
200 { | 200 { |
201 if ($fai_f && $_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) | 201 if ($fai_f && $_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) |
202 { | 202 { |
203 print $fai $1."\t".$2."\n"; | 203 print $fai $1."\t".$2."\n"; |
204 } | 204 } |
205 next; | 205 next; |
206 } | 206 } |
207 $reads++; | 207 $reads++; |
208 my @line = split (/\t/,$_); | 208 my @line = split (/\t/,$_); |
209 $sequence = $line[9]; | 209 $sequence = $line[9]; |
210 if ($line[1] & 16) | 210 if ($line[1] & 16) |
211 { | 211 { |
212 $sequence =reverse($sequence); | 212 $sequence =reverse($sequence); |
213 $sequence =~ tr/atgcuATGCU/tacgaTACGA/; | 213 $sequence =~ tr/atgcuATGCU/tacgaTACGA/; |
214 } | 214 } |
215 if ($line[1] & 16 || $line[1] == 0) | 215 if ($line[1] & 16 || $line[1] == 0) |
216 { | 216 { |
217 $mappers ++; | 217 $mappers ++; |
218 if ($accept == 1 ) | 218 if ($accept == 1 ) |
219 { | 219 { |
220 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; | 220 print $accepted "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n"; |
221 # print $accep_unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if ($name eq "snRNAs" && $line[11] eq "XT:A:U"); | 221 # print $accep_unique "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if ($name eq "snRNAs" && $line[11] eq "XT:A:U"); |
222 } | 222 } |
223 } | 223 } |
224 else | 224 else |
225 { | 225 { |
226 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if $reject == 1; | 226 print $rejected "\@".$line[0]."\n".$sequence."\n+\n".$line[10]."\n" if $reject == 1; |
227 $unmapped++; | 227 $unmapped++; |
228 } | 228 } |
229 } | 229 } |
230 # close $accep_unique if ($name eq "bonafide_reads"); | 230 # close $accep_unique if ($name eq "bonafide_reads"); |
231 close $fic; | 231 close $fic; |
232 close $accepted if $accept == 1; | 232 close $accepted if $accept == 1; |
233 close $rejected if $reject ==1; | 233 close $rejected if $reject ==1; |
234 close $fai if $fai_f; | 234 close $fai if $fai_f; |
235 print $report "\treads: $reads\n"; | 235 print $report "\treads: $reads\n"; |
236 print $report "\tmappers: $mappers\n"; | 236 print $report "\tmappers: $mappers\n"; |
237 print $report "\tunmapped: $unmapped\n"; | 237 print $report "\tunmapped: $unmapped\n"; |
238 print $report "-----------------------------\n"; | 238 print $report "-----------------------------\n"; |
239 return ($mappers, $unmapped); | 239 return ($mappers, $unmapped); |
240 } | 240 } |
241 | 241 |
242 sub sam_count | 242 sub sam_count |
243 { | 243 { |
244 my $sam = shift; | 244 my $sam = shift; |
245 my ( %number, %size ); | 245 my ( %number, %size ); |
246 | 246 |
247 open my $fic, '<', $sam || die "cannot open $sam file $!\n"; | 247 open my $fic, '<', $sam || die "cannot open $sam file $!\n"; |
248 while(<$fic>) | 248 while(<$fic>) |
249 { | 249 { |
250 chomp $_; | 250 chomp $_; |
251 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | 251 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) |
252 { | 252 { |
253 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) | 253 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) |
254 { | 254 { |
255 $size{$1} = $2; | 255 $size{$1} = $2; |
256 $number{$1} = 0; | 256 $number{$1} = 0; |
257 } | 257 } |
258 } | 258 } |
259 else | 259 else |
260 { | 260 { |
261 my @line = split (/\t/,$_); | 261 my @line = split (/\t/,$_); |
262 if ( $line[1] & 16 || $line[1] == 0 ) | 262 if ( $line[1] & 16 || $line[1] == 0 ) |
263 { | 263 { |
264 $number{$line[2]}++; | 264 $number{$line[2]}++; |
265 } | 265 } |
266 } | |
267 } | |
268 close $fic; | |
269 return ( \%number, \%size ); | |
270 } | |
271 | |
272 sub sam_count_mis | |
273 { | |
274 | |
275 my $sam = shift; | |
276 my ( %number, %numberNM, %numberM, %size); | |
277 | |
278 open my $fic, '<', $sam || die "cannot open $sam file $!\n"; | |
279 while(<$fic>) | |
280 { | |
281 chomp $_; | |
282 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | |
283 { | |
284 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) | |
285 { | |
286 $size{$1} = $2; | |
287 $number{$1} = [0,0,0,0,0,0,0]; | |
288 $numberNM{$1} = [0,0,0,0,0,0,0]; | |
289 $numberM{$1} = [0,0,0,0,0,0,0]; | |
290 } | |
291 } | |
292 else | |
293 { | |
294 my @line = split (/\t/,$_); | |
295 my @seq = split //, $line[9]; | |
296 if ( $line[1] == 16 || $line[1] == 0 ) | |
297 { | |
298 $number{ $line[2] }->[0]++; | |
299 if ($line[1] == 0) | |
300 { | |
301 $number{$line[2]}->[1]++; | |
302 $number{$line[2]}->[3]++ if $seq[0] eq 'T'; | |
303 $number{$line[2]}->[5]++ if $seq[9] eq 'A'; | |
304 } | |
305 else | |
306 { | |
307 $number{$line[2]}->[2]++; | |
308 $number{$line[2]}->[4]++ if $seq[9] eq 'A'; | |
309 $number{$line[2]}->[6]++ if $seq[0] eq 'T'; | |
310 } | |
311 if ($_ =~ /.*XM:i:(\d+).*/) | |
312 { | |
313 if ( $1 == 0 ) | |
314 { | |
315 $numberNM{$line[2]}->[0]++; | |
316 if ($line[1] == 0) | |
317 { | |
318 $numberNM{$line[2]}->[1]++; | |
319 $numberNM{$line[2]}->[3]++ if $seq[0] eq 'T'; | |
320 $numberNM{$line[2]}->[5]++ if $seq[9] eq 'A'; | |
321 } | |
322 else | |
323 { | |
324 $numberNM{$line[2]}->[2]++; | |
325 $numberNM{$line[2]}->[4]++ if $seq[9] eq 'A'; | |
326 $numberNM{$line[2]}->[6]++ if $seq[0] eq 'T'; | |
327 } | |
328 } | 266 } |
329 else | 267 } |
330 { | 268 close $fic; |
331 $numberM{$line[2]}->[0]++; | 269 return ( \%number, \%size ); |
332 if ($line[1] == 0) | 270 } |
333 { | 271 |
334 $numberM{$line[2]}->[1]++; | 272 sub sam_count_mis |
335 $numberM{$line[2]}->[3]++ if $seq[0] eq 'T'; | 273 { |
336 $numberM{$line[2]}->[5]++ if $seq[9] eq 'A'; | 274 |
337 } | 275 my $sam = shift; |
338 else | 276 my ( %number, %numberNM, %numberM, %size); |
339 { | 277 |
340 $numberM{$line[2]}->[2]++; | 278 open my $fic, '<', $sam || die "cannot open $sam file $!\n"; |
341 $numberM{$line[2]}->[4]++ if $seq[9] eq 'A'; | 279 while(<$fic>) |
342 $numberM{$line[2]}->[6]++ if $seq[0] eq 'T'; | 280 { |
343 } | 281 chomp $_; |
344 } | 282 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) |
345 } | 283 { |
346 } | 284 if ($_ =~ /\@SQ\tSN:(.*)\tLN:(\d*)/) |
347 } | 285 { |
348 } | 286 $size{$1} = $2; |
349 return (\%number, \%size, \%numberNM, \%numberM ); | 287 $number{$1} = [0,0,0,0,0,0,0]; |
288 $numberNM{$1} = [0,0,0,0,0,0,0]; | |
289 $numberM{$1} = [0,0,0,0,0,0,0]; | |
290 } | |
291 } | |
292 else | |
293 { | |
294 my @line = split (/\t/,$_); | |
295 my @seq = split //, $line[9]; | |
296 if ( $line[1] == 16 || $line[1] == 0 ) | |
297 { | |
298 $number{ $line[2] }->[0]++; | |
299 if ($line[1] == 0) | |
300 { | |
301 $number{$line[2]}->[1]++; | |
302 $number{$line[2]}->[3]++ if $seq[0] eq 'T'; | |
303 $number{$line[2]}->[5]++ if $seq[9] eq 'A'; | |
304 } | |
305 else | |
306 { | |
307 $number{$line[2]}->[2]++; | |
308 $number{$line[2]}->[4]++ if $seq[9] eq 'A'; | |
309 $number{$line[2]}->[6]++ if $seq[0] eq 'T'; | |
310 } | |
311 if ($_ =~ /.*XM:i:(\d+).*/) | |
312 { | |
313 if ( $1 == 0 ) | |
314 { | |
315 $numberNM{$line[2]}->[0]++; | |
316 if ($line[1] == 0) | |
317 { | |
318 $numberNM{$line[2]}->[1]++; | |
319 $numberNM{$line[2]}->[3]++ if $seq[0] eq 'T'; | |
320 $numberNM{$line[2]}->[5]++ if $seq[9] eq 'A'; | |
321 } | |
322 else | |
323 { | |
324 $numberNM{$line[2]}->[2]++; | |
325 $numberNM{$line[2]}->[4]++ if $seq[9] eq 'A'; | |
326 $numberNM{$line[2]}->[6]++ if $seq[0] eq 'T'; | |
327 } | |
328 } | |
329 else | |
330 { | |
331 $numberM{$line[2]}->[0]++; | |
332 if ($line[1] == 0) | |
333 { | |
334 $numberM{$line[2]}->[1]++; | |
335 $numberM{$line[2]}->[3]++ if $seq[0] eq 'T'; | |
336 $numberM{$line[2]}->[5]++ if $seq[9] eq 'A'; | |
337 } | |
338 else | |
339 { | |
340 $numberM{$line[2]}->[2]++; | |
341 $numberM{$line[2]}->[4]++ if $seq[9] eq 'A'; | |
342 $numberM{$line[2]}->[6]++ if $seq[0] eq 'T'; | |
343 } | |
344 } | |
345 } | |
346 } | |
347 } | |
348 } | |
349 return (\%number, \%size, \%numberNM, \%numberM ); | |
350 } | 350 } |
351 | 351 |
352 sub rpms_rpkm_te | 352 sub rpms_rpkm_te |
353 { | 353 { |
354 my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_; | 354 my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_; |
355 open(my $out, ">".$out_file) || die "cannot open normalized file $! \n"; | 355 open(my $out, ">".$out_file) || die "cannot open normalized file $! \n"; |
356 print $out "ID\treads counts\tRPKM"; | 356 print $out "ID\treads counts\tRPKM"; |
357 print $out "\tper million of piRNAs" if ($piRNA_number != 0); | 357 print $out "\tper million of piRNAs" if ($piRNA_number != 0); |
358 print $out "\tper million of miRNAs" if ($miRNA_number != 0); | 358 print $out "\tper million of miRNAs" if ($miRNA_number != 0); |
359 print $out "\tper million of bonafide reads" if ($bonafide_number != 0); | 359 print $out "\tper million of bonafide reads" if ($bonafide_number != 0); |
360 print $out "\tsense reads counts\treverse reads counts"; | 360 print $out "\tsense reads counts\treverse reads counts"; |
361 print $out "\t% of sense 1U\t% of sense 10A\t% of reverse 1U\t% of reverse 10A\n"; | 361 print $out "\t% of sense 1U\t% of sense 10A\t% of reverse 1U\t% of reverse 10A\n"; |
362 foreach my $k ( sort keys %{$counthashR} ) | 362 foreach my $k ( sort keys %{$counthashR} ) |
363 { | 363 { |
364 my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0); | 364 my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0); |
365 | |
366 $rpkm = ( $counthashR->{$k}->[0] * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ; | |
367 print $out $k."\t".$counthashR->{$k}->[0]."\t"; printf $out "%.2f",$rpkm; | |
368 | |
369 if ($piRNA_number != 0 ) | |
370 { | |
371 $pirna = ( $counthashR->{$k}->[0] * 1000000) / $piRNA_number; | |
372 printf $out "\t%.2f",$pirna; | |
373 } | |
374 if ($miRNA_number != 0 ) | |
375 { | |
376 $mirna = ( $counthashR->{$k}->[0] * 1000000) / $miRNA_number; | |
377 printf $out "\t%.2f",$mirna; | |
378 } | |
379 if ($bonafide_number != 0 ) | |
380 { | |
381 $bonafide = ( $counthashR->{$k}->[0] * 1000000) / $bonafide_number; | |
382 printf $out "\t%.2f",$bonafide; | |
383 } | |
384 | |
385 print $out "\t".$counthashR->{$k}->[1]."\t".$counthashR->{$k}->[2] ; | |
386 my $S1U = 0; | |
387 $S1U = $counthashR->{$k}->[3] / $counthashR->{$k}->[1] * 100 if $counthashR->{$k}->[1] != 0; | |
388 my $R1U = 0; | |
389 $R1U = $counthashR->{$k}->[6] / $counthashR->{$k}->[2] * 100 if $counthashR->{$k}->[2] != 0; | |
390 my $S10A = 0; | |
391 $S10A = $counthashR->{$k}->[5] / $counthashR->{$k}->[1] * 100 if $counthashR->{$k}->[1] != 0; | |
392 my $R10A = 0; | |
393 $R10A = $counthashR->{$k}->[4] / $counthashR->{$k}->[2] * 100 if $counthashR->{$k}->[2] != 0; | |
394 print $out "\t".$S1U."\t".$S10A."\t".$R1U."\t".$R10A; | |
395 | |
396 print $out "\n"; | |
397 } | |
398 close $out; | |
399 } | |
400 | |
401 | |
402 sub sam_to_bam_bg | |
403 { | |
404 my ( $sam, $scale, $number_of_cpus ) = @_; | |
405 my ( $bam_sorted, $bedgraphM, $bedgraphP, $view_err, $sort_err ) = ( '', '', '', '', '' ); | |
406 if ( $sam =~ /(.*?).sam$/ ) | |
407 { | |
408 $bam_sorted = $1.'_sorted.bam'; | |
409 $bedgraphP= $1.'_plus.bedgraph'; | |
410 $bedgraphM = $1.'_minus.bedgraph'; | |
411 $view_err = $1.'_view.err'; | |
412 $sort_err = $1.'_sort.err'; | |
413 } | |
414 `samtools view -Shb --threads $number_of_cpus '$sam' 2> '$view_err' | samtools sort -O BAM --threads $number_of_cpus /dev/stdin 2> '$sort_err' > '$bam_sorted'`; | |
415 `bedtools genomecov -scale $scale -strand + -bga -ibam '$bam_sorted' > '$bedgraphP'`; | |
416 `bedtools genomecov -scale $scale -strand - -bga -ibam '$bam_sorted' > '$bedgraphM'`; | |
417 } | |
418 | |
419 sub sam_sorted_bam | |
420 { | |
421 my ( $sam, $number_of_cpus ) = @_; | |
422 my ( $bam_sorted, $view_err, $sort_err ) = ( '', '', '' ); | |
423 if ( $sam =~ /(.*?).sam$/ ) | |
424 { | |
425 $bam_sorted = $1.'_sorted.bam'; | |
426 $view_err = $1.'_view.err'; | |
427 $sort_err = $1.'_sort.err'; | |
428 | |
429 } | |
430 `samtools view -Shb --threads $number_of_cpus '$sam' 2> '$view_err' | samtools sort -O BAM --threads $number_of_cpus /dev/stdin 2> '$sort_err' > '$bam_sorted'`; | |
431 } | |
432 | |
433 sub BWA_call | |
434 { | |
435 my ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ) = @_; | |
436 my ( $aln_err, $samse_err, $seq_num ) = ( $sam.'_aln.err', $sam.'_samse.err', 0 ); | |
437 print $report "-----------------------------\n"; | |
438 print $report "bwa aln -t $number_of_cpus -n $mismatches '$index' '$fastq' 2> '$aln_err' | bwa samse $index /dev/stdin '$fastq' 2> '$samse_err' > '$sam'\n"; | |
439 `bwa aln -t $number_of_cpus -n $mismatches '$index' '$fastq' 2> '$aln_err' | bwa samse $index /dev/stdin '$fastq' 2> '$samse_err' > '$sam' `; | |
440 } | |
441 | |
442 sub rpms_rpkm | |
443 { | |
444 my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_; | |
445 open(my $out, ">".$out_file) || die "cannot open normalized file $! \n"; | |
446 print $out "ID\treads counts\tRPKM"; | |
447 print $out "\tper million of piRNAs" if ($piRNA_number != 0); | |
448 print $out "\tper million of miRNAs" if ($miRNA_number != 0); | |
449 print $out "\tper million of bonafide reads" if ($bonafide_number != 0); | |
450 print $out "\n"; | |
451 foreach my $k ( sort keys %{$counthashR} ) | |
452 { | |
453 my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0); | |
454 | |
455 $rpkm = ( $counthashR->{$k} * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ; | |
456 print $out $k."\t".$counthashR->{$k}."\t"; printf $out "%.2f",$rpkm; | |
457 | |
458 if ($piRNA_number != 0 ) | |
459 { | |
460 $pirna = ( $counthashR->{$k} * 1000000) / $piRNA_number; | |
461 printf $out "\t%.2f",$pirna; | |
462 } | |
463 if ($miRNA_number != 0 ) | |
464 { | |
465 $mirna = ( $counthashR->{$k} * 1000000) / $miRNA_number; | |
466 printf $out "\t%.2f",$mirna; | |
467 } | |
468 if ($bonafide_number != 0 ) | |
469 { | |
470 $bonafide = ( $counthashR->{$k} * 1000000) / $bonafide_number; | |
471 printf $out "\t%.2f",$bonafide; | |
472 } | |
473 print $out "\n"; | |
474 } | |
475 close $out; | |
476 } | |
477 | |
478 sub extract_sam | |
479 { | |
480 my ( $hashRef, $sam_in, $sam_out, $sam_uni_out, $fastq_out, $fastq_uni_out ) = @_; | |
365 | 481 |
366 $rpkm = ( $counthashR->{$k}->[0] * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ; | 482 open my $s_in, '<', $sam_in || die "cannot open $sam_in file $!\n"; |
367 print $out $k."\t".$counthashR->{$k}->[0]."\t"; printf $out "%.2f",$rpkm; | 483 |
368 | 484 open my $f_out, '>', $fastq_out || die "cannot create $fastq_out $!\n"; |
369 if ($piRNA_number != 0 ) | 485 open my $f_uni_out, '>', $fastq_uni_out || die "cannot create $fastq_uni_out $!\n"; |
370 { | |
371 $pirna = ( $counthashR->{$k}->[0] * 1000000) / $piRNA_number; | |
372 printf $out "\t%.2f",$pirna; | |
373 } | |
374 if ($miRNA_number != 0 ) | |
375 { | |
376 $mirna = ( $counthashR->{$k}->[0] * 1000000) / $miRNA_number; | |
377 printf $out "\t%.2f",$mirna; | |
378 } | |
379 if ($bonafide_number != 0 ) | |
380 { | |
381 $bonafide = ( $counthashR->{$k}->[0] * 1000000) / $bonafide_number; | |
382 printf $out "\t%.2f",$bonafide; | |
383 } | |
384 | |
385 print $out "\t".$counthashR->{$k}->[1]."\t".$counthashR->{$k}->[2] ; | |
386 my $S1U = 0; | |
387 $S1U = $counthashR->{$k}->[3] / $counthashR->{$k}->[1] * 100 if $counthashR->{$k}->[1] != 0; | |
388 my $R1U = 0; | |
389 $R1U = $counthashR->{$k}->[6] / $counthashR->{$k}->[2] * 100 if $counthashR->{$k}->[2] != 0; | |
390 my $S10A = 0; | |
391 $S10A = $counthashR->{$k}->[5] / $counthashR->{$k}->[1] * 100 if $counthashR->{$k}->[1] != 0; | |
392 my $R10A = 0; | |
393 $R10A = $counthashR->{$k}->[4] / $counthashR->{$k}->[2] * 100 if $counthashR->{$k}->[2] != 0; | |
394 | |
395 print $out "\t".$S1U."\t".$S10A."\t".$R1U."\t".$R10A; | |
396 | |
397 print $out "\n"; | |
398 } | |
399 close $out; | |
400 } | |
401 | |
402 | |
403 sub sam_to_bam_bg | |
404 { | |
405 my ( $sam, $scale, $number_of_cpus ) = @_; | |
406 my ( $bam_sorted, $bedgraphM, $bedgraphP, $view_err, $sort_err ) = ( '', '', '', '', '' ); | |
407 if ( $sam =~ /(.*?).sam$/ ) | |
408 { | |
409 $bam_sorted = $1.'_sorted.bam'; | |
410 $bedgraphP= $1.'_plus.bedgraph'; | |
411 $bedgraphM = $1.'_minus.bedgraph'; | |
412 $view_err = $1.'_view.err'; | |
413 $sort_err = $1.'_sort.err'; | |
414 } | |
415 `samtools view -Shb --threads $number_of_cpus '$sam' 2> '$view_err' | samtools sort -O BAM --threads $number_of_cpus /dev/stdin 2> '$sort_err' > '$bam_sorted'`; | |
416 `bedtools genomecov -scale $scale -strand + -bga -ibam '$bam_sorted' > '$bedgraphP'`; | |
417 `bedtools genomecov -scale $scale -strand - -bga -ibam '$bam_sorted' > '$bedgraphM'`; | |
418 } | |
419 | |
420 sub sam_sorted_bam | |
421 { | |
422 my ( $sam, $number_of_cpus ) = @_; | |
423 my ( $bam_sorted, $view_err, $sort_err ) = ( '', '', '' ); | |
424 if ( $sam =~ /(.*?).sam$/ ) | |
425 { | |
426 $bam_sorted = $1.'_sorted.bam'; | |
427 $view_err = $1.'_view.err'; | |
428 $sort_err = $1.'_sort.err'; | |
429 | |
430 } | |
431 `samtools view -Shb --threads $number_of_cpus '$sam' 2> '$view_err' | samtools sort -O BAM --threads $number_of_cpus /dev/stdin 2> '$sort_err' > '$bam_sorted'`; | |
432 } | |
433 | |
434 sub BWA_call | |
435 { | |
436 my ( $index, $fastq, $sam, $mismatches, $number_of_cpus, $report ) = @_; | |
437 my ( $aln_err, $samse_err, $seq_num ) = ( $sam.'_aln.err', $sam.'_samse.err', 0 ); | |
438 print $report "-----------------------------\n"; | |
439 print $report "bwa aln -t $number_of_cpus -n $mismatches '$index' '$fastq' 2> '$aln_err' | bwa samse $index /dev/stdin '$fastq' 2> '$samse_err' > '$sam'\n"; | |
440 `bwa aln -t $number_of_cpus -n $mismatches '$index' '$fastq' 2> '$aln_err' | bwa samse $index /dev/stdin '$fastq' 2> '$samse_err' > '$sam' `; | |
441 } | |
442 | |
443 sub rpms_rpkm | |
444 { | |
445 my ( $counthashR, $sizehashR, $mapped, $out_file, $piRNA_number, $miRNA_number, $bonafide_number ) =@_; | |
446 open(my $out, ">".$out_file) || die "cannot open normalized file $! \n"; | |
447 print $out "ID\treads counts\tRPKM"; | |
448 print $out "\tper million of piRNAs" if ($piRNA_number != 0); | |
449 print $out "\tper million of miRNAs" if ($miRNA_number != 0); | |
450 print $out "\tper million of bonafide reads" if ($bonafide_number != 0); | |
451 print $out "\n"; | |
452 foreach my $k ( sort keys %{$counthashR} ) | |
453 { | |
454 my ($rpkm, $pirna, $mirna, $bonafide) = (0,0,0,0); | |
455 | 486 |
456 $rpkm = ( $counthashR->{$k} * 1000000000) / ( $sizehashR->{$k} * $mapped) if ( $sizehashR->{$k} * $mapped) != 0 ; | 487 open my $s_out, '>', $sam_out || die "cannot create $sam_out file $!\n" if defined ($hashRef); |
457 print $out $k."\t".$counthashR->{$k}."\t"; printf $out "%.2f",$rpkm; | 488 open my $s_uni_out, '>', $sam_uni_out || die "cannot create $sam_uni_out file $!\n"; |
458 | 489 |
459 if ($piRNA_number != 0 ) | 490 my $sequence = ''; |
460 { | 491 while(<$s_in>) |
461 $pirna = ( $counthashR->{$k} * 1000000) / $piRNA_number; | 492 { |
462 printf $out "\t%.2f",$pirna; | 493 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) |
463 } | 494 { |
464 if ($miRNA_number != 0 ) | 495 print $s_out $_ if defined ($hashRef); |
465 { | 496 print $s_uni_out $_; |
466 $mirna = ( $counthashR->{$k} * 1000000) / $miRNA_number; | 497 next; |
467 printf $out "\t%.2f",$mirna; | 498 } |
468 } | 499 my @line = split (/\t/,$_); |
469 if ($bonafide_number != 0 ) | 500 $sequence = $line[0]; |
470 { | 501 if ( (! defined ($hashRef) )|| ( exists $hashRef->{$sequence} && $hashRef->{$sequence} == 1 ) ) |
471 $bonafide = ( $counthashR->{$k} * 1000000) / $bonafide_number; | 502 { |
472 printf $out "\t%.2f",$bonafide; | 503 my $arn = $line[9]; |
473 } | 504 if ($line[1] & 16) |
474 print $out "\n"; | 505 { |
475 } | 506 $arn =reverse($arn); |
476 close $out; | 507 $arn =~ tr/atgcuATGCU/tacgaTACGA/; |
477 } | 508 } |
478 | 509 |
479 sub extract_sam | 510 if ( ( $line[1] == 16 || $line[1] == 0 ) ) |
480 { | 511 { |
481 my ( $hashRef, $sam_in, $sam_out, $sam_uni_out, $fastq_out, $fastq_uni_out ) = @_; | 512 print $f_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ; |
482 | 513 print $s_out $_ if defined ($hashRef); |
483 open my $s_in, '<', $sam_in || die "cannot open $sam_in file $!\n"; | 514 if ( $line[11] eq "XT:A:U" ) |
484 | 515 { |
485 open my $f_out, '>', $fastq_out || die "cannot create $fastq_out $!\n"; | 516 print $f_uni_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ; |
486 open my $f_uni_out, '>', $fastq_uni_out || die "cannot create $fastq_uni_out $!\n"; | 517 print $s_uni_out $_ ; |
487 | 518 } |
488 open my $s_out, '>', $sam_out || die "cannot create $sam_out file $!\n" if defined ($hashRef); | 519 } |
489 open my $s_uni_out, '>', $sam_uni_out || die "cannot create $sam_uni_out file $!\n"; | 520 } |
490 | 521 } |
491 my $sequence = ''; | 522 close $s_in; close $s_out if defined ($hashRef); |
492 while(<$s_in>) | 523 close $s_uni_out; close $f_out; close $f_uni_out; |
493 { | |
494 if ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ ) | |
495 { | |
496 print $s_out $_ if defined ($hashRef); | |
497 print $s_uni_out $_; | |
498 next; | |
499 } | |
500 my @line = split (/\t/,$_); | |
501 $sequence = $line[0]; | |
502 if ( (! defined ($hashRef) )|| ( exists $hashRef->{$sequence} && $hashRef->{$sequence} == 1 ) ) | |
503 { | |
504 my $arn = $line[9]; | |
505 if ($line[1] & 16) | |
506 { | |
507 $arn =reverse($arn); | |
508 $arn =~ tr/atgcuATGCU/tacgaTACGA/; | |
509 } | |
510 | |
511 if ( ( $line[1] == 16 || $line[1] == 0 ) ) | |
512 { | |
513 print $f_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ; | |
514 print $s_out $_ if defined ($hashRef); | |
515 if ( $line[11] eq "XT:A:U" ) | |
516 { | |
517 print $f_uni_out "\@".$line[0]."\n".$arn."\n+\n".$line[10]."\n" ; | |
518 print $s_uni_out $_ ; | |
519 } | |
520 } | |
521 } | |
522 } | |
523 close $s_in; close $s_out if defined ($hashRef); | |
524 close $s_uni_out; close $f_out; close $f_uni_out; | |
525 } | 524 } |
526 | 525 |
527 sub get_fastq_seq | 526 sub get_fastq_seq |
528 { | 527 { |
529 my $fastq = shift; | 528 my $fastq = shift; |
530 my %hash; my $cmp = 0; | 529 my %hash; my $cmp = 0; |
531 | 530 |
532 open my $fic, '<', $fastq || die "cannot open input file $! \n"; | 531 open my $fic, '<', $fastq || die "cannot open input file $! \n"; |
533 while(<$fic>) | 532 while(<$fic>) |
534 { | 533 { |
535 chomp $_; | 534 chomp $_; |
536 $cmp++; | 535 $cmp++; |
537 if ($cmp % 4 == 1) | 536 if ($cmp % 4 == 1) |
538 { | 537 { |
539 die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ ); | 538 die "file do not contain a @ at line $cmp\n" unless ($_ =~ /^\@/ ); |
540 if ($_ =~ /^\@(.*)\s.*/) { $hash{$1} = 1;} | 539 if ($_ =~ /^\@(.*)\s.*/) { $hash{$1} = 1;} |
541 elsif ($_ =~ /^\@(.*)/) { $hash{$1} = 1;} | 540 elsif ($_ =~ /^\@(.*)/) { $hash{$1} = 1;} |
542 } | 541 } |
543 elsif ($cmp % 4 == 3 ) | 542 elsif ($cmp % 4 == 3 ) |
544 { | 543 { |
545 die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/; | 544 die "file do not contain a + at line $cmp\n" unless $_ =~ /^\+/; |
546 } | 545 } |
547 } | 546 } |
548 close $fic; | 547 close $fic; |
549 return \%hash; | 548 return \%hash; |
550 } | 549 } |
551 | 550 |
552 1; | 551 1; |