2
|
1 ###################################################################
|
|
2 #
|
|
3 # Process read files into WIG files
|
|
4 #
|
|
5 # Command arguments:
|
|
6 # - infile: Input file (directory + file name)
|
|
7 # - outdir: Directory of output files
|
|
8 # - format: File format
|
|
9 # - span: wig span
|
|
10 # - norm_const: normalizing constant
|
|
11 # - collapse: Maximum # of reads allowed at each position (skip if collapse=0)
|
|
12 # - bychr: Construct bin-level files by chromosome? (Y or N)
|
|
13 # - chrinfo: Is the file for chromosome info provided?
|
|
14 # - chrfile: File name for chromosome info (chr size)
|
|
15 # - @excludeChr: Chromosomes to be excluded (vector)
|
|
16 #
|
|
17 # Supported file format:
|
|
18 # - eland_result, sam
|
|
19 #
|
|
20 # Note
|
|
21 # - chromosomes are extracted from read files, after excluding invalid lines
|
|
22 # - uni-reads are assumed
|
|
23 #
|
|
24 ###################################################################
|
|
25
|
|
26 #!/usr/bin/env perl;
|
|
27 use warnings;
|
|
28 use strict;
|
|
29 use Cwd;
|
|
30 use File::Basename;
|
|
31
|
|
32 my ($infile, $outdir, $format, $span, $norm_const, $collapse, $bychr,
|
|
33 $chrinfo, $chrfile, @exclude_chr) = @ARGV;
|
|
34
|
|
35 # extract only filename from $infile
|
|
36
|
|
37 my @pr = fileparse( $infile );
|
|
38 my $filename = $pr[0];
|
|
39
|
|
40 # remember current working directory
|
|
41
|
|
42 my $pwd = cwd();
|
|
43
|
|
44 # construct bin-level data based on chromsome info, if provided
|
|
45
|
|
46 my %bin_count = ();
|
|
47 my $bin_start = 0;
|
|
48 my $bin_stop = 0;
|
|
49
|
|
50 if ( $chrinfo eq "Y" ) {
|
|
51 open CHR, "$chrfile" or die "Cannot open $chrfile\n";
|
|
52
|
|
53 while (<CHR>) {
|
|
54 chomp;
|
|
55 my ( $chrname, $chrsize ) = split /\s+/, $_;
|
|
56
|
|
57 $bin_start = 0;
|
|
58 $bin_stop = int($chrsize/$span);
|
|
59
|
|
60 for (my $i = $bin_start; $i <= $bin_stop; $i++) {
|
|
61 ${$bin_count{$chrname}}[$i] = 0;
|
|
62 }
|
|
63 }
|
|
64
|
|
65 close CHR;
|
|
66 }
|
|
67
|
|
68 # chromosomes to be excluded
|
|
69
|
|
70 my $ecyn;
|
|
71 my %ec_hash;
|
|
72
|
|
73 if ( scalar(@exclude_chr) == 0 ) {
|
|
74 $ecyn = "N";
|
|
75 } else {
|
|
76 $ecyn = "Y";
|
|
77 @ec_hash{ @exclude_chr } = 0;
|
|
78 }
|
|
79
|
|
80 # process PET read file
|
|
81 # (chromosome information is extracted from read file)
|
|
82
|
|
83 open IN, "$infile" or die "Cannot open input file $infile\n";
|
|
84
|
|
85 my %seen =();
|
|
86
|
|
87 my %seen_pos =(); # position (leftmost position, starting from 1)
|
|
88 my %seen_str = (); # strand ('F' or 'R')
|
|
89
|
|
90 my $start;
|
|
91 my $end;
|
|
92
|
|
93 while(<IN>){
|
|
94 # parse
|
|
95
|
|
96 chomp;
|
|
97
|
|
98 # procee read file, based on "format" option
|
|
99
|
|
100 my @parsed;
|
|
101
|
|
102 if ( $format eq "eland_result" ) {
|
|
103 @parsed = eland_result( $_ );
|
|
104 } elsif ( $format eq "sam" ) {
|
|
105 @parsed = sam( $_ );
|
|
106 } else {
|
|
107 # Unsupported file format -> exit and return 1 to environment
|
|
108
|
|
109 exit 1;
|
|
110 }
|
|
111
|
|
112 # skip if invalid line
|
|
113
|
|
114 next if ( $parsed[0] == 0 );
|
|
115
|
|
116 # otherwise, process it
|
|
117
|
|
118 my ($status, $id, $chrt, $pos, $str, $read_length, $prob ) = @parsed;
|
|
119 my ($id_only, $pair) = split /\//, $id;
|
|
120
|
|
121 # skip if chrt \in @exclude_chr
|
|
122
|
|
123 if ( $ecyn eq "Y" && exists $ec_hash{ $chrt } ) {
|
|
124 next;
|
|
125 }
|
|
126
|
|
127 # process coordinates
|
|
128
|
|
129 if ( exists $seen_pos{$id_only} ) {
|
|
130 # if other pair was already observed (w.r.t. id),
|
|
131 # then write the coordinates of both reads in the pair
|
|
132
|
|
133 if ( $seen_str{$id_only} ne $str ) {
|
|
134 # pos1 & pos2 should have different strands and pair numbers
|
|
135
|
|
136 my $pos1 = $seen_pos{$id_only};
|
|
137 my $pos2 = $pos;
|
|
138
|
|
139 # in the pair, write upstream pos first,
|
|
140 # then downstream pos + read length - 1.
|
|
141 # [Note] eland_result: pos = leftmost position, starting from 1.
|
|
142
|
|
143 if ( $pos1 < $pos2 && $str eq "R" ) {
|
|
144 $pos2 += $read_length - 1;
|
|
145 $start = $pos1;
|
|
146 $end = $pos2;
|
|
147 } elsif ( $pos1 > $pos2 && $seen_str{$id_only} eq "R" ) {
|
|
148 $pos1 += $read_length - 1;
|
|
149 $start = $pos2;
|
|
150 $end = $pos1;
|
|
151 } else {
|
|
152 print "check id $id!\n";
|
|
153 }
|
|
154
|
|
155 # process to bin-level files if collapse condition is satisfied
|
|
156
|
|
157 my $id_collapse = join("",$chrt,$start,$end);
|
|
158 $seen{$id_collapse}++;
|
|
159
|
|
160 if ( $collapse > 0 && $seen{$id_collapse} > $collapse ) {
|
|
161 next;
|
|
162 }
|
|
163
|
|
164 # update bin count
|
|
165
|
|
166 if ( exists $bin_count{$chrt} ) {
|
|
167 # if there is already a matrix for chromosome, update it
|
|
168
|
|
169 $bin_start = int(($start-1)/$span) ;
|
|
170 $bin_stop = int(($end-1)/$span) ;
|
|
171 for (my $i = $bin_start; $i <= $bin_stop; $i++) {
|
|
172 ${$bin_count{$chrt}}[$i] += 1;
|
|
173 }
|
|
174 } else {
|
|
175 # if there is no matrix for chromosome yet, construct one
|
|
176
|
|
177 @{$bin_count{$chrt}} = ();
|
|
178 $bin_start = int(($start-1)/$span) ;
|
|
179 $bin_stop = int(($end-1)/$span) ;
|
|
180 for (my $i = $bin_start; $i <= $bin_stop; $i++) {
|
|
181 ${$bin_count{$chrt}}[$i] += 1;
|
|
182 }
|
|
183 }
|
|
184
|
|
185 # delete the key to save memory
|
|
186
|
|
187 delete $seen_pos{$id_only};
|
|
188 delete $seen_str{$id_only};
|
|
189 } else {
|
|
190 print "inappropriate pair for id $id.\n";
|
|
191 }
|
|
192 } else {
|
|
193 # if other pair was not observed yet, record it
|
|
194
|
|
195 $seen_pos{$id_only} = $pos;
|
|
196 $seen_str{$id_only} = $str;
|
|
197 }
|
|
198 }
|
|
199
|
|
200 close( IN );
|
|
201
|
|
202 # move to output directory
|
|
203
|
|
204 chdir($outdir);
|
|
205
|
|
206 # write bin-level files
|
|
207
|
|
208 if ( $bychr eq "N" ) {
|
|
209 # genome-wide version: all chromosome in one file
|
|
210
|
|
211 my $outfile = $filename."_span".$span.".wig";
|
|
212 open OUT, ">$outfile" or die "Cannot open $outfile\n";
|
|
213
|
|
214 print OUT "track type=wiggle_0 name=\"".$outfile."\" ";
|
|
215 print OUT "description=\"".$outfile."\"\n";
|
|
216
|
|
217 foreach my $chr_id (keys %bin_count) {
|
|
218 my @bin_count_chr = @{$bin_count{$chr_id}};
|
|
219 print OUT "variableStep chrom=$chr_id span=$span\n";
|
|
220
|
|
221 for( my $i = 0; $i < scalar(@bin_count_chr); $i++ ){
|
|
222 my $coord = $i*$span + 1;
|
|
223 if ( $bin_count_chr[$i] ) {
|
|
224 my $count_final = $bin_count_chr[$i] * $norm_const;
|
|
225 print OUT "$coord $count_final\n";
|
|
226 } else {
|
|
227 print OUT "$coord 0\n";
|
|
228 }
|
|
229 }
|
|
230 }
|
|
231
|
|
232 close OUT;
|
|
233 } else {
|
|
234 # chromosome version: one chromosome in each file
|
|
235
|
|
236 foreach my $chr_id (keys %bin_count) {
|
|
237 my $outfile = $filename."_span".$span."_".$chr_id.".wig";
|
|
238 open OUT, ">$outfile" or die "Cannot open $outfile\n";
|
|
239
|
|
240 print OUT "track type=wiggle_0 name=\"".$outfile."\" ";
|
|
241 print OUT "description=\"".$outfile."\"\n";
|
|
242
|
|
243 my @bin_count_chr = @{$bin_count{$chr_id}};
|
|
244
|
|
245 print OUT "variableStep chrom=$chr_id span=$span\n";
|
|
246
|
|
247 for( my $i = 0; $i < scalar(@bin_count_chr); $i++ ){
|
|
248 my $coord = $i*$span + 1;
|
|
249 if ( $bin_count_chr[$i] ) {
|
|
250 my $count_final = $bin_count_chr[$i] * $norm_const;
|
|
251 print OUT "$coord $count_final\n";
|
|
252 } else {
|
|
253 print OUT "$coord 0\n";
|
|
254 }
|
|
255 }
|
|
256 }
|
|
257
|
|
258 close OUT;
|
|
259 }
|
|
260
|
|
261
|
|
262 ################################################################################
|
|
263 # #
|
|
264 # subroutines #
|
|
265 # #
|
|
266 ################################################################################
|
|
267
|
|
268 # -------------------------- subroutine: eland result --------------------------
|
|
269
|
|
270 sub eland_result {
|
|
271 my $line = shift;
|
|
272
|
|
273 # parse line
|
|
274
|
|
275 my ($id, $seq, $map, $t3, $t4, $t5, $chrt, $pos, $str, @rest) = split /\t/, $line;
|
|
276 my $read_length = length $seq;
|
|
277
|
|
278 # exclude invalid lines
|
|
279
|
|
280 if ( $map eq "QC" || $map eq "NM" ) {
|
|
281 my @status = 0;
|
|
282 return @status;
|
|
283 }
|
|
284
|
|
285 if ( $chrt eq "QC" || $chrt eq "NM" ) {
|
|
286 my @status = 0;
|
|
287 return @status;
|
|
288 }
|
|
289
|
|
290 # exclude multi-reads
|
|
291
|
|
292 if ( $map eq "R0" || $map eq "R1" || $map eq "R2" ) {
|
|
293 my @status = 0;
|
|
294 return @status;
|
|
295 }
|
|
296
|
|
297 if ( $chrt eq "R0" || $chrt eq "R1" || $chrt eq "R2" ) {
|
|
298 my @status = 0;
|
|
299 return @status;
|
|
300 }
|
|
301
|
|
302 my @pos_sets = split( /,/, $pos );
|
|
303 if ( scalar(@pos_sets) > 1 ) {
|
|
304 my @status = 0;
|
|
305 return @status;
|
|
306 }
|
|
307
|
|
308 # return
|
|
309
|
|
310 my $status = 1;
|
|
311 my $prob = 1;
|
|
312 my @parsed = ( $status, $id, $chrt, $pos, $str, $read_length, $prob );
|
|
313 return @parsed;
|
|
314 }
|
|
315
|
|
316 # -------------------------- subroutine: SAM -----------------------------------
|
|
317
|
|
318 sub sam {
|
|
319 my $line = shift;
|
|
320
|
|
321 # parse line
|
|
322
|
|
323 if ( $line =~ /^[^@].+/ ) {
|
|
324 # exclude lines starting with "@" (comments)
|
|
325
|
|
326 my ($id, $bwflag, $chrt, $pos, $t2, $t3, $t4, $t5, $t6, $seq, $t7 ) = split /\t/, $line;
|
|
327 $pos = int($pos);
|
|
328 my $read_length = length $seq;
|
|
329
|
|
330 # strand
|
|
331
|
|
332 my $str;
|
|
333
|
|
334 if ( $bwflag & 4 or $bwflag & 512 or $bwflag & 1024 ) {
|
|
335 # exclude invalid lines
|
|
336
|
|
337 my @status = 0;
|
|
338 return @status;
|
|
339 } elsif ( $bwflag & 16 ) {
|
|
340 # reverse strand
|
|
341
|
|
342 $str = "R";
|
|
343 } else {
|
|
344 $str = "F";
|
|
345 }
|
|
346
|
|
347 # exclude multi-reads?
|
|
348
|
|
349 # return
|
|
350
|
|
351 my $status = 1;
|
|
352 my $prob = 1;
|
|
353 my @parsed = ( $status, $id, $chrt, $pos, $str, $read_length, $prob );
|
|
354 return @parsed;
|
|
355 } else {
|
|
356 my @status = 0;
|
|
357 return @status;
|
|
358 }
|
|
359 }
|