annotate mosaics/inst/Perl/readfile2wig_PET.pl @ 6:c9e0cd67dd84 draft

Uploaded
author dongjun
date Thu, 10 Jan 2013 15:57:50 -0500
parents b6d0c6ceda2c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
1 ###################################################################
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
2 #
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
3 # Process read files into WIG files
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
4 #
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
5 # Command arguments:
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
6 # - infile: Input file (directory + file name)
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
7 # - outdir: Directory of output files
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
8 # - format: File format
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
9 # - span: wig span
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
10 # - norm_const: normalizing constant
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
11 # - collapse: Maximum # of reads allowed at each position (skip if collapse=0)
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
12 # - bychr: Construct bin-level files by chromosome? (Y or N)
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
13 # - chrinfo: Is the file for chromosome info provided?
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
14 # - chrfile: File name for chromosome info (chr size)
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
15 # - @excludeChr: Chromosomes to be excluded (vector)
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
16 #
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
17 # Supported file format:
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
18 # - eland_result, sam
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
19 #
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
20 # Note
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
21 # - chromosomes are extracted from read files, after excluding invalid lines
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
22 # - uni-reads are assumed
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
23 #
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
24 ###################################################################
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
25
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
26 #!/usr/bin/env perl;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
27 use warnings;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
28 use strict;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
29 use Cwd;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
30 use File::Basename;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
31
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
32 my ($infile, $outdir, $format, $span, $norm_const, $collapse, $bychr,
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
33 $chrinfo, $chrfile, @exclude_chr) = @ARGV;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
34
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
35 # extract only filename from $infile
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
36
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
37 my @pr = fileparse( $infile );
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
38 my $filename = $pr[0];
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
39
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
40 # remember current working directory
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
41
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
42 my $pwd = cwd();
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
43
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
44 # construct bin-level data based on chromsome info, if provided
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
45
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
46 my %bin_count = ();
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
47 my $bin_start = 0;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
48 my $bin_stop = 0;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
49
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
50 if ( $chrinfo eq "Y" ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
51 open CHR, "$chrfile" or die "Cannot open $chrfile\n";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
52
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
53 while (<CHR>) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
54 chomp;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
55 my ( $chrname, $chrsize ) = split /\s+/, $_;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
56
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
57 $bin_start = 0;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
58 $bin_stop = int($chrsize/$span);
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
59
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
60 for (my $i = $bin_start; $i <= $bin_stop; $i++) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
61 ${$bin_count{$chrname}}[$i] = 0;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
62 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
63 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
64
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
65 close CHR;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
66 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
67
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
68 # chromosomes to be excluded
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
69
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
70 my $ecyn;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
71 my %ec_hash;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
72
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
73 if ( scalar(@exclude_chr) == 0 ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
74 $ecyn = "N";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
75 } else {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
76 $ecyn = "Y";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
77 @ec_hash{ @exclude_chr } = 0;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
78 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
79
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
80 # process PET read file
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
81 # (chromosome information is extracted from read file)
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
82
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
83 open IN, "$infile" or die "Cannot open input file $infile\n";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
84
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
85 my %seen =();
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
86
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
87 my %seen_pos =(); # position (leftmost position, starting from 1)
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
88 my %seen_str = (); # strand ('F' or 'R')
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
89
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
90 my $start;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
91 my $end;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
92
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
93 while(<IN>){
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
94 # parse
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
95
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
96 chomp;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
97
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
98 # procee read file, based on "format" option
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
99
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
100 my @parsed;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
101
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
102 if ( $format eq "eland_result" ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
103 @parsed = eland_result( $_ );
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
104 } elsif ( $format eq "sam" ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
105 @parsed = sam( $_ );
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
106 } else {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
107 # Unsupported file format -> exit and return 1 to environment
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
108
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
109 exit 1;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
110 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
111
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
112 # skip if invalid line
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
113
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
114 next if ( $parsed[0] == 0 );
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
115
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
116 # otherwise, process it
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
117
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
118 my ($status, $id, $chrt, $pos, $str, $read_length, $prob ) = @parsed;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
119 my ($id_only, $pair) = split /\//, $id;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
120
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
121 # skip if chrt \in @exclude_chr
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
122
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
123 if ( $ecyn eq "Y" && exists $ec_hash{ $chrt } ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
124 next;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
125 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
126
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
127 # process coordinates
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
128
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
129 if ( exists $seen_pos{$id_only} ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
130 # if other pair was already observed (w.r.t. id),
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
131 # then write the coordinates of both reads in the pair
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
132
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
133 if ( $seen_str{$id_only} ne $str ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
134 # pos1 & pos2 should have different strands and pair numbers
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
135
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
136 my $pos1 = $seen_pos{$id_only};
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
137 my $pos2 = $pos;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
138
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
139 # in the pair, write upstream pos first,
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
140 # then downstream pos + read length - 1.
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
141 # [Note] eland_result: pos = leftmost position, starting from 1.
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
142
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
143 if ( $pos1 < $pos2 && $str eq "R" ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
144 $pos2 += $read_length - 1;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
145 $start = $pos1;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
146 $end = $pos2;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
147 } elsif ( $pos1 > $pos2 && $seen_str{$id_only} eq "R" ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
148 $pos1 += $read_length - 1;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
149 $start = $pos2;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
150 $end = $pos1;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
151 } else {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
152 print "check id $id!\n";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
153 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
154
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
155 # process to bin-level files if collapse condition is satisfied
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
156
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
157 my $id_collapse = join("",$chrt,$start,$end);
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
158 $seen{$id_collapse}++;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
159
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
160 if ( $collapse > 0 && $seen{$id_collapse} > $collapse ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
161 next;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
162 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
163
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
164 # update bin count
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
165
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
166 if ( exists $bin_count{$chrt} ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
167 # if there is already a matrix for chromosome, update it
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
168
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
169 $bin_start = int(($start-1)/$span) ;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
170 $bin_stop = int(($end-1)/$span) ;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
171 for (my $i = $bin_start; $i <= $bin_stop; $i++) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
172 ${$bin_count{$chrt}}[$i] += 1;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
173 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
174 } else {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
175 # if there is no matrix for chromosome yet, construct one
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
176
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
177 @{$bin_count{$chrt}} = ();
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
178 $bin_start = int(($start-1)/$span) ;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
179 $bin_stop = int(($end-1)/$span) ;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
180 for (my $i = $bin_start; $i <= $bin_stop; $i++) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
181 ${$bin_count{$chrt}}[$i] += 1;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
182 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
183 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
184
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
185 # delete the key to save memory
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
186
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
187 delete $seen_pos{$id_only};
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
188 delete $seen_str{$id_only};
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
189 } else {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
190 print "inappropriate pair for id $id.\n";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
191 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
192 } else {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
193 # if other pair was not observed yet, record it
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
194
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
195 $seen_pos{$id_only} = $pos;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
196 $seen_str{$id_only} = $str;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
197 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
198 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
199
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
200 close( IN );
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
201
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
202 # move to output directory
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
203
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
204 chdir($outdir);
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
205
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
206 # write bin-level files
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
207
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
208 if ( $bychr eq "N" ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
209 # genome-wide version: all chromosome in one file
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
210
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
211 my $outfile = $filename."_span".$span.".wig";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
212 open OUT, ">$outfile" or die "Cannot open $outfile\n";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
213
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
214 print OUT "track type=wiggle_0 name=\"".$outfile."\" ";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
215 print OUT "description=\"".$outfile."\"\n";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
216
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
217 foreach my $chr_id (keys %bin_count) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
218 my @bin_count_chr = @{$bin_count{$chr_id}};
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
219 print OUT "variableStep chrom=$chr_id span=$span\n";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
220
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
221 for( my $i = 0; $i < scalar(@bin_count_chr); $i++ ){
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
222 my $coord = $i*$span + 1;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
223 if ( $bin_count_chr[$i] ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
224 my $count_final = $bin_count_chr[$i] * $norm_const;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
225 print OUT "$coord $count_final\n";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
226 } else {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
227 print OUT "$coord 0\n";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
228 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
229 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
230 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
231
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
232 close OUT;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
233 } else {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
234 # chromosome version: one chromosome in each file
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
235
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
236 foreach my $chr_id (keys %bin_count) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
237 my $outfile = $filename."_span".$span."_".$chr_id.".wig";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
238 open OUT, ">$outfile" or die "Cannot open $outfile\n";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
239
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
240 print OUT "track type=wiggle_0 name=\"".$outfile."\" ";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
241 print OUT "description=\"".$outfile."\"\n";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
242
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
243 my @bin_count_chr = @{$bin_count{$chr_id}};
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
244
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
245 print OUT "variableStep chrom=$chr_id span=$span\n";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
246
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
247 for( my $i = 0; $i < scalar(@bin_count_chr); $i++ ){
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
248 my $coord = $i*$span + 1;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
249 if ( $bin_count_chr[$i] ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
250 my $count_final = $bin_count_chr[$i] * $norm_const;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
251 print OUT "$coord $count_final\n";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
252 } else {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
253 print OUT "$coord 0\n";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
254 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
255 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
256 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
257
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
258 close OUT;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
259 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
260
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
261
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
262 ################################################################################
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
263 # #
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
264 # subroutines #
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
265 # #
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
266 ################################################################################
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
267
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
268 # -------------------------- subroutine: eland result --------------------------
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
269
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
270 sub eland_result {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
271 my $line = shift;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
272
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
273 # parse line
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
274
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
275 my ($id, $seq, $map, $t3, $t4, $t5, $chrt, $pos, $str, @rest) = split /\t/, $line;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
276 my $read_length = length $seq;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
277
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
278 # exclude invalid lines
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
279
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
280 if ( $map eq "QC" || $map eq "NM" ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
281 my @status = 0;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
282 return @status;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
283 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
284
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
285 if ( $chrt eq "QC" || $chrt eq "NM" ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
286 my @status = 0;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
287 return @status;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
288 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
289
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
290 # exclude multi-reads
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
291
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
292 if ( $map eq "R0" || $map eq "R1" || $map eq "R2" ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
293 my @status = 0;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
294 return @status;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
295 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
296
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
297 if ( $chrt eq "R0" || $chrt eq "R1" || $chrt eq "R2" ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
298 my @status = 0;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
299 return @status;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
300 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
301
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
302 my @pos_sets = split( /,/, $pos );
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
303 if ( scalar(@pos_sets) > 1 ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
304 my @status = 0;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
305 return @status;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
306 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
307
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
308 # return
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
309
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
310 my $status = 1;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
311 my $prob = 1;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
312 my @parsed = ( $status, $id, $chrt, $pos, $str, $read_length, $prob );
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
313 return @parsed;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
314 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
315
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
316 # -------------------------- subroutine: SAM -----------------------------------
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
317
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
318 sub sam {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
319 my $line = shift;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
320
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
321 # parse line
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
322
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
323 if ( $line =~ /^[^@].+/ ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
324 # exclude lines starting with "@" (comments)
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
325
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
326 my ($id, $bwflag, $chrt, $pos, $t2, $t3, $t4, $t5, $t6, $seq, $t7 ) = split /\t/, $line;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
327 $pos = int($pos);
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
328 my $read_length = length $seq;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
329
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
330 # strand
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
331
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
332 my $str;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
333
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
334 if ( $bwflag & 4 or $bwflag & 512 or $bwflag & 1024 ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
335 # exclude invalid lines
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
336
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
337 my @status = 0;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
338 return @status;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
339 } elsif ( $bwflag & 16 ) {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
340 # reverse strand
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
341
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
342 $str = "R";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
343 } else {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
344 $str = "F";
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
345 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
346
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
347 # exclude multi-reads?
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
348
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
349 # return
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
350
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
351 my $status = 1;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
352 my $prob = 1;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
353 my @parsed = ( $status, $id, $chrt, $pos, $str, $read_length, $prob );
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
354 return @parsed;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
355 } else {
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
356 my @status = 0;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
357 return @status;
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
358 }
b6d0c6ceda2c Uploaded
dongjun
parents:
diff changeset
359 }