3
|
1 # Wrapper for CSEM with Bowtie
|
|
2 # Written by Dongjun Chung, Sep. 8, 2011
|
|
3
|
|
4 #!/usr/bin/env perl;
|
|
5 #use warnings;
|
|
6 #use strict;
|
|
7 use File::Temp qw/tempfile/;
|
|
8 use File::Temp qw/tmpnam/;
|
|
9
|
|
10 # parse command arguments
|
|
11
|
|
12 die "Usage: perl csem_wrapper.pl [infile_name] [infile_format] [outfile_name] [outfile_format] [ref_genome] [pseudo_tags] [n_mismatch] [maxpos] [window_size] [n_iter] [n_core]" unless @ARGV == 11;
|
|
13
|
|
14 my ( $infile_name, $infile_format, $outfile_name, $outfile_format, $ref_genome, $pseudo_tags, $n_mismatch, $maxpos, $window_size, $n_iter, $n_core ) = @ARGV;
|
|
15
|
|
16 # construct ref genome file (adapted from "genRef.pl")
|
|
17
|
|
18 open ( IN, "bowtie-inspect -s $ref_genome |" ) or die "Cannot run bowtie-inspect!\n";
|
|
19
|
|
20 my $line;
|
|
21
|
|
22 my $size = 0;
|
|
23 my (@names, @lens) = ();
|
|
24 # $size: # of chromosomes
|
|
25 # @lens: chromosome size
|
|
26 # @names: chromosome name
|
|
27
|
|
28 for (my $i = 0; $i < 3; $i++) {
|
|
29 # skip unnecessary lines
|
|
30 $line = <IN>;
|
|
31 }
|
|
32
|
|
33 while ( $line = <IN> ) {
|
|
34 ++$size;
|
|
35 chomp($line);
|
|
36 my ($seqn, $name, $len) = split(/[ \t]+/, $line);
|
|
37 push(@names, $name);
|
|
38 push(@lens, $len);
|
|
39 }
|
|
40 close(IN);
|
|
41
|
|
42 my ($fh, $temp_reffile) = tempfile();
|
|
43 print $fh "$size\n";
|
|
44 print $fh "@lens\n";
|
|
45 print $fh "@names\n";
|
|
46 close($fh);
|
|
47
|
|
48 # extract read length from FASTA/FASTQ files
|
|
49
|
|
50 open( IN, $infile_name ) or die "Cannot open tag file!\n";
|
|
51
|
|
52 $line = <IN>;
|
|
53 if ( $infile_format eq "fasta" ) {
|
|
54 while ( $line =~ /^>/ ) {
|
|
55 $line = <IN>;
|
|
56 }
|
|
57 } elsif ( $infile_format eq "fastq" ) {
|
|
58 while ( $line =~ /^@/ ) {
|
|
59 $line = <IN>;
|
|
60 }
|
|
61 } else {
|
|
62 print "Inappropriate aligned read file format!\n";
|
|
63 exit 1;
|
|
64 }
|
|
65 chomp($line);
|
|
66 my $read_length = length $line;
|
|
67
|
|
68 close( IN );
|
|
69
|
|
70 # extract read ID
|
|
71
|
|
72 open( IN, $infile_name ) or die "Cannot open tag file!\n";
|
|
73
|
|
74 my @readID = ();
|
|
75 if ( $infile_format eq "fasta" ) {
|
|
76 foreach $line (<IN>) {
|
|
77 chomp($line);
|
|
78 if ( $line =~ /^>(\S+)/ ) {
|
|
79 push @readID, $1;
|
|
80 }
|
|
81 }
|
|
82 } elsif ( $infile_format eq "fastq" ) {
|
|
83 foreach $line (<IN>) {
|
|
84 chomp($line);
|
|
85 if ( $line =~ /^@(\S+)/ ) {
|
|
86 push @readID, $1;
|
|
87 }
|
|
88 }
|
|
89 } else {
|
|
90 print "Inappropriate aligned read file format!\n";
|
|
91 exit 1;
|
|
92 }
|
|
93
|
|
94 close( IN );
|
|
95
|
|
96 # run bowtie & csem
|
|
97
|
|
98 my $outfile_temp = tmpnam();
|
|
99
|
|
100 if ( $infile_format eq "fasta" ) {
|
|
101 system( "bowtie -f -v $n_mismatch -a -m $maxpos -p $n_core --quiet --concise $ref_genome $infile_name | csem $temp_reffile $window_size $n_iter $outfile_temp > /dev/null" ) == 0 or die "Error occurs while running either bowtie or csem!"
|
|
102 } elsif ( $infile_format eq "fastq" ) {
|
|
103 system( "bowtie -q -v $n_mismatch -a -m $maxpos -p $n_core --quiet --concise $ref_genome $infile_name | csem $temp_reffile $window_size $n_iter $outfile_temp > /dev/null" ) == 0 or die "Error occurs while running either bowtie or csem!"
|
|
104 } else {
|
|
105 print "Inappropriate aligned read file format!\n";
|
|
106 exit 1;
|
|
107 }
|
|
108
|
|
109 # post-process chromosome & position
|
|
110
|
|
111 open( IN, $outfile_temp ) or die "Cannot open csem file!\n";
|
|
112 open( OUT, ">", $outfile_name ) or die "Cannot open output file!\n";
|
|
113
|
|
114 foreach $line (<IN>) {
|
|
115 chomp($line);
|
|
116 my @element = split( /\s/, $line );
|
|
117 # assume columns are separated by some white space
|
|
118
|
|
119 # check for invalid line: may cause error in exporting step
|
|
120
|
|
121 if ( scalar(@element)<5 ) {
|
|
122 next;
|
|
123 }
|
|
124
|
|
125 # post-process lines
|
|
126
|
|
127 my ($id, $chr, $str, $loc, $prob) = @element;
|
|
128 if ( $outfile_format ne "bed" ) {
|
|
129 # first base is 0 in bowtie or BED
|
|
130 # first base is 1 in table or GFF
|
|
131 $loc++;
|
|
132 }
|
|
133 my $chrname = $names[$chr]; # translate chromosome
|
|
134
|
|
135 # write down processed lines
|
|
136 # - generate pseudo-tags, if necessary (adapted from "round_tag_to_integer.pl")
|
|
137
|
|
138 if ( $pseudo_tags eq "Y" ) {
|
|
139 # if we want to generate pseudo-tags,
|
|
140 # then threshold prob at 0.5 & round prob to integer (i.e., set to one)
|
|
141 # (exclude prob = 0.5 as well in order to avoid a read appears more than once)
|
|
142
|
|
143 if ( $prob <= 0.5 ) {
|
|
144 next;
|
|
145 } else {
|
|
146 $prob = 1;
|
|
147 }
|
|
148 }
|
|
149
|
|
150 # write down results
|
|
151
|
|
152 my $start;
|
|
153 my $end;
|
|
154 my $score;
|
|
155
|
|
156 my $id_final = $readID[$id];
|
|
157 #my $id_final = $id;
|
|
158
|
|
159 if ( $outfile_format eq "table" ) {
|
|
160 print OUT "$id_final\t$chrname $str $loc $prob\n";
|
|
161 } elsif ( $outfile_format eq "bed" ) {
|
|
162 # BED
|
|
163 # - name: read ID
|
|
164 # - score = prob * 1000
|
|
165
|
|
166 $start = $loc;
|
|
167 $end = $start + $read_length - 1;
|
|
168 my $name = $id_final;
|
|
169 $score = $prob * 1000;
|
|
170
|
|
171 print OUT "$chrname\t$start\t$end\t$name\t$score\t$str\n";
|
|
172 } elsif ( $outfile_format eq "gff" ) {
|
|
173 # GFF
|
|
174 # - source: "CSEM"
|
|
175 # - feature: read ID
|
|
176 # - score = prob * 1000
|
|
177
|
|
178 $start = $loc;
|
|
179 $end = $start + $read_length - 1;
|
|
180 my $source = "CSEM";
|
|
181 my $feature = $id_final;
|
|
182 $score = $prob * 1000;
|
|
183
|
|
184 print OUT "$chrname\t$source\t$feature\t$start\t$end\t$score\t$str\t.\t.\n";
|
|
185 } else {
|
|
186 print "Inappropriate output file format!\n";
|
|
187 exit 1;
|
|
188 }
|
|
189 }
|
|
190
|
|
191 close( IN );
|
|
192 close( OUT );
|