comparison csem_wrapper.pl @ 3:3ce7ee6f43a7

Uploaded
author dongjun
date Mon, 12 Sep 2011 10:01:12 -0400
parents
children
comparison
equal deleted inserted replaced
2:7796cbc040c4 3:3ce7ee6f43a7
1 # Wrapper for CSEM with Bowtie
2 # Written by Dongjun Chung, Sep. 8, 2011
3
4 #!/usr/bin/env perl;
5 #use warnings;
6 #use strict;
7 use File::Temp qw/tempfile/;
8 use File::Temp qw/tmpnam/;
9
10 # parse command arguments
11
12 die "Usage: perl csem_wrapper.pl [infile_name] [infile_format] [outfile_name] [outfile_format] [ref_genome] [pseudo_tags] [n_mismatch] [maxpos] [window_size] [n_iter] [n_core]" unless @ARGV == 11;
13
14 my ( $infile_name, $infile_format, $outfile_name, $outfile_format, $ref_genome, $pseudo_tags, $n_mismatch, $maxpos, $window_size, $n_iter, $n_core ) = @ARGV;
15
16 # construct ref genome file (adapted from "genRef.pl")
17
18 open ( IN, "bowtie-inspect -s $ref_genome |" ) or die "Cannot run bowtie-inspect!\n";
19
20 my $line;
21
22 my $size = 0;
23 my (@names, @lens) = ();
24 # $size: # of chromosomes
25 # @lens: chromosome size
26 # @names: chromosome name
27
28 for (my $i = 0; $i < 3; $i++) {
29 # skip unnecessary lines
30 $line = <IN>;
31 }
32
33 while ( $line = <IN> ) {
34 ++$size;
35 chomp($line);
36 my ($seqn, $name, $len) = split(/[ \t]+/, $line);
37 push(@names, $name);
38 push(@lens, $len);
39 }
40 close(IN);
41
42 my ($fh, $temp_reffile) = tempfile();
43 print $fh "$size\n";
44 print $fh "@lens\n";
45 print $fh "@names\n";
46 close($fh);
47
48 # extract read length from FASTA/FASTQ files
49
50 open( IN, $infile_name ) or die "Cannot open tag file!\n";
51
52 $line = <IN>;
53 if ( $infile_format eq "fasta" ) {
54 while ( $line =~ /^>/ ) {
55 $line = <IN>;
56 }
57 } elsif ( $infile_format eq "fastq" ) {
58 while ( $line =~ /^@/ ) {
59 $line = <IN>;
60 }
61 } else {
62 print "Inappropriate aligned read file format!\n";
63 exit 1;
64 }
65 chomp($line);
66 my $read_length = length $line;
67
68 close( IN );
69
70 # extract read ID
71
72 open( IN, $infile_name ) or die "Cannot open tag file!\n";
73
74 my @readID = ();
75 if ( $infile_format eq "fasta" ) {
76 foreach $line (<IN>) {
77 chomp($line);
78 if ( $line =~ /^>(\S+)/ ) {
79 push @readID, $1;
80 }
81 }
82 } elsif ( $infile_format eq "fastq" ) {
83 foreach $line (<IN>) {
84 chomp($line);
85 if ( $line =~ /^@(\S+)/ ) {
86 push @readID, $1;
87 }
88 }
89 } else {
90 print "Inappropriate aligned read file format!\n";
91 exit 1;
92 }
93
94 close( IN );
95
96 # run bowtie & csem
97
98 my $outfile_temp = tmpnam();
99
100 if ( $infile_format eq "fasta" ) {
101 system( "bowtie -f -v $n_mismatch -a -m $maxpos -p $n_core --quiet --concise $ref_genome $infile_name | csem $temp_reffile $window_size $n_iter $outfile_temp > /dev/null" ) == 0 or die "Error occurs while running either bowtie or csem!"
102 } elsif ( $infile_format eq "fastq" ) {
103 system( "bowtie -q -v $n_mismatch -a -m $maxpos -p $n_core --quiet --concise $ref_genome $infile_name | csem $temp_reffile $window_size $n_iter $outfile_temp > /dev/null" ) == 0 or die "Error occurs while running either bowtie or csem!"
104 } else {
105 print "Inappropriate aligned read file format!\n";
106 exit 1;
107 }
108
109 # post-process chromosome & position
110
111 open( IN, $outfile_temp ) or die "Cannot open csem file!\n";
112 open( OUT, ">", $outfile_name ) or die "Cannot open output file!\n";
113
114 foreach $line (<IN>) {
115 chomp($line);
116 my @element = split( /\s/, $line );
117 # assume columns are separated by some white space
118
119 # check for invalid line: may cause error in exporting step
120
121 if ( scalar(@element)<5 ) {
122 next;
123 }
124
125 # post-process lines
126
127 my ($id, $chr, $str, $loc, $prob) = @element;
128 if ( $outfile_format ne "bed" ) {
129 # first base is 0 in bowtie or BED
130 # first base is 1 in table or GFF
131 $loc++;
132 }
133 my $chrname = $names[$chr]; # translate chromosome
134
135 # write down processed lines
136 # - generate pseudo-tags, if necessary (adapted from "round_tag_to_integer.pl")
137
138 if ( $pseudo_tags eq "Y" ) {
139 # if we want to generate pseudo-tags,
140 # then threshold prob at 0.5 & round prob to integer (i.e., set to one)
141 # (exclude prob = 0.5 as well in order to avoid a read appears more than once)
142
143 if ( $prob <= 0.5 ) {
144 next;
145 } else {
146 $prob = 1;
147 }
148 }
149
150 # write down results
151
152 my $start;
153 my $end;
154 my $score;
155
156 my $id_final = $readID[$id];
157 #my $id_final = $id;
158
159 if ( $outfile_format eq "table" ) {
160 print OUT "$id_final\t$chrname $str $loc $prob\n";
161 } elsif ( $outfile_format eq "bed" ) {
162 # BED
163 # - name: read ID
164 # - score = prob * 1000
165
166 $start = $loc;
167 $end = $start + $read_length - 1;
168 my $name = $id_final;
169 $score = $prob * 1000;
170
171 print OUT "$chrname\t$start\t$end\t$name\t$score\t$str\n";
172 } elsif ( $outfile_format eq "gff" ) {
173 # GFF
174 # - source: "CSEM"
175 # - feature: read ID
176 # - score = prob * 1000
177
178 $start = $loc;
179 $end = $start + $read_length - 1;
180 my $source = "CSEM";
181 my $feature = $id_final;
182 $score = $prob * 1000;
183
184 print OUT "$chrname\t$source\t$feature\t$start\t$end\t$score\t$str\t.\t.\n";
185 } else {
186 print "Inappropriate output file format!\n";
187 exit 1;
188 }
189 }
190
191 close( IN );
192 close( OUT );