annotate csem_wrapper.pl @ 3:3ce7ee6f43a7

Uploaded
author dongjun
date Mon, 12 Sep 2011 10:01:12 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
1 # Wrapper for CSEM with Bowtie
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
2 # Written by Dongjun Chung, Sep. 8, 2011
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
3
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
4 #!/usr/bin/env perl;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
5 #use warnings;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
6 #use strict;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
7 use File::Temp qw/tempfile/;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
8 use File::Temp qw/tmpnam/;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
9
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
10 # parse command arguments
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
11
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
12 die "Usage: perl csem_wrapper.pl [infile_name] [infile_format] [outfile_name] [outfile_format] [ref_genome] [pseudo_tags] [n_mismatch] [maxpos] [window_size] [n_iter] [n_core]" unless @ARGV == 11;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
13
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
14 my ( $infile_name, $infile_format, $outfile_name, $outfile_format, $ref_genome, $pseudo_tags, $n_mismatch, $maxpos, $window_size, $n_iter, $n_core ) = @ARGV;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
15
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
16 # construct ref genome file (adapted from "genRef.pl")
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
17
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
18 open ( IN, "bowtie-inspect -s $ref_genome |" ) or die "Cannot run bowtie-inspect!\n";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
19
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
20 my $line;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
21
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
22 my $size = 0;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
23 my (@names, @lens) = ();
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
24 # $size: # of chromosomes
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
25 # @lens: chromosome size
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
26 # @names: chromosome name
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
27
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
28 for (my $i = 0; $i < 3; $i++) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
29 # skip unnecessary lines
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
30 $line = <IN>;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
31 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
32
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
33 while ( $line = <IN> ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
34 ++$size;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
35 chomp($line);
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
36 my ($seqn, $name, $len) = split(/[ \t]+/, $line);
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
37 push(@names, $name);
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
38 push(@lens, $len);
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
39 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
40 close(IN);
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
41
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
42 my ($fh, $temp_reffile) = tempfile();
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
43 print $fh "$size\n";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
44 print $fh "@lens\n";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
45 print $fh "@names\n";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
46 close($fh);
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
47
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
48 # extract read length from FASTA/FASTQ files
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
49
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
50 open( IN, $infile_name ) or die "Cannot open tag file!\n";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
51
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
52 $line = <IN>;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
53 if ( $infile_format eq "fasta" ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
54 while ( $line =~ /^>/ ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
55 $line = <IN>;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
56 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
57 } elsif ( $infile_format eq "fastq" ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
58 while ( $line =~ /^@/ ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
59 $line = <IN>;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
60 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
61 } else {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
62 print "Inappropriate aligned read file format!\n";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
63 exit 1;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
64 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
65 chomp($line);
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
66 my $read_length = length $line;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
67
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
68 close( IN );
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
69
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
70 # extract read ID
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
71
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
72 open( IN, $infile_name ) or die "Cannot open tag file!\n";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
73
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
74 my @readID = ();
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
75 if ( $infile_format eq "fasta" ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
76 foreach $line (<IN>) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
77 chomp($line);
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
78 if ( $line =~ /^>(\S+)/ ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
79 push @readID, $1;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
80 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
81 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
82 } elsif ( $infile_format eq "fastq" ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
83 foreach $line (<IN>) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
84 chomp($line);
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
85 if ( $line =~ /^@(\S+)/ ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
86 push @readID, $1;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
87 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
88 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
89 } else {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
90 print "Inappropriate aligned read file format!\n";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
91 exit 1;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
92 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
93
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
94 close( IN );
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
95
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
96 # run bowtie & csem
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
97
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
98 my $outfile_temp = tmpnam();
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
99
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
100 if ( $infile_format eq "fasta" ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
101 system( "bowtie -f -v $n_mismatch -a -m $maxpos -p $n_core --quiet --concise $ref_genome $infile_name | csem $temp_reffile $window_size $n_iter $outfile_temp > /dev/null" ) == 0 or die "Error occurs while running either bowtie or csem!"
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
102 } elsif ( $infile_format eq "fastq" ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
103 system( "bowtie -q -v $n_mismatch -a -m $maxpos -p $n_core --quiet --concise $ref_genome $infile_name | csem $temp_reffile $window_size $n_iter $outfile_temp > /dev/null" ) == 0 or die "Error occurs while running either bowtie or csem!"
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
104 } else {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
105 print "Inappropriate aligned read file format!\n";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
106 exit 1;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
107 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
108
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
109 # post-process chromosome & position
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
110
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
111 open( IN, $outfile_temp ) or die "Cannot open csem file!\n";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
112 open( OUT, ">", $outfile_name ) or die "Cannot open output file!\n";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
113
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
114 foreach $line (<IN>) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
115 chomp($line);
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
116 my @element = split( /\s/, $line );
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
117 # assume columns are separated by some white space
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
118
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
119 # check for invalid line: may cause error in exporting step
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
120
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
121 if ( scalar(@element)<5 ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
122 next;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
123 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
124
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
125 # post-process lines
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
126
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
127 my ($id, $chr, $str, $loc, $prob) = @element;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
128 if ( $outfile_format ne "bed" ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
129 # first base is 0 in bowtie or BED
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
130 # first base is 1 in table or GFF
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
131 $loc++;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
132 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
133 my $chrname = $names[$chr]; # translate chromosome
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
134
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
135 # write down processed lines
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
136 # - generate pseudo-tags, if necessary (adapted from "round_tag_to_integer.pl")
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
137
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
138 if ( $pseudo_tags eq "Y" ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
139 # if we want to generate pseudo-tags,
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
140 # then threshold prob at 0.5 & round prob to integer (i.e., set to one)
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
141 # (exclude prob = 0.5 as well in order to avoid a read appears more than once)
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
142
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
143 if ( $prob <= 0.5 ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
144 next;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
145 } else {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
146 $prob = 1;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
147 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
148 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
149
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
150 # write down results
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
151
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
152 my $start;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
153 my $end;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
154 my $score;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
155
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
156 my $id_final = $readID[$id];
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
157 #my $id_final = $id;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
158
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
159 if ( $outfile_format eq "table" ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
160 print OUT "$id_final\t$chrname $str $loc $prob\n";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
161 } elsif ( $outfile_format eq "bed" ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
162 # BED
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
163 # - name: read ID
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
164 # - score = prob * 1000
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
165
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
166 $start = $loc;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
167 $end = $start + $read_length - 1;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
168 my $name = $id_final;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
169 $score = $prob * 1000;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
170
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
171 print OUT "$chrname\t$start\t$end\t$name\t$score\t$str\n";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
172 } elsif ( $outfile_format eq "gff" ) {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
173 # GFF
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
174 # - source: "CSEM"
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
175 # - feature: read ID
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
176 # - score = prob * 1000
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
177
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
178 $start = $loc;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
179 $end = $start + $read_length - 1;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
180 my $source = "CSEM";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
181 my $feature = $id_final;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
182 $score = $prob * 1000;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
183
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
184 print OUT "$chrname\t$source\t$feature\t$start\t$end\t$score\t$str\t.\t.\n";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
185 } else {
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
186 print "Inappropriate output file format!\n";
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
187 exit 1;
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
188 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
189 }
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
190
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
191 close( IN );
3ce7ee6f43a7 Uploaded
dongjun
parents:
diff changeset
192 close( OUT );