Mercurial > repos > dongjun > csem
comparison csem_wrapper.pl @ 3:3ce7ee6f43a7
Uploaded
author | dongjun |
---|---|
date | Mon, 12 Sep 2011 10:01:12 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:7796cbc040c4 | 3:3ce7ee6f43a7 |
---|---|
1 # Wrapper for CSEM with Bowtie | |
2 # Written by Dongjun Chung, Sep. 8, 2011 | |
3 | |
4 #!/usr/bin/env perl; | |
5 #use warnings; | |
6 #use strict; | |
7 use File::Temp qw/tempfile/; | |
8 use File::Temp qw/tmpnam/; | |
9 | |
10 # parse command arguments | |
11 | |
12 die "Usage: perl csem_wrapper.pl [infile_name] [infile_format] [outfile_name] [outfile_format] [ref_genome] [pseudo_tags] [n_mismatch] [maxpos] [window_size] [n_iter] [n_core]" unless @ARGV == 11; | |
13 | |
14 my ( $infile_name, $infile_format, $outfile_name, $outfile_format, $ref_genome, $pseudo_tags, $n_mismatch, $maxpos, $window_size, $n_iter, $n_core ) = @ARGV; | |
15 | |
16 # construct ref genome file (adapted from "genRef.pl") | |
17 | |
18 open ( IN, "bowtie-inspect -s $ref_genome |" ) or die "Cannot run bowtie-inspect!\n"; | |
19 | |
20 my $line; | |
21 | |
22 my $size = 0; | |
23 my (@names, @lens) = (); | |
24 # $size: # of chromosomes | |
25 # @lens: chromosome size | |
26 # @names: chromosome name | |
27 | |
28 for (my $i = 0; $i < 3; $i++) { | |
29 # skip unnecessary lines | |
30 $line = <IN>; | |
31 } | |
32 | |
33 while ( $line = <IN> ) { | |
34 ++$size; | |
35 chomp($line); | |
36 my ($seqn, $name, $len) = split(/[ \t]+/, $line); | |
37 push(@names, $name); | |
38 push(@lens, $len); | |
39 } | |
40 close(IN); | |
41 | |
42 my ($fh, $temp_reffile) = tempfile(); | |
43 print $fh "$size\n"; | |
44 print $fh "@lens\n"; | |
45 print $fh "@names\n"; | |
46 close($fh); | |
47 | |
48 # extract read length from FASTA/FASTQ files | |
49 | |
50 open( IN, $infile_name ) or die "Cannot open tag file!\n"; | |
51 | |
52 $line = <IN>; | |
53 if ( $infile_format eq "fasta" ) { | |
54 while ( $line =~ /^>/ ) { | |
55 $line = <IN>; | |
56 } | |
57 } elsif ( $infile_format eq "fastq" ) { | |
58 while ( $line =~ /^@/ ) { | |
59 $line = <IN>; | |
60 } | |
61 } else { | |
62 print "Inappropriate aligned read file format!\n"; | |
63 exit 1; | |
64 } | |
65 chomp($line); | |
66 my $read_length = length $line; | |
67 | |
68 close( IN ); | |
69 | |
70 # extract read ID | |
71 | |
72 open( IN, $infile_name ) or die "Cannot open tag file!\n"; | |
73 | |
74 my @readID = (); | |
75 if ( $infile_format eq "fasta" ) { | |
76 foreach $line (<IN>) { | |
77 chomp($line); | |
78 if ( $line =~ /^>(\S+)/ ) { | |
79 push @readID, $1; | |
80 } | |
81 } | |
82 } elsif ( $infile_format eq "fastq" ) { | |
83 foreach $line (<IN>) { | |
84 chomp($line); | |
85 if ( $line =~ /^@(\S+)/ ) { | |
86 push @readID, $1; | |
87 } | |
88 } | |
89 } else { | |
90 print "Inappropriate aligned read file format!\n"; | |
91 exit 1; | |
92 } | |
93 | |
94 close( IN ); | |
95 | |
96 # run bowtie & csem | |
97 | |
98 my $outfile_temp = tmpnam(); | |
99 | |
100 if ( $infile_format eq "fasta" ) { | |
101 system( "bowtie -f -v $n_mismatch -a -m $maxpos -p $n_core --quiet --concise $ref_genome $infile_name | csem $temp_reffile $window_size $n_iter $outfile_temp > /dev/null" ) == 0 or die "Error occurs while running either bowtie or csem!" | |
102 } elsif ( $infile_format eq "fastq" ) { | |
103 system( "bowtie -q -v $n_mismatch -a -m $maxpos -p $n_core --quiet --concise $ref_genome $infile_name | csem $temp_reffile $window_size $n_iter $outfile_temp > /dev/null" ) == 0 or die "Error occurs while running either bowtie or csem!" | |
104 } else { | |
105 print "Inappropriate aligned read file format!\n"; | |
106 exit 1; | |
107 } | |
108 | |
109 # post-process chromosome & position | |
110 | |
111 open( IN, $outfile_temp ) or die "Cannot open csem file!\n"; | |
112 open( OUT, ">", $outfile_name ) or die "Cannot open output file!\n"; | |
113 | |
114 foreach $line (<IN>) { | |
115 chomp($line); | |
116 my @element = split( /\s/, $line ); | |
117 # assume columns are separated by some white space | |
118 | |
119 # check for invalid line: may cause error in exporting step | |
120 | |
121 if ( scalar(@element)<5 ) { | |
122 next; | |
123 } | |
124 | |
125 # post-process lines | |
126 | |
127 my ($id, $chr, $str, $loc, $prob) = @element; | |
128 if ( $outfile_format ne "bed" ) { | |
129 # first base is 0 in bowtie or BED | |
130 # first base is 1 in table or GFF | |
131 $loc++; | |
132 } | |
133 my $chrname = $names[$chr]; # translate chromosome | |
134 | |
135 # write down processed lines | |
136 # - generate pseudo-tags, if necessary (adapted from "round_tag_to_integer.pl") | |
137 | |
138 if ( $pseudo_tags eq "Y" ) { | |
139 # if we want to generate pseudo-tags, | |
140 # then threshold prob at 0.5 & round prob to integer (i.e., set to one) | |
141 # (exclude prob = 0.5 as well in order to avoid a read appears more than once) | |
142 | |
143 if ( $prob <= 0.5 ) { | |
144 next; | |
145 } else { | |
146 $prob = 1; | |
147 } | |
148 } | |
149 | |
150 # write down results | |
151 | |
152 my $start; | |
153 my $end; | |
154 my $score; | |
155 | |
156 my $id_final = $readID[$id]; | |
157 #my $id_final = $id; | |
158 | |
159 if ( $outfile_format eq "table" ) { | |
160 print OUT "$id_final\t$chrname $str $loc $prob\n"; | |
161 } elsif ( $outfile_format eq "bed" ) { | |
162 # BED | |
163 # - name: read ID | |
164 # - score = prob * 1000 | |
165 | |
166 $start = $loc; | |
167 $end = $start + $read_length - 1; | |
168 my $name = $id_final; | |
169 $score = $prob * 1000; | |
170 | |
171 print OUT "$chrname\t$start\t$end\t$name\t$score\t$str\n"; | |
172 } elsif ( $outfile_format eq "gff" ) { | |
173 # GFF | |
174 # - source: "CSEM" | |
175 # - feature: read ID | |
176 # - score = prob * 1000 | |
177 | |
178 $start = $loc; | |
179 $end = $start + $read_length - 1; | |
180 my $source = "CSEM"; | |
181 my $feature = $id_final; | |
182 $score = $prob * 1000; | |
183 | |
184 print OUT "$chrname\t$source\t$feature\t$start\t$end\t$score\t$str\t.\t.\n"; | |
185 } else { | |
186 print "Inappropriate output file format!\n"; | |
187 exit 1; | |
188 } | |
189 } | |
190 | |
191 close( IN ); | |
192 close( OUT ); |