Mercurial > repos > big-tiandm > mirplant2
comparison miRPlant.pl @ 14:554fbaf5f451 draft
Uploaded
author | big-tiandm |
---|---|
date | Fri, 25 Jul 2014 05:21:31 -0400 |
parents | |
children | 0c4e11018934 |
comparison
equal
deleted
inserted
replaced
13:d1cc2e6ecf90 | 14:554fbaf5f451 |
---|---|
1 #!/usr/bin/perl -w | |
2 #Filename: | |
3 #Author: Tian Dongmei | |
4 #Email: tiandm@big.ac.cn | |
5 #Date: 2014-4-22 | |
6 #Modified: | |
7 #Description: plant microRNA prediction | |
8 my $version=1.00; | |
9 | |
10 use strict; | |
11 use Getopt::Long; | |
12 use threads; | |
13 use threads::shared; | |
14 use File::Path; | |
15 use File::Basename; | |
16 #use RNA; | |
17 use Term::ANSIColor; | |
18 | |
19 my %opts; | |
20 GetOptions(\%opts,"i:s@","tag:s@","format=s","gfa=s","pre=s","mat=s","rfam:s","dis:i","flank:i","mfe:f","idx:s","idx2:s","mis:i","r:i","v:i","e:i","f:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","D","h"); | |
21 if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} and defined $opts{pre} and defined $opts{mat}) || defined $opts{h}) { #necessary arguments | |
22 &usage; | |
23 } | |
24 | |
25 my $time=&Time(); | |
26 print "miPlant program start:\n The time is $time!\n"; | |
27 print "Command line:\n $0 @ARGV\n"; | |
28 | |
29 my $format=$opts{'format'}; | |
30 if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { | |
31 &printErr(); | |
32 die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n"; | |
33 } | |
34 | |
35 my $phred_qv=64; | |
36 | |
37 | |
38 my @inputfiles=@{$opts{'i'}}; | |
39 my @inputtags=@{$opts{'tag'}}; | |
40 | |
41 my $mypath=`pwd`; | |
42 chomp $mypath; | |
43 | |
44 my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/miRPlant_out/"; | |
45 | |
46 | |
47 unless ($dir=~/\/$/) {$dir.="/";} | |
48 if (not -d $dir) { | |
49 mkdir $dir; | |
50 } | |
51 my $config=$dir."/input_config"; | |
52 open CONFIG,">$config"; | |
53 for (my $i=0;$i<@inputfiles;$i++) { | |
54 print CONFIG $inputfiles[$i],"\t",$inputtags[$i],"\n"; | |
55 } | |
56 close CONFIG; | |
57 | |
58 my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/"; | |
59 | |
60 my $a="ATCTCGTATG"; #adapter | |
61 if (defined $opts{'a'}) {$a=$opts{'a'};} | |
62 | |
63 my $m=6; #adapter minimum mapped nt | |
64 if (defined $opts{'M'}) {$m=$opts{'M'};} | |
65 | |
66 my $t=1; #threads number | |
67 if (defined $opts{'t'}) {$t=$opts{'t'};} | |
68 | |
69 my $min_nt=19; # minimum reads length | |
70 if (defined $opts{'min'}) {$min_nt=$opts{'min'};} | |
71 | |
72 my $max_nt=28; #maximum reads length | |
73 if (defined $opts{'max'}) {$max_nt=$opts{'max'};} | |
74 | |
75 my $mis=0; #mismatch number for microRNA | |
76 if (defined $opts{'mis'}) {$mis=$opts{'mis'};} | |
77 | |
78 my $mis_rfam=0;# mismatch number for rfam | |
79 if (defined $opts{'v'}) {$mis_rfam=$opts{'v'};} | |
80 | |
81 my $hit=25; # maximum reads mapping hits in genome | |
82 if (defined $opts{'r'}) {$hit=$opts{'r'};} | |
83 | |
84 my $upstream = 2; # microRNA 5' extension | |
85 $upstream = $opts{'e'} if(defined $opts{'e'}); | |
86 | |
87 my $downstream = 5;# microRNA 3' extension | |
88 $downstream = $opts{'f'} if(defined $opts{'f'}); | |
89 | |
90 my $maxd=defined $opts{'dis'} ? $opts{'dis'} : 200; | |
91 my $flank=defined $opts{'flank'} ? $opts{'flank'} :10; | |
92 my $mfe=defined $opts{'mfe'} ? $opts{'mfe'} : -20; | |
93 | |
94 $time=&Time(); | |
95 print "$time, Checking input file!\n"; | |
96 | |
97 my (@filein,@mark,@clean); | |
98 #&read_config(); | |
99 @filein=@inputfiles; | |
100 @mark=@inputtags; | |
101 | |
102 &checkfa($opts{pre}); | |
103 &checkfa($opts{mat}); | |
104 &checkfa($opts{gfa}); | |
105 | |
106 | |
107 ##### clip adpter --> clean data start | |
108 $time=&Time(); | |
109 print "$time, Preprocess:\n trim adapter, reads collapse and filter reads by length.\n"; | |
110 | |
111 $time=~s/:/-/g; | |
112 $time=~s/ /-/g; | |
113 my $preprocess=$dir."preProcess_${time}/"; | |
114 mkdir $preprocess; | |
115 my $can_use_threads = eval 'use threads; 1'; | |
116 if ($can_use_threads) { | |
117 # Do processing using threads | |
118 print "Do processing using threads\n"; | |
119 my @filein1=@filein; my @mark1=@mark; | |
120 while (@filein1>0) { | |
121 my @thrs; my @res; | |
122 for (my $i=0;$i<$t ;$i++) { | |
123 last if(@filein1==0); | |
124 my $in=shift @filein1; | |
125 my $out=shift @mark1; | |
126 push @clean,$preprocess.$out."_clips_adapter.fq"; | |
127 $thrs[$i]=threads->create(\&clips,$in,$out); | |
128 } | |
129 for (my $i=0;$i<@thrs;$i++) { | |
130 $res[$i]=$thrs[$i]->join(); | |
131 } | |
132 } | |
133 } else { | |
134 # Do not processing using threads | |
135 print "Do not processing using threads\n"; | |
136 for (my $i=0;$i<@filein ;$i++) { | |
137 my $in=$filein[$i]; | |
138 my $out=$mark[$i]; | |
139 push @clean,$preprocess.$out."_clips_adapter.fq"; | |
140 &clips($in,$out); | |
141 } | |
142 } | |
143 | |
144 ##### clip adpter --> clean data end | |
145 | |
146 my $collapsed=$preprocess."collapse_reads.fa"; | |
147 my $data=$preprocess."collapse_reads_${min_nt}_${max_nt}.fa"; ## raw clean data | |
148 my $data2; ### mirbase not mapped reads | |
149 my $data3; ### rfam not mapped reads | |
150 &collapse(\@clean,$collapsed); #collapse reads to tags | |
151 | |
152 &filterbylength(); # filter <$min_nt && >$max_nt | |
153 | |
154 print "The final clean data file is $data, only contains reads which length is among $min_nt\~$max_nt\n\n"; | |
155 | |
156 $time=Time(); | |
157 print "$time: known microRNA quantify!\n\n"; | |
158 | |
159 chdir $dir; | |
160 | |
161 $time=~s/:/-/g; | |
162 $time=~s/ /-/g; | |
163 my $known_result=$dir."miRNA_Express_${time}/"; | |
164 &quantify(); ### known microRAN quantify | |
165 | |
166 | |
167 #my $miR_exp_dir=&search($known_result,"miRNA_Express_"); | |
168 $data2=$known_result."/mirbase_not_mapped.fa"; | |
169 | |
170 my $pathfile="$dir/path.txt"; | |
171 open PA,">$pathfile"; | |
172 print PA "$config\n"; | |
173 print PA "$preprocess\n"; | |
174 print PA "$known_result\n"; | |
175 | |
176 if (defined $opts{'rfam'}) { #rfam mapping and analysis | |
177 $time=Time(); | |
178 print "$time: RNA annotate!\n\n"; | |
179 $time=~s/:/-/g; | |
180 $time=~s/ /-/g; | |
181 my $rfam_exp_dir=$dir."rfam_match_${time}"; | |
182 &rfam(); | |
183 #my $rfam_exp_dir=&search($dir,"rfam_match_"); | |
184 $data3=$rfam_exp_dir."/rfam_not_mapped.fa"; | |
185 print PA "$rfam_exp_dir\n"; | |
186 | |
187 my $tag=join "\\;" ,@mark; | |
188 system("perl $scipt_path/count_rfam_express.pl -i $rfam_exp_dir/rfam_mapped.bwt -tag $tag -o rfam_non-miRNA_annotation.txt"); | |
189 } | |
190 | |
191 my $data4=$data; | |
192 if (defined $opts{'D'}) { #genome mapping | |
193 $data4=$data3; | |
194 }else{ | |
195 $data4=$data2; | |
196 } | |
197 | |
198 $time=Time(); | |
199 print "$time: Genome alignment!\n\n"; | |
200 $time=~s/:/-/g; | |
201 $time=~s/ /-/g; | |
202 my $genome_map=$dir."genome_match_${time}"; | |
203 &genome($data4); | |
204 print PA "$genome_map\n"; | |
205 #my $genome_map=&search($dir,"genome_match_"); | |
206 my $mapfile=$genome_map."/genome_mapped.bwt"; | |
207 my $mapfa=$genome_map."/genome_mapped.fa"; | |
208 my $unmap=$genome_map."/genome_not_mapped.fa"; | |
209 | |
210 #$time=Time(); | |
211 #print "$time: Novel microRNA prediction!\n\n"; | |
212 | |
213 &predict($mapfa); | |
214 | |
215 close PA; | |
216 system("perl $scipt_path/html.pl -i $pathfile -format $format -o $dir/result.html"); | |
217 | |
218 $time=Time(); | |
219 print "$time: Program end!!\n"; | |
220 | |
221 ############################## sub programs ################################### | |
222 sub predict{ | |
223 my ($file)=@_; | |
224 $time=&Time(); | |
225 print "$time: Novel microRNA prediction!\n\n"; | |
226 $time=~s/:/-/g; | |
227 $time=~s/ /-/g; | |
228 my $predict=$dir."miRNA_predict_${time}"; | |
229 print PA "$predict\n"; | |
230 mkdir $predict; | |
231 chdir $predict; | |
232 system("perl $scipt_path/precursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe"); | |
233 # print "\nprecursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe\n"; | |
234 | |
235 system("bowtie-build -f excised_precursor.fa excised_precursor"); | |
236 # print "\nbowtie-build -f excised_precursor.fa excised_precursor\n"; | |
237 | |
238 system("bowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt"); | |
239 # print "\nbowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt\n"; | |
240 | |
241 system("perl $scipt_path/convert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst"); | |
242 # print "\nconvert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst\n"; | |
243 | |
244 system("sort +3 -25 precursor_mapped.bst > signatures.bst"); | |
245 # print "\nsort +3 -25 precursor_mapped.bst > ../signatures.bst\n"; | |
246 | |
247 chdir $dir; | |
248 system("perl $scipt_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd"); | |
249 # print "\nmiRDeep_plant.pl $dir/signatures.bst $predict/excised_precursor_struc.txt tmp_dir -y > microRNA_prediction.txt\n"; | |
250 system("rm novel_tmp_dir -rf"); | |
251 my $tag=join "," ,@mark; | |
252 system("perl $scipt_path/miRNA_Express_and_sequence.pl -i microRNA_prediction.mrd -list novel_microRNA_express.txt -fa novel_microRNA_mature.fa -pre novel_microRNA_precursor.fa -tag $tag"); | |
253 } | |
254 | |
255 sub genome{ | |
256 my ($file)=@_; | |
257 if(defined $opts{'idx'}){ | |
258 system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time") ; | |
259 # print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\n"; | |
260 }else{ | |
261 system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time") ; | |
262 # print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\n"; | |
263 } | |
264 } | |
265 sub rfam{ | |
266 if (defined $opts{'idx2'}) { | |
267 system("perl $scipt_path/rfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -index $opts{idx2} -time $time"); | |
268 # print "\nrfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -index $opts{idx2} -time $time\n"; | |
269 }else{ | |
270 system("perl $scipt_path/rfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -time $time"); | |
271 # print "\nrfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -time $time\n"; | |
272 } | |
273 } | |
274 sub quantify{ | |
275 my $tag=join "\\;" ,@mark; | |
276 system("perl $scipt_path/quantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -time $time -mis $mis -t $t -e $upstream -f $downstream -tag $tag"); | |
277 # print "\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -time $time -mis $mis -t $t -e $upstream -f $downstream -tag $tag\n"; | |
278 } | |
279 sub filterbylength{ | |
280 my $tmpmark=join ",", @mark; | |
281 system("perl $scipt_path/filterReadsByLength.pl -i $collapsed -o $data -min $min_nt -max $max_nt -mark $tmpmark"); | |
282 # print "\nfilterReadsByLength.pl -i $collapsed -o $data -min $min_nt -max $max_nt -mark $tmpmark\n"; | |
283 | |
284 } | |
285 sub collapse{ | |
286 my ($ins,$data)=@_; | |
287 my $str=""; | |
288 for (my $i=0;$i<@{$ins};$i++) { | |
289 $str .="-i $$ins[$i] "; | |
290 } | |
291 system ("perl $scipt_path/collapseReads2Tags.pl $str -mark seq -o $data -format $format"); | |
292 # print "\ncollapseReads2Tags.pl $str -mark seq -o $data -format $format\n"; | |
293 } | |
294 | |
295 sub clips{ | |
296 my ($in,$out)=@_; | |
297 my $adapter=$preprocess.$out."_clips_adapter.fq"; | |
298 if($format eq "fq" || $format eq "fastq"){ | |
299 system("fastx_clipper -a $a -M $m -Q $phred_qv -i $in -o $adapter") ; | |
300 print "\nfastx_clipper -a $a -M $m -Q $phred_qv -i $in -o $adapter\n"; | |
301 } | |
302 if($format eq "fa" || $format eq "fasta"){ | |
303 system("fastx_clipper -a $a -M $m -i $in -o $adapter") ; | |
304 # print "\nfastx_clipper -a $a -M $m -i $in -o $adapter\n"; | |
305 } | |
306 #my $clean=$preprocess.$out."_clean.fq"; | |
307 #system("filterReadsByLength.pl -i $adapter -o $clean -min $min_nt -max $max_nt "); | |
308 | |
309 return; | |
310 } | |
311 | |
312 sub read_config{ | |
313 open CON,"<$config"; | |
314 while (my $aline=<CON>) { | |
315 chomp $aline; | |
316 my @tmp=split/\t/,$aline; | |
317 push @filein,$tmp[0]; | |
318 push @mark,$tmp[1]; | |
319 &check_rawdata($tmp[0]); | |
320 } | |
321 close CON; | |
322 if (@filein != @mark) { | |
323 &printErr(); | |
324 die "Maybe config file have some wrong!!!\n"; | |
325 } | |
326 } | |
327 sub check_rawdata{ | |
328 my ($fileforcheck)=@_; | |
329 if (!(-s $fileforcheck)) { | |
330 &printErr(); | |
331 die "Can not find $fileforcheck, or file is empty!!!\n"; | |
332 } | |
333 if ($format eq "fasta" || $format eq "fa") { | |
334 &checkfa($fileforcheck); | |
335 } | |
336 if ($format eq "fastq" || $format eq "fq") { | |
337 &checkfq($fileforcheck); | |
338 } | |
339 } | |
340 sub checkfa{ | |
341 my ($file_reads)=@_; | |
342 open N,"<$file_reads"; | |
343 my $line=<N>; | |
344 chomp $line; | |
345 if($line !~ /^>\S+/){ | |
346 printErr(); | |
347 die "The first line of file $file_reads does not start with '>identifier' | |
348 Reads file $file_reads is not a valid fasta file\n\n"; | |
349 } | |
350 if(<N> !~ /^[ACGTNacgtn]*$/){ | |
351 printErr(); | |
352 die "File $file_reads contains not allowed characters in sequences | |
353 Allowed characters are ACGTN | |
354 Reads file $file_reads is not a fasta file\n\n"; | |
355 } | |
356 close N; | |
357 } | |
358 sub checkfq{ | |
359 my ($file_reads)=@_; | |
360 | |
361 open N,"<$file_reads"; | |
362 for (my $i=0;$i<10;$i++) { | |
363 my $a=<N>; | |
364 my $b=<N>; | |
365 my $c=<N>; | |
366 my $d=<N>; | |
367 chomp $a; | |
368 chomp $b; | |
369 chomp $c; | |
370 chomp $d; | |
371 if($a!~/^\@/){ | |
372 &printErr(); | |
373 die "$file_reads is not a fastq file\n\n"; | |
374 } | |
375 if($b!~ /^[ACGTNacgtn]*$/){ | |
376 &printErr(); | |
377 die "File $file_reads contains not allowed characters in sequences | |
378 Allowed characters are ACGTN | |
379 Reads file $file_reads is not a fasta file\n\n"; | |
380 } | |
381 if ($c!~/^\@/ && $c!~/^\+/) { | |
382 &printErr(); | |
383 die "$file_reads is not a fastq file\n\n"; | |
384 } | |
385 if ((length $b) != (length $d)) { | |
386 &printErr(); | |
387 die "$file_reads is not a fastq file\n\n"; | |
388 } | |
389 my @qv=split //,$d; | |
390 for (my $j=0;$j<@qv ;$j++) { | |
391 my $q=ord($qv[$j])-64; | |
392 if($q<0){$phred_qv=33;} | |
393 } | |
394 } | |
395 close N; | |
396 } | |
397 | |
398 sub search{ | |
399 my ($dir,$str)=@_; | |
400 opendir I,$dir; | |
401 my @ret; | |
402 while (my $file=readdir I) { | |
403 if ($file=~/$str/) { | |
404 push @ret, $file; | |
405 } | |
406 } | |
407 closedir I; | |
408 if (@ret != 1) { | |
409 &printErr(); | |
410 | |
411 die "Can not find directory or file which name has string: $str !!!\n"; | |
412 } | |
413 return $ret[0]; | |
414 } | |
415 | |
416 sub printErr{ | |
417 print STDERR color 'bold red'; | |
418 print STDERR "Error: "; | |
419 print STDERR color 'reset'; | |
420 } | |
421 =cut | |
422 sub Time{ | |
423 my $time=time(); | |
424 my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; | |
425 $month++; | |
426 $year+=1900; | |
427 if (length($sec) == 1) {$sec = "0"."$sec";} | |
428 if (length($min) == 1) {$min = "0"."$min";} | |
429 if (length($hour) == 1) {$hour = "0"."$hour";} | |
430 if (length($day) == 1) {$day = "0"."$day";} | |
431 if (length($month) == 1) {$month = "0"."$month";} | |
432 #print "$year-$month-$day $hour:$min:$sec\n"; | |
433 return("$year-$month-$day-$hour-$min-$sec"); | |
434 } | |
435 =cut | |
436 sub Time{ | |
437 my $time=time(); | |
438 my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; | |
439 $month++; | |
440 $year+=1900; | |
441 if (length($sec) == 1) {$sec = "0"."$sec";} | |
442 if (length($min) == 1) {$min = "0"."$min";} | |
443 if (length($hour) == 1) {$hour = "0"."$hour";} | |
444 if (length($day) == 1) {$day = "0"."$day";} | |
445 if (length($month) == 1) {$month = "0"."$month";} | |
446 #print "$year-$month-$day $hour:$min:$sec\n"; | |
447 return("$year-$month-$day $hour:$min:$sec"); | |
448 } | |
449 | |
450 | |
451 sub usage{ | |
452 print <<"USAGE"; | |
453 Version $version | |
454 Usage: | |
455 $0 -i -format -gfa -index -pre -mat -rfam -D -a -M -min -max -mis -e -f -v -t -o -path | |
456 options: | |
457 -i string, input file#input files information file | |
458 /path/filename mark | |
459 /path/filename mark | |
460 ... | |
461 | |
462 -format string,#specific input rawdata file format : fastq|fq|fasta|fa | |
463 | |
464 -gfa string, input file # genome fasta. sequence file | |
465 -idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter | |
466 string must be the prefix of the bowtie index. For instance, if | |
467 the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then | |
468 the prefix is 'h_sapiens_37_asm'.##can be null | |
469 | |
470 -pre string, input file #species specific microRNA precursor sequences | |
471 -mat string, input file #species specific microRNA mature sequences | |
472 | |
473 -rfam string, input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count. | |
474 -idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter | |
475 string must be the prefix of the bowtie index. For instance, if | |
476 the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then | |
477 the prefix is 'h_sapiens_37_asm'.##can be null | |
478 | |
479 -D If [-D] is specified,will discard rfam mapped reads(nead -rfam). | |
480 | |
481 -a string, ADAPTER string. default is ATCTCGTATG. | |
482 -M int, require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don't clip it. | |
483 -min int, reads min length,default is 19. | |
484 -max int, reads max length,default is 28. | |
485 | |
486 -mis [int] number of allowed mismatches when mapping reads to precursors, default 0 | |
487 -e [int] number of nucleotides upstream of the mature sequence to consider, default 2 | |
488 -f [int] number of nucleotides downstream of the mature sequence to consider, default 5 | |
489 -v <int> report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment | |
490 -r int a read is allowed to map up to this number of positions in the genome,default is 25 | |
491 | |
492 -dis <int> Maximal space between miRNA and miRNA* (200) | |
493 -flank <int> Flank sequence length of miRNA precursor (10) | |
494 -mfe <folat> Maximal free energy allowed for a miRNA precursor (-20) | |
495 | |
496 -t int, number of threads [1] | |
497 | |
498 -o output directory# absolute path | |
499 -h help | |
500 USAGE | |
501 exit(1); | |
502 } | |
503 |