Mercurial > repos > big-tiandm > mirplant2
comparison microRNA.pl @ 50:7b5a48b972e9 draft
Uploaded
author | big-tiandm |
---|---|
date | Fri, 05 Dec 2014 00:11:02 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
49:f008ab2cadc6 | 50:7b5a48b972e9 |
---|---|
1 #!/usr/bin/perl -w | |
2 #Filename: | |
3 #Author: Tian Dongmei | |
4 #Email: tiandm@big.ac.cn | |
5 #Date: 2014-4-22 | |
6 #Modified: | |
7 #Description: plant microRNA prediction | |
8 my $version=1.00; | |
9 | |
10 use strict; | |
11 use Getopt::Long; | |
12 use threads; | |
13 #use threads::shared; | |
14 use File::Path; | |
15 use File::Basename; | |
16 #use RNA; | |
17 #use Term::ANSIColor; | |
18 | |
19 my %opts; | |
20 GetOptions(\%opts,"i=s","fa=s","gfa=s","pre:s","mat:s","dis:i","flank:i","mfe:f","idx:s","mis:i","r:i","e:i","f:i","t:i","o:s","path:s","D","h"); | |
21 if (!(defined $opts{i} and defined $opts{gfa}) || defined $opts{h}) { #necessary arguments | |
22 &usage; | |
23 } | |
24 | |
25 my $time=&Time(); | |
26 print "miPlant program start:\n The time is $time!\n"; | |
27 print "Command line:\n $0 @ARGV\n"; | |
28 | |
29 my $mypath=`pwd`; | |
30 chomp $mypath; | |
31 | |
32 my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/miRNA_out/"; | |
33 | |
34 | |
35 unless ($dir=~/\/$/) {$dir.="/";} | |
36 if (not -d $dir) { | |
37 mkdir $dir; | |
38 } | |
39 my $config=$opts{'i'}; | |
40 my $data=$opts{'fa'}; | |
41 | |
42 my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/"; | |
43 | |
44 my $t=1; #threads number | |
45 if (defined $opts{'t'}) {$t=$opts{'t'};} | |
46 | |
47 my $mis=0; #mismatch number for microRNA | |
48 if (defined $opts{'mis'}) {$mis=$opts{'mis'};} | |
49 | |
50 my $hit=25; # maximum reads mapping hits in genome | |
51 if (defined $opts{'r'}) {$hit=$opts{'r'};} | |
52 | |
53 my $upstream = 2; # microRNA 5' extension | |
54 $upstream = $opts{'e'} if(defined $opts{'e'}); | |
55 | |
56 my $downstream = 5;# microRNA 3' extension | |
57 $downstream = $opts{'f'} if(defined $opts{'f'}); | |
58 | |
59 my $maxd=defined $opts{'dis'} ? $opts{'dis'} : 200; | |
60 my $flank=defined $opts{'flank'} ? $opts{'flank'} :10; | |
61 my $mfe=defined $opts{'mfe'} ? $opts{'mfe'} : -20; | |
62 | |
63 $time=&Time(); | |
64 print "$time, Checking input file!\n"; | |
65 | |
66 my (@filein,@mark); | |
67 &read_config(); | |
68 | |
69 &checkfa($opts{pre}) if(defined $opts{pre}); | |
70 &checkfa($opts{mat}) if(defined $opts{mat}); | |
71 &checkfa($opts{gfa}); | |
72 | |
73 chdir $dir; | |
74 my $data2=$data; | |
75 my $known_result=$dir."known_miRNA_Express"; | |
76 if(defined $opts{pre} and defined $opts{mat}){ | |
77 &quantify(); ### known microRAN quantify | |
78 $data2=$known_result."/mirbase_not_mapped.fa"; | |
79 } | |
80 | |
81 my $genome_map=$dir."genome_match"; | |
82 &genome($data2); | |
83 | |
84 #my $genome_map=&search($dir,"genome_match_"); | |
85 my $mapfile=$genome_map."/genome_mapped.bwt"; | |
86 my $mapfa=$genome_map."/genome_mapped.fa"; | |
87 my $unmap=$genome_map."/genome_not_mapped.fa"; | |
88 | |
89 #$time=Time(); | |
90 #print "$time: Novel microRNA prediction!\n\n"; | |
91 | |
92 &predict($mapfa); | |
93 | |
94 $time=Time(); | |
95 print "$time: Program end!!\n"; | |
96 | |
97 ############################## sub programs ################################### | |
98 sub predict{ | |
99 my ($file)=@_; | |
100 $time=&Time(); | |
101 print "$time: Novel microRNA prediction!\n\n"; | |
102 my $predict=$dir."Novel_miRNA_predict"; | |
103 mkdir $predict; | |
104 chdir $predict; | |
105 system("perl $scipt_path/precursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe"); | |
106 # print "\nprecursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe\n"; | |
107 | |
108 system("bowtie-build -f excised_precursor.fa excised_precursor"); | |
109 # print "\nbowtie-build -f excised_precursor.fa excised_precursor\n"; | |
110 | |
111 system("bowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt 2> run.log"); | |
112 # print "\nbowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt\n"; | |
113 | |
114 system("perl $scipt_path/convert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst"); | |
115 # print "\nconvert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst\n"; | |
116 | |
117 system("sort -k 4 precursor_mapped.bst > signatures.bst"); | |
118 # print "\nsort +3 -25 precursor_mapped.bst > ../signatures.bst\n"; | |
119 | |
120 chdir $dir; | |
121 system("perl $scipt_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd"); | |
122 # print "\nmiRDeep_plant.pl $dir/signatures.bst $predict/excised_precursor_struc.txt tmp_dir -y > microRNA_prediction.txt\n"; | |
123 #system("rm novel_tmp_dir -rf"); | |
124 my $tag=join "," ,@mark; | |
125 system("perl $scipt_path/miRNA_Express_and_sequence.pl -i microRNA_prediction.mrd -list novel_microRNA_express.txt -fa novel_microRNA_mature.fa -pre novel_microRNA_precursor.fa -tag $tag"); | |
126 | |
127 system("perl $scipt_path/non_miRNA_reads.pl -i microRNA_prediction.mrd -fa $file -o non_microRNA_sequence.fa"); | |
128 | |
129 } | |
130 | |
131 sub genome{ | |
132 my ($file)=@_; | |
133 if(defined $opts{'idx'}){ | |
134 system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} ") ; | |
135 # print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\n"; | |
136 }else{ | |
137 system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir ") ; | |
138 # print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\n"; | |
139 } | |
140 } | |
141 | |
142 sub quantify{ | |
143 my $tag=join "\\;" ,@mark; | |
144 system("perl $scipt_path/quantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag"); | |
145 print "\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag\n"; | |
146 } | |
147 | |
148 sub read_config{ | |
149 open CON,"<$config"; | |
150 while (my $aline=<CON>) { | |
151 chomp $aline; | |
152 my @tmp=split/\t/,$aline; | |
153 push @filein,$tmp[0]; | |
154 push @mark,$tmp[1]; | |
155 #&check_rawdata($tmp[0]); | |
156 } | |
157 close CON; | |
158 if (@filein != @mark) { | |
159 #&printErr(); | |
160 die "Maybe config file have some wrong!!!\n"; | |
161 } | |
162 } | |
163 sub checkfa{ | |
164 my ($file_reads)=@_; | |
165 open N,"<$file_reads"; | |
166 my $line=<N>; | |
167 chomp $line; | |
168 if($line !~ /^>\S+/){ | |
169 #printErr(); | |
170 die "The first line of file $file_reads does not start with '>identifier' | |
171 Reads file $file_reads is not a valid fasta file\n\n"; | |
172 } | |
173 if(<N> !~ /^[ACGTNacgtn]*$/){ | |
174 #printErr(); | |
175 die "File $file_reads contains not allowed characters in sequences | |
176 Allowed characters are ACGTN | |
177 Reads file $file_reads is not a fasta file\n\n"; | |
178 } | |
179 close N; | |
180 } | |
181 sub search{ | |
182 my ($dir,$str)=@_; | |
183 opendir I,$dir; | |
184 my @ret; | |
185 while (my $file=readdir I) { | |
186 if ($file=~/$str/) { | |
187 push @ret, $file; | |
188 } | |
189 } | |
190 closedir I; | |
191 if (@ret != 1) { | |
192 #&printErr(); | |
193 | |
194 die "Can not find directory or file which name has string: $str !!!\n"; | |
195 } | |
196 return $ret[0]; | |
197 } | |
198 | |
199 | |
200 sub Time{ | |
201 my $time=time(); | |
202 my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; | |
203 $month++; | |
204 $year+=1900; | |
205 if (length($sec) == 1) {$sec = "0"."$sec";} | |
206 if (length($min) == 1) {$min = "0"."$min";} | |
207 if (length($hour) == 1) {$hour = "0"."$hour";} | |
208 if (length($day) == 1) {$day = "0"."$day";} | |
209 if (length($month) == 1) {$month = "0"."$month";} | |
210 #print "$year-$month-$day $hour:$min:$sec\n"; | |
211 return("$year-$month-$day $hour:$min:$sec"); | |
212 } | |
213 | |
214 | |
215 sub usage{ | |
216 print <<"USAGE"; | |
217 Version $version | |
218 Usage: | |
219 | |
220 $0 -i -fa -gfa -idx -pre -mat -mis -e -f -t -o -path | |
221 options: | |
222 -i input files, # config | |
223 | |
224 -fa ,#fasta sequence file | |
225 | |
226 -path scirpt path | |
227 | |
228 -gfa string, input file # genome fasta. sequence file | |
229 -idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter | |
230 string must be the prefix of the bowtie index. For instance, if | |
231 the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then | |
232 the prefix is 'h_sapiens_37_asm'.##can be null | |
233 | |
234 -pre string, input file #species specific microRNA precursor sequences | |
235 -mat string, input file #species specific microRNA mature sequences | |
236 | |
237 -mis [int] number of allowed mismatches when mapping reads to precursors, default 0 | |
238 -e [int] number of nucleotides upstream of the mature sequence to consider, default 2 | |
239 -f [int] number of nucleotides downstream of the mature sequence to consider, default 5 | |
240 -r int a read is allowed to map up to this number of positions in the genome,default is 25 | |
241 | |
242 -dis <int> Maximal space between miRNA and miRNA* (200) | |
243 -flank <int> Flank sequence length of miRNA precursor (10) | |
244 -mfe <folat> Maximal free energy allowed for a miRNA precursor (-20) | |
245 | |
246 -t int, number of threads [1] | |
247 | |
248 -o output directory# absolute path | |
249 -h help | |
250 USAGE | |
251 exit(1); | |
252 } | |
253 |