comparison microRNA.pl @ 50:7b5a48b972e9 draft

Uploaded
author big-tiandm
date Fri, 05 Dec 2014 00:11:02 -0500
parents
children
comparison
equal deleted inserted replaced
49:f008ab2cadc6 50:7b5a48b972e9
1 #!/usr/bin/perl -w
2 #Filename:
3 #Author: Tian Dongmei
4 #Email: tiandm@big.ac.cn
5 #Date: 2014-4-22
6 #Modified:
7 #Description: plant microRNA prediction
8 my $version=1.00;
9
10 use strict;
11 use Getopt::Long;
12 use threads;
13 #use threads::shared;
14 use File::Path;
15 use File::Basename;
16 #use RNA;
17 #use Term::ANSIColor;
18
19 my %opts;
20 GetOptions(\%opts,"i=s","fa=s","gfa=s","pre:s","mat:s","dis:i","flank:i","mfe:f","idx:s","mis:i","r:i","e:i","f:i","t:i","o:s","path:s","D","h");
21 if (!(defined $opts{i} and defined $opts{gfa}) || defined $opts{h}) { #necessary arguments
22 &usage;
23 }
24
25 my $time=&Time();
26 print "miPlant program start:\n The time is $time!\n";
27 print "Command line:\n $0 @ARGV\n";
28
29 my $mypath=`pwd`;
30 chomp $mypath;
31
32 my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/miRNA_out/";
33
34
35 unless ($dir=~/\/$/) {$dir.="/";}
36 if (not -d $dir) {
37 mkdir $dir;
38 }
39 my $config=$opts{'i'};
40 my $data=$opts{'fa'};
41
42 my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/";
43
44 my $t=1; #threads number
45 if (defined $opts{'t'}) {$t=$opts{'t'};}
46
47 my $mis=0; #mismatch number for microRNA
48 if (defined $opts{'mis'}) {$mis=$opts{'mis'};}
49
50 my $hit=25; # maximum reads mapping hits in genome
51 if (defined $opts{'r'}) {$hit=$opts{'r'};}
52
53 my $upstream = 2; # microRNA 5' extension
54 $upstream = $opts{'e'} if(defined $opts{'e'});
55
56 my $downstream = 5;# microRNA 3' extension
57 $downstream = $opts{'f'} if(defined $opts{'f'});
58
59 my $maxd=defined $opts{'dis'} ? $opts{'dis'} : 200;
60 my $flank=defined $opts{'flank'} ? $opts{'flank'} :10;
61 my $mfe=defined $opts{'mfe'} ? $opts{'mfe'} : -20;
62
63 $time=&Time();
64 print "$time, Checking input file!\n";
65
66 my (@filein,@mark);
67 &read_config();
68
69 &checkfa($opts{pre}) if(defined $opts{pre});
70 &checkfa($opts{mat}) if(defined $opts{mat});
71 &checkfa($opts{gfa});
72
73 chdir $dir;
74 my $data2=$data;
75 my $known_result=$dir."known_miRNA_Express";
76 if(defined $opts{pre} and defined $opts{mat}){
77 &quantify(); ### known microRAN quantify
78 $data2=$known_result."/mirbase_not_mapped.fa";
79 }
80
81 my $genome_map=$dir."genome_match";
82 &genome($data2);
83
84 #my $genome_map=&search($dir,"genome_match_");
85 my $mapfile=$genome_map."/genome_mapped.bwt";
86 my $mapfa=$genome_map."/genome_mapped.fa";
87 my $unmap=$genome_map."/genome_not_mapped.fa";
88
89 #$time=Time();
90 #print "$time: Novel microRNA prediction!\n\n";
91
92 &predict($mapfa);
93
94 $time=Time();
95 print "$time: Program end!!\n";
96
97 ############################## sub programs ###################################
98 sub predict{
99 my ($file)=@_;
100 $time=&Time();
101 print "$time: Novel microRNA prediction!\n\n";
102 my $predict=$dir."Novel_miRNA_predict";
103 mkdir $predict;
104 chdir $predict;
105 system("perl $scipt_path/precursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe");
106 # print "\nprecursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe\n";
107
108 system("bowtie-build -f excised_precursor.fa excised_precursor");
109 # print "\nbowtie-build -f excised_precursor.fa excised_precursor\n";
110
111 system("bowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt 2> run.log");
112 # print "\nbowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt\n";
113
114 system("perl $scipt_path/convert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst");
115 # print "\nconvert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst\n";
116
117 system("sort -k 4 precursor_mapped.bst > signatures.bst");
118 # print "\nsort +3 -25 precursor_mapped.bst > ../signatures.bst\n";
119
120 chdir $dir;
121 system("perl $scipt_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd");
122 # print "\nmiRDeep_plant.pl $dir/signatures.bst $predict/excised_precursor_struc.txt tmp_dir -y > microRNA_prediction.txt\n";
123 #system("rm novel_tmp_dir -rf");
124 my $tag=join "," ,@mark;
125 system("perl $scipt_path/miRNA_Express_and_sequence.pl -i microRNA_prediction.mrd -list novel_microRNA_express.txt -fa novel_microRNA_mature.fa -pre novel_microRNA_precursor.fa -tag $tag");
126
127 system("perl $scipt_path/non_miRNA_reads.pl -i microRNA_prediction.mrd -fa $file -o non_microRNA_sequence.fa");
128
129 }
130
131 sub genome{
132 my ($file)=@_;
133 if(defined $opts{'idx'}){
134 system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} ") ;
135 # print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\n";
136 }else{
137 system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir ") ;
138 # print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\n";
139 }
140 }
141
142 sub quantify{
143 my $tag=join "\\;" ,@mark;
144 system("perl $scipt_path/quantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag");
145 print "\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag\n";
146 }
147
148 sub read_config{
149 open CON,"<$config";
150 while (my $aline=<CON>) {
151 chomp $aline;
152 my @tmp=split/\t/,$aline;
153 push @filein,$tmp[0];
154 push @mark,$tmp[1];
155 #&check_rawdata($tmp[0]);
156 }
157 close CON;
158 if (@filein != @mark) {
159 #&printErr();
160 die "Maybe config file have some wrong!!!\n";
161 }
162 }
163 sub checkfa{
164 my ($file_reads)=@_;
165 open N,"<$file_reads";
166 my $line=<N>;
167 chomp $line;
168 if($line !~ /^>\S+/){
169 #printErr();
170 die "The first line of file $file_reads does not start with '>identifier'
171 Reads file $file_reads is not a valid fasta file\n\n";
172 }
173 if(<N> !~ /^[ACGTNacgtn]*$/){
174 #printErr();
175 die "File $file_reads contains not allowed characters in sequences
176 Allowed characters are ACGTN
177 Reads file $file_reads is not a fasta file\n\n";
178 }
179 close N;
180 }
181 sub search{
182 my ($dir,$str)=@_;
183 opendir I,$dir;
184 my @ret;
185 while (my $file=readdir I) {
186 if ($file=~/$str/) {
187 push @ret, $file;
188 }
189 }
190 closedir I;
191 if (@ret != 1) {
192 #&printErr();
193
194 die "Can not find directory or file which name has string: $str !!!\n";
195 }
196 return $ret[0];
197 }
198
199
200 sub Time{
201 my $time=time();
202 my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];
203 $month++;
204 $year+=1900;
205 if (length($sec) == 1) {$sec = "0"."$sec";}
206 if (length($min) == 1) {$min = "0"."$min";}
207 if (length($hour) == 1) {$hour = "0"."$hour";}
208 if (length($day) == 1) {$day = "0"."$day";}
209 if (length($month) == 1) {$month = "0"."$month";}
210 #print "$year-$month-$day $hour:$min:$sec\n";
211 return("$year-$month-$day $hour:$min:$sec");
212 }
213
214
215 sub usage{
216 print <<"USAGE";
217 Version $version
218 Usage:
219
220 $0 -i -fa -gfa -idx -pre -mat -mis -e -f -t -o -path
221 options:
222 -i input files, # config
223
224 -fa ,#fasta sequence file
225
226 -path scirpt path
227
228 -gfa string, input file # genome fasta. sequence file
229 -idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
230 string must be the prefix of the bowtie index. For instance, if
231 the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
232 the prefix is 'h_sapiens_37_asm'.##can be null
233
234 -pre string, input file #species specific microRNA precursor sequences
235 -mat string, input file #species specific microRNA mature sequences
236
237 -mis [int] number of allowed mismatches when mapping reads to precursors, default 0
238 -e [int] number of nucleotides upstream of the mature sequence to consider, default 2
239 -f [int] number of nucleotides downstream of the mature sequence to consider, default 5
240 -r int a read is allowed to map up to this number of positions in the genome,default is 25
241
242 -dis <int> Maximal space between miRNA and miRNA* (200)
243 -flank <int> Flank sequence length of miRNA precursor (10)
244 -mfe <folat> Maximal free energy allowed for a miRNA precursor (-20)
245
246 -t int, number of threads [1]
247
248 -o output directory# absolute path
249 -h help
250 USAGE
251 exit(1);
252 }
253