annotate miRNA_Express_and_sequence.pl @ 53:f5a2e8308836 draft default tip

Uploaded
author big-tiandm
date Mon, 08 Dec 2014 01:51:16 -0500
parents d1cc2e6ecf90
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
13
d1cc2e6ecf90 Uploaded
big-tiandm
parents:
diff changeset
#!/usr/bin/perl -w
#Filename:
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2014-6-4
#Modified:
#Description: solexa miRNA express and sequence
#
# Reads a prediction file (-i). For every record introduced by a line that
# starts with "score <number>" it writes:
#   -list : one tab-separated row (candidate ID, pri_id, mature and star
#           coordinates, then per-sample mature / star / total read counts)
#   -fa   : the mature sequence (mature_seq) as FASTA
#   -pre  : the pri_seq sequence as FASTA
# Candidates are named miR-c-1, miR-c-2, ... in input order.

# Kept as a string so the usage banner prints "1.00" rather than "1".
my $version='1.00';

use strict;
use Getopt::Long;

my %opts;
GetOptions(\%opts,"i=s","list=s","fa=s","pre=s","tag=s","h");

# All five value options are mandatory; -h always shows the help text.
my @missing_opts = grep { !defined $opts{$_} } qw(i list fa pre tag);
if (@missing_opts || defined $opts{h}) {
    usage();
}

my $filein  = $opts{'i'};
my $fileout = $opts{'list'};
my $out     = $opts{'fa'};
my $preout  = $opts{'pre'};

# Disabled in the original (it was wrapped in a "=cut ... =cut" block):
# loading FASTA header lines keyed by ID from an undeclared -p option.
# my %hash_pri;
# open PRI,"<$opts{p}";
# while (my $aline=<PRI>) {
#     chomp $aline;
#     if($aline=~/^>(\S+)/){$hash_pri{$1}=$aline;}
# }
# close PRI;

# Three-argument open with lexical handles; fail loudly instead of silently
# producing empty output when a path is wrong.
open my $in_fh,   '<', $filein  or die "Cannot open input file '$filein': $!\n";
open my $list_fh, '>', $fileout or die "Cannot open list file '$fileout': $!\n";
open my $fa_fh,   '>', $out     or die "Cannot open fasta file '$out': $!\n";
open my $pre_fh,  '>', $preout  or die "Cannot open precursor file '$preout': $!\n";

# Header: four fixed columns, then one column per sample for each of the
# mature, star and total expression groups.
my @marks = split /\,/, $opts{'tag'};
print {$list_fh} join("\t",
    '#ID', 'coordinate', 'pos1', 'pos2',
    (map { $_ . '_matureExp' } @marks),
    (map { $_ . '_starExp'   } @marks),
    (map { $_ . '_totalExp'  } @marks),
), "\n";

# Tagged lines of one record, in the order in which they are consumed.
# Every tag is searched for in turn even though only some values are used,
# so the stream position stays the same as in the original script.
my @record_tags = qw(
    flank_first_end flank_second_beg
    mature_beg mature_end mature_seq
    pre_seq pri_id pri_seq
    star_beg star_end star_seq
);

my $novel = 0;    # running candidate number

RECORD:
while (defined(my $aline = <$in_fh>)) {
    # Skip forward to the next "score <number>" line.
    next RECORD unless $aline =~ /^score\s+[-\d\.]+/;
    # A score line that is the very last line has no record body.
    last RECORD if eof($in_fh);

    ########## miRNA ID ################
    $novel++;

    ########### annotate ####################
    # Coordinates are reported exactly as given in the input; the original
    # carried a commented-out adjustment that subtracted flank_first_end.
    my %field;
    for my $tag (@record_tags) {
        my $cols = read_tagged_fields($in_fh, $tag)
            or die "Truncated record for miR-c-$novel in '$filein': "
                 . "no '$tag' line before end of file\n";
        $field{$tag} = $cols->[1];
    }

    print {$fa_fh}  ">miR-c-$novel\n$field{mature_seq}\n";
    print {$pre_fh} ">miR-c-$novel\n$field{pri_seq}\n";

    ########## reads count #############
    # The line directly after star_seq is always discarded.
    my $discarded = <$in_fh>;

    my (@mature_count, @star_count, @total_count);
    my $is_first_line = 1;

    READ:
    while (defined(my $read_line = <$in_fh>)) {
        chomp $read_line;
        my $was_first = $is_first_line;
        $is_first_line = 0;

        # A blank line ends the read table. The original do/until loop did
        # not test the first line, so a blank first line is tolerated.
        if (length $read_line < 1) {
            next READ if $was_first;
            last READ;
        }

        my @reads = split /\t/, $read_line;

        # Column 6 holds the read span as "<beg>..<end>"; column 1 ends in
        # ":<n1>_<n2>_..._x<total>", the underscore-separated count list.
        my ($read_beg, $read_end, $count_list);
        if (defined $reads[5] && $reads[5] =~ /(\d+)\.\.(\d+)/) {
            ($read_beg, $read_end) = ($1, $2);
        }
        if (defined $reads[0] && $reads[0] =~ /:([\d|_]+)_x(\d+)$/) {
            $count_list = $1;
        }
        unless (defined $read_beg && defined $count_list) {
            # Previously stale regex captures were tallied for such lines.
            warn "Skipping unparsable read line for miR-c-$novel "
               . "at '$filein' line $.\n";
            next READ;
        }
        my @per_sample = split /_/, $count_list;

        # A read counts towards mature/star when its end lies within 5 nt
        # and its start within 3 nt of the annotated end/start.
        my $hits_mature = abs($field{mature_end} - $read_end) <= 5
                       && abs($read_beg - $field{mature_beg}) <= 3;
        my $hits_star   = abs($field{star_end} - $read_end) <= 5
                       && abs($read_beg - $field{star_beg}) <= 3;

        for my $i (0 .. $#per_sample) {
            $mature_count[$i] = 0 unless defined $mature_count[$i];
            $star_count[$i]   = 0 unless defined $star_count[$i];
            $total_count[$i] += $per_sample[$i];
            $mature_count[$i] += $per_sample[$i] if $hits_mature;
            $star_count[$i]   += $per_sample[$i] if $hits_star;
        }
    }

    # Each group is joined separately so a record without reads still gets
    # its two separating tabs, exactly as before.
    print {$list_fh} join("\t",
        "miR-c-$novel",
        $field{pri_id},
        "mature:$field{mature_beg}:$field{mature_end}",
        "star:$field{star_beg}:$field{star_end}",
        join("\t", @mature_count),
        join("\t", @star_count),
        join("\t", @total_count),
    ), "\n";
}

close $in_fh   or die "Error closing '$filein': $!\n";
# Buffered write errors only surface at close, so check every output handle.
close $list_fh or die "Error writing '$fileout': $!\n";
close $fa_fh   or die "Error writing '$out': $!\n";
close $pre_fh  or die "Error writing '$preout': $!\n";

# read_tagged_fields($fh, $tag)
# Reads lines from $fh until one contains $tag, then returns a reference to
# that line's tab-separated columns (newline removed). Returns nothing when
# end of file is reached first, so the caller can report a truncated record
# instead of looping forever.
sub read_tagged_fields {
    my ($fh, $tag) = @_;
    while (defined(my $line = <$fh>)) {
        next unless $line =~ /$tag/;
        chomp $line;
        return [ split /\t/, $line ];
    }
    return;
}
d1cc2e6ecf90 Uploaded
big-tiandm
parents:
diff changeset
157
d1cc2e6ecf90 Uploaded
big-tiandm
parents:
diff changeset
# usage: print the version banner and the option summary to standard output,
# then end the script with exit status 1. Interpolates the file-scoped
# $version and the script name $0 into the text.
sub usage{
    my $help_text = <<"USAGE";
Version $version
Usage:
$0 -i -list -fa -pre -tag
options:
-i input file,predictions file
-list output file miRNA list file
-fa output file ,miRNA sequence fasta file.
-pre output file, miRNA precursor fasta file.
-tag string, sample names# eg: samA,samB,samC
-h help
USAGE
    print $help_text;
    exit(1);
}
d1cc2e6ecf90 Uploaded
big-tiandm
parents:
diff changeset
173