Mercurial > repos > amadeo > amadeo
comparison Tools/Second/remove_motifs_galaxy.pl @ 0:229d36377838 draft
Uploaded
author | amadeo |
---|---|
date | Mon, 05 Sep 2016 05:53:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:229d36377838 |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 $|=1; | |
4 use warnings; | |
5 use strict; | |
6 #Script that takes a gff format file from step2.pl as input and compares contiguous motifs listed in the gff file. | |
7 #If motifs overlap and surpass the threshold, then it will remove that motif with the highest p value. | |
8 | |
9 my $line; | |
10 my @cols; | |
11 my %hash; | |
12 my %hash_negative; | |
13 my $gene; | |
14 my @sequences; | |
15 my $seq_len; | |
16 my $OL; | |
17 my @output_pos; | |
18 my @output_neg; | |
19 my $actual_pvalue; | |
20 my $actual_pvalue_neg; | |
21 my $pvalue; | |
22 my $pvalue_neg; | |
23 | |
24 | |
25 if(@ARGV < 4){ | |
26 print "\nUsage: rm_overlap_motifs_posneg.pl fimo-test-sue.gff fimo-nol-pos.gff fimo-nol-neg.gff overlap_percentage\n\n"; | |
27 exit(0); | |
28 } | |
29 | |
30 | |
31 | |
32 open(FIMO, "<$ARGV[0]") || | |
33 die "File '$ARGV[0]' not found\n"; | |
34 open(POSITIVE, ">$ARGV[1]") || | |
35 die "File '>$ARGV[1]' not found\n"; | |
36 open(NEGATIVE, ">$ARGV[2]") || | |
37 die "File '>$ARGV[2]' not found\n"; | |
38 | |
39 # Getting overlap value form user and testing to see if it's 0-100 and | |
40 # converting to 0-1 scale. | |
41 if ($ARGV[3] >0.0 && $ARGV[3] <=100){ | |
42 $OL=$ARGV[3]/100; | |
43 } | |
44 else{ | |
45 print" ERROR: overlap is a value 0-100\n"; | |
46 exit(0); | |
47 } | |
48 #print "OL is $OL\n"; | |
49 | |
50 while (<FIMO>) { | |
51 $line=$_; #assigning line to variable $line | $_ is a special default variable that here holds the line contents | |
52 chomp $line; #avoid \n on last field | |
53 @cols=split;#Splits the string EXPR into a list of strings and returns the list in list context, or the size of the list in scalar context. | |
54 #This is very useful because the data of the gff file can be called using this variable. | |
55 my $pos1; | |
56 my $pos2; | |
57 my $scalar; | |
58 my $decimal; | |
59 my $e; | |
60 | |
61 my @list=(); | |
62 if ($line=~/^#/){ | |
63 printf POSITIVE"%s\n", $line; | |
64 printf NEGATIVE"%s\n", $line; | |
65 } | |
66 elsif ($line!~/^##/ and $cols[6]eq"+") { | |
67 @cols=split; | |
68 #$TF= substr $cols[8],5,8; #in this case we don't need that the hash considers the motif | |
69 $gene=substr $cols[0],0,21; | |
70 $pos1 = $cols[3]; #start position of the motif | |
71 $pos2=$cols[4]; #end position of the motif | |
72 @list=(); | |
73 @list=($pos1,$pos2); | |
74 @sequences= split( "=", $cols[9]); | |
75 $seq_len = int(length (substr $sequences[1],0,-1)); #returns the length of the sequence | |
76 ####These variables consider the p value#### | |
77 $decimal= substr $cols[8],-16,4; | |
78 $e=substr $cols[8],-11,3; | |
79 $decimal =~ s/[^.\d]//g; #This removes all nondigit characters from the string. | |
80 $actual_pvalue=$decimal*(10**$e); #it will take the p value of the current line | |
81 ####====### | |
82 if (not exists $hash{$gene}) { #Every time that a block of a gene with all the different motifs starts, it will register | |
83 #the gene in a hash: gene as a key and pos1 and pos2 as values. | |
84 $hash{$gene}=\@list; | |
85 $pvalue=$actual_pvalue; #p value of the current line that it will be compared in the next loop | |
86 push @output_pos, $line; #it saves the information of the gene motif in the array | |
87 } | |
88 | |
89 elsif (not($pos1>=@{$hash{$gene}}[0] and $pos1<=@{$hash{$gene}}[1]) | |
90 and not($pos2>=@{$hash{$gene}}[0] and $pos2<=@{$hash{$gene}}[1])) {#if the gene exists and the motif is not overlaped | |
91 #with the previous one | |
92 #then it will take the line in the list and it will | |
93 #consider the p value in the next loop | |
94 $hash{$gene}=\@list; | |
95 $pvalue=$actual_pvalue; | |
96 push @output_pos, $line; | |
97 } | |
98 | |
99 | |
100 elsif ( | |
101 | |
102 (not($pos1>=@{$hash{$gene}}[0] and $pos1<=@{$hash{$gene}}[1])and | |
103 ($pos2>=@{$hash{$gene}}[0] and $pos2<=@{$hash{$gene}}[1]) and (int($pos2-(@{$hash{$gene}}[0]))/$seq_len)<$OL) | |
104 | |
105 ) {#If the actual motif overlaps with the previous motif and the overlaping sequence includes the second position | |
106 #position and not the first one of the actual motif AND it doesn't surpass the threshold $OL then it will consider the line. | |
107 #It will store it in the array and its p value it will consider in the next loop. | |
108 $hash{$gene}=\@list; | |
109 $pvalue=$actual_pvalue; | |
110 push @output_pos, $line; | |
111 #print $pvalue , "\n"; | |
112 } | |
113 elsif ( | |
114 | |
115 (not($pos1>=@{$hash{$gene}}[0] and $pos1<=@{$hash{$gene}}[1])and | |
116 ($pos2>=@{$hash{$gene}}[0] and $pos2<=@{$hash{$gene}}[1]) and (int($pos2-(@{$hash{$gene}}[0]))/$seq_len)>$OL) | |
117 and $actual_pvalue<$pvalue | |
118 | |
119 | |
120 ) { #If the actual motif overlaps with the previous motif and the overlaping sequence includes the second | |
121 #position and not the first one of the actual motif AND it DOES surpass the threshold $OL but the actual motif has a lower p value | |
122 #than the last considered;then it will consider the line and it will remove the previous motif from the array; considering the motif | |
123 #with the lowest p value. This p value will consider in the next loop. | |
124 pop @output_pos; | |
125 $hash{$gene}=\@list; | |
126 $pvalue=$actual_pvalue; | |
127 push @output_pos, $line; | |
128 #print $pvalue , "\n"; | |
129 } | |
130 elsif ( | |
131 | |
132 ((($pos1>=@{$hash{$gene}}[0] and $pos1<=@{$hash{$gene}}[1]) and (int((@{$hash{$gene}}[1])-$pos1)/$seq_len)<$OL ) | |
133 and not($pos2>=@{$hash{$gene}}[0] and $pos2<=@{$hash{$gene}}[1])) | |
134 | |
135 ) {#If the actual motif overlaps with the previous motif and the overlaping sequence includes the first position | |
136 #position and not the first one of the actual motif AND it doesn't surpass the threshold $OL then it will consider the line. | |
137 #It will store it in the array and its p value it will consider in the next loop. | |
138 | |
139 $hash{$gene}=\@list; | |
140 $pvalue=$actual_pvalue; | |
141 push @output_pos, $line; | |
142 } | |
143 elsif ( | |
144 | |
145 ((($pos1>=@{$hash{$gene}}[0] and $pos1<=@{$hash{$gene}}[1]) and (int((@{$hash{$gene}}[1])-$pos1)/$seq_len)>$OL ) | |
146 and not($pos2>=@{$hash{$gene}}[0] and $pos2<=@{$hash{$gene}}[1])) and $actual_pvalue<$pvalue | |
147 #If the actual motif overlaps with the previous motif and the overlaping sequence includes the first | |
148 #position and not the second one of the actual motif AND it DOES surpass the threshold $OL but the actual motif has a lower p value | |
149 #than the last considered;then it will consider the line and it will remove the previous motif from the array; considering the motif | |
150 #with the lowest p value. This p value will consider in the next loop. | |
151 ) { | |
152 $hash{$gene}=\@list; | |
153 $pvalue=$actual_pvalue; | |
154 pop @output_pos; | |
155 push @output_pos, $line; | |
156 } | |
157 elsif ( | |
158 | |
159 (($pos1>=@{$hash{$gene}}[0] and $pos1<=@{$hash{$gene}}[1]) | |
160 and ($pos2>=@{$hash{$gene}}[0] and $pos2<=@{$hash{$gene}}[1])) and $actual_pvalue<$pvalue | |
161 | |
162 ) { | |
163 $hash{$gene}=\@list; | |
164 $pvalue=$actual_pvalue; | |
165 pop @output_pos; | |
166 push @output_pos, $line; | |
167 } | |
168 | |
169 | |
170 } | |
171 | |
172 ##===========Same strategy applied to the motifs located in the minus strand===========# | |
173 elsif ($line!~/^##/ and $cols[6]eq"-") { | |
174 @cols=split; | |
175 #$TF= substr $cols[8],5,8; | |
176 $gene=substr $cols[0],0,21; | |
177 $pos1 = $cols[3]; | |
178 $pos2=$cols[4]; | |
179 @list=(); | |
180 @list=($pos1,$pos2); | |
181 @sequences= split( "=", $cols[9]); | |
182 $seq_len = int(length (substr $sequences[1],0,-1)); | |
183 $decimal= substr $cols[8],-16,4; | |
184 $e=substr $cols[8],-11,3; | |
185 $decimal =~ s/[^.\d]//g; #This removes all nondigit characters from the string. | |
186 $actual_pvalue_neg=$decimal*(10**$e); | |
187 | |
188 if (not exists $hash_negative{$gene}) { | |
189 $hash_negative{$gene}=\@list; | |
190 $pvalue_neg=$actual_pvalue_neg; | |
191 push @output_neg, $line; | |
192 } | |
193 | |
194 elsif (not($pos1>=@{$hash_negative{$gene}}[0] and $pos1<=@{$hash_negative{$gene}}[1]) | |
195 and not($pos2>=@{$hash_negative{$gene}}[0] and $pos2<=@{$hash_negative{$gene}}[1])) { | |
196 $pvalue_neg=$actual_pvalue_neg; | |
197 $hash_negative{$gene}=\@list; | |
198 push @output_neg, $line; | |
199 } | |
200 | |
201 | |
202 elsif ( | |
203 | |
204 (not($pos1>=@{$hash_negative{$gene}}[0] and $pos1<=@{$hash_negative{$gene}}[1])and | |
205 ($pos2>=@{$hash_negative{$gene}}[0] and $pos2<=@{$hash_negative{$gene}}[1]) and (int($pos2-(@{$hash_negative{$gene}}[0]))/$seq_len)<$OL ) | |
206 ) { | |
207 $pvalue_neg=$actual_pvalue_neg; | |
208 $hash_negative{$gene}=\@list; | |
209 push @output_neg, $line; | |
210 } | |
211 elsif ( | |
212 | |
213 (not($pos1>=@{$hash_negative{$gene}}[0] and $pos1<=@{$hash_negative{$gene}}[1]) and | |
214 ($pos2>=@{$hash_negative{$gene}}[0] and $pos2<=@{$hash_negative{$gene}}[1]) and (int($pos2-(@{$hash_negative{$gene}}[0]))/$seq_len)>$OL and | |
215 $actual_pvalue_neg<$pvalue_neg) | |
216 ) { | |
217 $pvalue=$actual_pvalue_neg; | |
218 $hash_negative{$gene}=\@list; | |
219 pop @output_neg; | |
220 push @output_neg, $line; | |
221 } | |
222 elsif ( | |
223 ((($pos1>=@{$hash_negative{$gene}}[0] and $pos1<=@{$hash_negative{$gene}}[1]) and (int((@{$hash_negative{$gene}}[1])-$pos1)/$seq_len)<$OL ) | |
224 and not($pos2>=@{$hash_negative{$gene}}[0] and $pos2<=@{$hash_negative{$gene}}[1] )) | |
225 ) { | |
226 $pvalue_neg=$actual_pvalue_neg; | |
227 $hash_negative{$gene}=\@list; | |
228 push @output_neg, $line; | |
229 } | |
230 elsif ( | |
231 ((($pos1>=@{$hash_negative{$gene}}[0] and $pos1<=@{$hash_negative{$gene}}[1]) and | |
232 (int((@{$hash_negative{$gene}}[1])-$pos1)/$seq_len)>$OL ) | |
233 and not($pos2>=@{$hash_negative{$gene}}[0] and $pos2<=@{$hash_negative{$gene}}[1] )and | |
234 $actual_pvalue_neg<$pvalue_neg) | |
235 ) { | |
236 $pvalue_neg=$actual_pvalue_neg; | |
237 $hash_negative{$gene}=\@list; | |
238 pop @output_neg; | |
239 push @output_neg, $line; | |
240 } | |
241 | |
242 elsif ( | |
243 ((($pos1>=@{$hash_negative{$gene}}[0] and $pos1<=@{$hash_negative{$gene}}[1]) ) | |
244 and ($pos2>=@{$hash_negative{$gene}}[0] and $pos2<=@{$hash_negative{$gene}}[1] )and | |
245 $actual_pvalue_neg<$pvalue_neg) | |
246 ) { | |
247 $pvalue_neg=$actual_pvalue_neg; | |
248 $hash_negative{$gene}=\@list; | |
249 pop @output_neg; | |
250 push @output_neg, $line; | |
251 } | |
252 | |
253 | |
254 } | |
255 } | |
256 foreach my $lines_pos (@output_pos){ | |
257 printf POSITIVE"%s\n", $lines_pos; | |
258 | |
259 } | |
260 foreach my $lines_neg (@output_neg){ | |
261 printf NEGATIVE"%s\n", $lines_neg; | |
262 } |