Mercurial > repos > amadeo > amadeo
comparison Tools/First_version/rm_overlap_motifs_galaxy.pl @ 3:b30ba2b06326 draft
Uploaded
author | amadeo |
---|---|
date | Mon, 05 Sep 2016 06:01:48 -0400 |
parents | 229d36377838 |
children |
comparison
equal
deleted
inserted
replaced
2:607d7bdfb07f | 3:b30ba2b06326 |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 $|=1; | |
4 use warnings; | |
5 use strict; | |
6 | |
7 | |
8 my $line; | |
9 my @cols; | |
10 my %hash; | |
11 my %hash_negative; | |
12 my $gene; | |
13 my $TF; | |
14 my @sequences; | |
15 my $seq_len; | |
16 my $OL; | |
17 my @output_pos; | |
18 my @output_neg; | |
19 my $actual_pvalue; | |
20 my $pvalue; | |
21 my $pvalue_neg; | |
22 | |
23 | |
24 #$ARGV[3]=<STDIN>; | |
25 | |
26 if(@ARGV < 4){ | |
27 print "\nUsage: rm_overlap_motifs_posneg.pl fimo-test-sue.gff fimo-nol-pos.gff fimo-nol-neg.gff overlap_percentage\n\n"; | |
28 exit(0); | |
29 } | |
30 | |
31 | |
32 | |
33 open(FIMO, "<$ARGV[0]") || | |
34 die "File '$ARGV[0]' not found\n"; | |
35 open(POSITIVE, ">$ARGV[1]") || | |
36 die "File '>$ARGV[1]' not found\n"; | |
37 open(NEGATIVE, ">$ARGV[2]") || | |
38 die "File '>$ARGV[2]' not found\n"; | |
39 | |
40 # Getting overlap value form user and testing to see if its 0-100 and | |
41 # converting to 0-1 scale. | |
42 if ($ARGV[3] >0.0 && $ARGV[3] <=100){ | |
43 $OL=$ARGV[3]/100; | |
44 } | |
45 else{ | |
46 print" ERROR: overlap is a value 0-100\n"; | |
47 exit(0); | |
48 } | |
49 #print "OL is $OL\n"; | |
50 | |
51 while (<FIMO>) { | |
52 $line=$_; | |
53 chomp $line; | |
54 @cols=split; | |
55 my $pos1; | |
56 my $pos2; | |
57 my $scalar; | |
58 my $decimal; | |
59 my $e; | |
60 | |
61 my @list=(); | |
62 if ($line=~/^#/){ | |
63 printf POSITIVE"%s\n", $line; | |
64 printf NEGATIVE"%s\n", $line; | |
65 } | |
66 elsif ($line!~/^##/ and $cols[6]eq"+") { | |
67 @cols=split; | |
68 $TF= substr $cols[8],5,8; | |
69 $gene=substr $cols[0],0,21; | |
70 $pos1 = $cols[3]; | |
71 $pos2=$cols[4]; | |
72 @list=(); | |
73 @list=($pos1,$pos2); | |
74 @sequences= split( "=", $cols[9]); | |
75 $seq_len = int(length (substr $sequences[1],0,-1)); | |
76 $decimal= substr $cols[8],-16,4; | |
77 $e=substr $cols[8],-11,3; | |
78 $decimal =~ s/[^.\d]//g; #This removes all nondigit characters from the string. | |
79 $actual_pvalue=$decimal*(10**$e); #it will take the p value of the current line | |
80 | |
81 if (not exists $hash{$gene}{$TF}) { #Every time that a block of a GENE-MOTIF starts, it will register | |
82 #the GENE-MOTIF in a hash: GENE-MOTIF as a key and pos1 and pos2 as values. | |
83 $hash{$gene}{$TF}=\@list; | |
84 $pvalue=$actual_pvalue; #p value of the current line that it will be compared in the next loop | |
85 push @output_pos, $line; #it saves the information of the gene motif in the array | |
86 } | |
87 | |
88 elsif (not($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1]) | |
89 and not($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1])) {#if the gene exists and the motif is not overlaped | |
90 #with the previous one | |
91 #then it will take the line in the list and it will | |
92 #consider the p value in the next loop | |
93 $hash{$gene}{$TF}=\@list; | |
94 $pvalue=$actual_pvalue; | |
95 push @output_pos, $line; | |
96 } | |
97 | |
98 | |
99 elsif ( | |
100 | |
101 (not($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1])and | |
102 ($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1]) and (int($pos2-(@{$hash{$gene}{$TF}}[0]))/$seq_len)<$OL) | |
103 | |
104 ) {#If the actual motif overlaps with the previous motif and the overlaping sequence includes the second position | |
105 #position and not the first one of the actual motif AND it doesn't surpass the threshold $OL then it will consider the line. | |
106 #It will store it in the array and its p value it will consider in the next loop. | |
107 $hash{$gene}{$TF}=\@list; | |
108 $pvalue=$actual_pvalue; | |
109 push @output_pos, $line; | |
110 #print $pvalue , "\n"; | |
111 } | |
112 elsif ( | |
113 | |
114 (not($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1])and | |
115 ($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1]) and (int($pos2-(@{$hash{$gene}{$TF}}[0]))/$seq_len)>$OL) | |
116 and $actual_pvalue<$pvalue | |
117 | |
118 | |
119 ) { #If the actual motif overlaps with the previous motif and the overlaping sequence includes the second | |
120 #position and not the first one of the actual motif AND it DOES surpass the threshold $OL but the actual motif has a lower p value | |
121 #than the last considered;then it will consider the line and it will remove the previous motif from the array; considering the motif | |
122 #with the lowest p value. This p value will consider in the next loop. | |
123 pop @output_pos; | |
124 $hash{$gene}{$TF}=\@list; | |
125 $pvalue=$actual_pvalue; | |
126 push @output_pos, $line; | |
127 #print $pvalue , "\n"; | |
128 } | |
129 elsif ( | |
130 | |
131 ((($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1]) and (int((@{$hash{$gene}{$TF}}[1])-$pos1)/$seq_len)<$OL ) | |
132 and not($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1])) | |
133 | |
134 ) {#If the actual motif overlaps with the previous motif and the overlaping sequence includes the first position | |
135 #position and not the first one of the actual motif AND it doesn't surpass the threshold $OL then it will consider the line. | |
136 #It will store it in the array and its p value it will consider in the next loop. | |
137 | |
138 $hash{$gene}{$TF}=\@list; | |
139 $pvalue=$actual_pvalue; | |
140 push @output_pos, $line; | |
141 } | |
142 elsif ( | |
143 | |
144 ((($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1]) and (int((@{$hash{$gene}{$TF}}[1])-$pos1)/$seq_len)>$OL ) | |
145 and not($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1])) and $actual_pvalue<$pvalue | |
146 #If the actual motif overlaps with the previous motif and the overlaping sequence includes the first | |
147 #position and not the second one of the actual motif AND it DOES surpass the threshold $OL but the actual motif has a lower p value | |
148 #than the last considered;then it will consider the line and it will remove the previous motif from the array; considering the motif | |
149 #with the lowest p value. This p value will consider in the next loop. | |
150 ) { | |
151 $hash{$gene}{$TF}=\@list; | |
152 $pvalue=$actual_pvalue; | |
153 pop @output_pos; | |
154 push @output_pos, $line; | |
155 } | |
156 elsif ( | |
157 | |
158 ((($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1]) ) | |
159 and ($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1])) and $actual_pvalue<$pvalue | |
160 | |
161 ) { | |
162 $hash{$gene}{$TF}=\@list; | |
163 $pvalue=$actual_pvalue; | |
164 pop @output_pos; | |
165 push @output_pos, $line; | |
166 } | |
167 | |
168 | |
169 } | |
170 elsif ($line!~/^##/ and $cols[6]eq"-") { #same strategy for the motifs located in the minus strand | |
171 @cols=split; | |
172 #$TF= substr $cols[8],5,8; | |
173 $gene=substr $cols[0],0,21; | |
174 $pos1 = $cols[3]; | |
175 $pos2=$cols[4]; | |
176 @list=(); | |
177 @list=($pos1,$pos2); | |
178 @sequences= split( "=", $cols[9]); | |
179 $seq_len = int(length (substr $sequences[1],0,-1)); | |
180 $decimal= substr $cols[8],-16,4; | |
181 $e=substr $cols[8],-11,3; | |
182 $decimal =~ s/[^.\d]//g; #This removes all nondigit characters from the string. | |
183 $actual_pvalue=$decimal*(10**$e); | |
184 | |
185 if (not exists $hash_negative{$gene}{$TF}) { | |
186 $hash_negative{$gene}{$TF}=\@list; | |
187 $pvalue_neg=$actual_pvalue; | |
188 push @output_neg, $line; | |
189 } | |
190 | |
191 elsif (not($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]) | |
192 and not($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1])) { | |
193 $pvalue_neg=$actual_pvalue; | |
194 $hash_negative{$gene}{$TF}=\@list; | |
195 push @output_neg, $line; | |
196 } | |
197 | |
198 | |
199 elsif ( | |
200 | |
201 (not($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1])and | |
202 ($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1]) and (int($pos2-(@{$hash_negative{$gene}{$TF}}[0]))/$seq_len)<$OL ) | |
203 ) { | |
204 $pvalue_neg=$actual_pvalue; | |
205 $hash_negative{$gene}{$TF}=\@list; | |
206 push @output_neg, $line; | |
207 } | |
208 elsif ( | |
209 | |
210 (not($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]) and | |
211 ($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1]) and (int($pos2-(@{$hash_negative{$gene}{$TF}}[0]))/$seq_len)>$OL and | |
212 $actual_pvalue<$pvalue_neg) | |
213 ) { | |
214 $pvalue=$actual_pvalue; | |
215 $hash_negative{$gene}{$TF}=\@list; | |
216 pop @output_neg; | |
217 push @output_neg, $line; | |
218 } | |
219 elsif ( | |
220 ((($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]) and (int((@{$hash_negative{$gene}{$TF}}[1])-$pos1)/$seq_len)<$OL ) | |
221 and not($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1] )) | |
222 ) { | |
223 $pvalue_neg=$actual_pvalue; | |
224 $hash_negative{$gene}{$TF}=\@list; | |
225 push @output_neg, $line; | |
226 } | |
227 elsif ( | |
228 ((($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]) and | |
229 (int((@{$hash_negative{$gene}{$TF}}[1])-$pos1)/$seq_len)>$OL ) | |
230 and not($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1] )and | |
231 $actual_pvalue<$pvalue_neg) | |
232 ) { | |
233 $pvalue_neg=$actual_pvalue; | |
234 $hash_negative{$gene}{$TF}=\@list; | |
235 pop @output_neg; | |
236 push @output_neg, $line; | |
237 } | |
238 elsif ( | |
239 ((($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1])) | |
240 and ($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1] )and | |
241 $actual_pvalue<$pvalue_neg) | |
242 ) { | |
243 $pvalue_neg=$actual_pvalue; | |
244 $hash_negative{$gene}{$TF}=\@list; | |
245 pop @output_neg; | |
246 push @output_neg, $line; | |
247 } | |
248 | |
249 | |
250 } | |
251 } | |
252 foreach my $lines_pos (@output_pos){ | |
253 printf POSITIVE"%s\n", $lines_pos; | |
254 | |
255 } | |
256 foreach my $lines_neg (@output_neg){ | |
257 printf NEGATIVE"%s\n", $lines_neg; | |
258 } |