0
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 $|=1;
|
|
4 use warnings;
|
|
5 use strict;
|
|
6 #Script that takes a gff format file from step2.pl as input and compares contiguous motifs listed in the gff file.
|
|
7 #If motifs overlap and surpass the threshold, then it will remove that motif with the highest p value.
|
|
8
|
|
9 my $line;
|
|
10 my @cols;
|
|
11 my %hash;
|
|
12 my %hash_negative;
|
|
13 my $gene;
|
|
14 my @sequences;
|
|
15 my $seq_len;
|
|
16 my $OL;
|
|
17 my @output_pos;
|
|
18 my @output_neg;
|
|
19 my $actual_pvalue;
|
|
20 my $actual_pvalue_neg;
|
|
21 my $pvalue;
|
|
22 my $pvalue_neg;
|
|
23
|
|
24
|
|
25 if(@ARGV < 4){
|
|
26 print "\nUsage: rm_overlap_motifs_posneg.pl fimo-test-sue.gff fimo-nol-pos.gff fimo-nol-neg.gff overlap_percentage\n\n";
|
|
27 exit(0);
|
|
28 }
|
|
29
|
|
30
|
|
31
|
|
32 open(FIMO, "<$ARGV[0]") ||
|
|
33 die "File '$ARGV[0]' not found\n";
|
|
34 open(POSITIVE, ">$ARGV[1]") ||
|
|
35 die "File '>$ARGV[1]' not found\n";
|
|
36 open(NEGATIVE, ">$ARGV[2]") ||
|
|
37 die "File '>$ARGV[2]' not found\n";
|
|
38
|
|
39 # Getting overlap value form user and testing to see if it's 0-100 and
|
|
40 # converting to 0-1 scale.
|
|
41 if ($ARGV[3] >0.0 && $ARGV[3] <=100){
|
|
42 $OL=$ARGV[3]/100;
|
|
43 }
|
|
44 else{
|
|
45 print" ERROR: overlap is a value 0-100\n";
|
|
46 exit(0);
|
|
47 }
|
|
48 #print "OL is $OL\n";
|
|
49
|
|
50 while (<FIMO>) {
|
|
51 $line=$_; #assigning line to variable $line | $_ is a special default variable that here holds the line contents
|
|
52 chomp $line; #avoid \n on last field
|
|
53 @cols=split;#Splits the string EXPR into a list of strings and returns the list in list context, or the size of the list in scalar context.
|
|
54 #This is very useful because the data of the gff file can be called using this variable.
|
|
55 my $pos1;
|
|
56 my $pos2;
|
|
57 my $scalar;
|
|
58 my $decimal;
|
|
59 my $e;
|
|
60
|
|
61 my @list=();
|
|
62 if ($line=~/^#/){
|
|
63 printf POSITIVE"%s\n", $line;
|
|
64 printf NEGATIVE"%s\n", $line;
|
|
65 }
|
|
66 elsif ($line!~/^##/ and $cols[6]eq"+") {
|
|
67 @cols=split;
|
|
68 #$TF= substr $cols[8],5,8; #in this case we don't need that the hash considers the motif
|
|
69 $gene=substr $cols[0],0,21;
|
|
70 $pos1 = $cols[3]; #start position of the motif
|
|
71 $pos2=$cols[4]; #end position of the motif
|
|
72 @list=();
|
|
73 @list=($pos1,$pos2);
|
|
74 @sequences= split( "=", $cols[9]);
|
|
75 $seq_len = int(length (substr $sequences[1],0,-1)); #returns the length of the sequence
|
|
76 ####These variables consider the p value####
|
|
77 $decimal= substr $cols[8],-16,4;
|
|
78 $e=substr $cols[8],-11,3;
|
|
79 $decimal =~ s/[^.\d]//g; #This removes all nondigit characters from the string.
|
|
80 $actual_pvalue=$decimal*(10**$e); #it will take the p value of the current line
|
|
81 ####====###
|
|
82 if (not exists $hash{$gene}) { #Every time that a block of a gene with all the different motifs starts, it will register
|
|
83 #the gene in a hash: gene as a key and pos1 and pos2 as values.
|
|
84 $hash{$gene}=\@list;
|
|
85 $pvalue=$actual_pvalue; #p value of the current line that it will be compared in the next loop
|
|
86 push @output_pos, $line; #it saves the information of the gene motif in the array
|
|
87 }
|
|
88
|
|
89 elsif (not($pos1>=@{$hash{$gene}}[0] and $pos1<=@{$hash{$gene}}[1])
|
|
90 and not($pos2>=@{$hash{$gene}}[0] and $pos2<=@{$hash{$gene}}[1])) {#if the gene exists and the motif is not overlaped
|
|
91 #with the previous one
|
|
92 #then it will take the line in the list and it will
|
|
93 #consider the p value in the next loop
|
|
94 $hash{$gene}=\@list;
|
|
95 $pvalue=$actual_pvalue;
|
|
96 push @output_pos, $line;
|
|
97 }
|
|
98
|
|
99
|
|
100 elsif (
|
|
101
|
|
102 (not($pos1>=@{$hash{$gene}}[0] and $pos1<=@{$hash{$gene}}[1])and
|
|
103 ($pos2>=@{$hash{$gene}}[0] and $pos2<=@{$hash{$gene}}[1]) and (int($pos2-(@{$hash{$gene}}[0]))/$seq_len)<$OL)
|
|
104
|
|
105 ) {#If the actual motif overlaps with the previous motif and the overlaping sequence includes the second position
|
|
106 #position and not the first one of the actual motif AND it doesn't surpass the threshold $OL then it will consider the line.
|
|
107 #It will store it in the array and its p value it will consider in the next loop.
|
|
108 $hash{$gene}=\@list;
|
|
109 $pvalue=$actual_pvalue;
|
|
110 push @output_pos, $line;
|
|
111 #print $pvalue , "\n";
|
|
112 }
|
|
113 elsif (
|
|
114
|
|
115 (not($pos1>=@{$hash{$gene}}[0] and $pos1<=@{$hash{$gene}}[1])and
|
|
116 ($pos2>=@{$hash{$gene}}[0] and $pos2<=@{$hash{$gene}}[1]) and (int($pos2-(@{$hash{$gene}}[0]))/$seq_len)>$OL)
|
|
117 and $actual_pvalue<$pvalue
|
|
118
|
|
119
|
|
120 ) { #If the actual motif overlaps with the previous motif and the overlaping sequence includes the second
|
|
121 #position and not the first one of the actual motif AND it DOES surpass the threshold $OL but the actual motif has a lower p value
|
|
122 #than the last considered;then it will consider the line and it will remove the previous motif from the array; considering the motif
|
|
123 #with the lowest p value. This p value will consider in the next loop.
|
|
124 pop @output_pos;
|
|
125 $hash{$gene}=\@list;
|
|
126 $pvalue=$actual_pvalue;
|
|
127 push @output_pos, $line;
|
|
128 #print $pvalue , "\n";
|
|
129 }
|
|
130 elsif (
|
|
131
|
|
132 ((($pos1>=@{$hash{$gene}}[0] and $pos1<=@{$hash{$gene}}[1]) and (int((@{$hash{$gene}}[1])-$pos1)/$seq_len)<$OL )
|
|
133 and not($pos2>=@{$hash{$gene}}[0] and $pos2<=@{$hash{$gene}}[1]))
|
|
134
|
|
135 ) {#If the actual motif overlaps with the previous motif and the overlaping sequence includes the first position
|
|
136 #position and not the first one of the actual motif AND it doesn't surpass the threshold $OL then it will consider the line.
|
|
137 #It will store it in the array and its p value it will consider in the next loop.
|
|
138
|
|
139 $hash{$gene}=\@list;
|
|
140 $pvalue=$actual_pvalue;
|
|
141 push @output_pos, $line;
|
|
142 }
|
|
143 elsif (
|
|
144
|
|
145 ((($pos1>=@{$hash{$gene}}[0] and $pos1<=@{$hash{$gene}}[1]) and (int((@{$hash{$gene}}[1])-$pos1)/$seq_len)>$OL )
|
|
146 and not($pos2>=@{$hash{$gene}}[0] and $pos2<=@{$hash{$gene}}[1])) and $actual_pvalue<$pvalue
|
|
147 #If the actual motif overlaps with the previous motif and the overlaping sequence includes the first
|
|
148 #position and not the second one of the actual motif AND it DOES surpass the threshold $OL but the actual motif has a lower p value
|
|
149 #than the last considered;then it will consider the line and it will remove the previous motif from the array; considering the motif
|
|
150 #with the lowest p value. This p value will consider in the next loop.
|
|
151 ) {
|
|
152 $hash{$gene}=\@list;
|
|
153 $pvalue=$actual_pvalue;
|
|
154 pop @output_pos;
|
|
155 push @output_pos, $line;
|
|
156 }
|
|
157 elsif (
|
|
158
|
|
159 (($pos1>=@{$hash{$gene}}[0] and $pos1<=@{$hash{$gene}}[1])
|
|
160 and ($pos2>=@{$hash{$gene}}[0] and $pos2<=@{$hash{$gene}}[1])) and $actual_pvalue<$pvalue
|
|
161
|
|
162 ) {
|
|
163 $hash{$gene}=\@list;
|
|
164 $pvalue=$actual_pvalue;
|
|
165 pop @output_pos;
|
|
166 push @output_pos, $line;
|
|
167 }
|
|
168
|
|
169
|
|
170 }
|
|
171
|
|
172 ##===========Same strategy applied to the motifs located in the minus strand===========#
|
|
173 elsif ($line!~/^##/ and $cols[6]eq"-") {
|
|
174 @cols=split;
|
|
175 #$TF= substr $cols[8],5,8;
|
|
176 $gene=substr $cols[0],0,21;
|
|
177 $pos1 = $cols[3];
|
|
178 $pos2=$cols[4];
|
|
179 @list=();
|
|
180 @list=($pos1,$pos2);
|
|
181 @sequences= split( "=", $cols[9]);
|
|
182 $seq_len = int(length (substr $sequences[1],0,-1));
|
|
183 $decimal= substr $cols[8],-16,4;
|
|
184 $e=substr $cols[8],-11,3;
|
|
185 $decimal =~ s/[^.\d]//g; #This removes all nondigit characters from the string.
|
|
186 $actual_pvalue_neg=$decimal*(10**$e);
|
|
187
|
|
188 if (not exists $hash_negative{$gene}) {
|
|
189 $hash_negative{$gene}=\@list;
|
|
190 $pvalue_neg=$actual_pvalue_neg;
|
|
191 push @output_neg, $line;
|
|
192 }
|
|
193
|
|
194 elsif (not($pos1>=@{$hash_negative{$gene}}[0] and $pos1<=@{$hash_negative{$gene}}[1])
|
|
195 and not($pos2>=@{$hash_negative{$gene}}[0] and $pos2<=@{$hash_negative{$gene}}[1])) {
|
|
196 $pvalue_neg=$actual_pvalue_neg;
|
|
197 $hash_negative{$gene}=\@list;
|
|
198 push @output_neg, $line;
|
|
199 }
|
|
200
|
|
201
|
|
202 elsif (
|
|
203
|
|
204 (not($pos1>=@{$hash_negative{$gene}}[0] and $pos1<=@{$hash_negative{$gene}}[1])and
|
|
205 ($pos2>=@{$hash_negative{$gene}}[0] and $pos2<=@{$hash_negative{$gene}}[1]) and (int($pos2-(@{$hash_negative{$gene}}[0]))/$seq_len)<$OL )
|
|
206 ) {
|
|
207 $pvalue_neg=$actual_pvalue_neg;
|
|
208 $hash_negative{$gene}=\@list;
|
|
209 push @output_neg, $line;
|
|
210 }
|
|
211 elsif (
|
|
212
|
|
213 (not($pos1>=@{$hash_negative{$gene}}[0] and $pos1<=@{$hash_negative{$gene}}[1]) and
|
|
214 ($pos2>=@{$hash_negative{$gene}}[0] and $pos2<=@{$hash_negative{$gene}}[1]) and (int($pos2-(@{$hash_negative{$gene}}[0]))/$seq_len)>$OL and
|
|
215 $actual_pvalue_neg<$pvalue_neg)
|
|
216 ) {
|
|
217 $pvalue=$actual_pvalue_neg;
|
|
218 $hash_negative{$gene}=\@list;
|
|
219 pop @output_neg;
|
|
220 push @output_neg, $line;
|
|
221 }
|
|
222 elsif (
|
|
223 ((($pos1>=@{$hash_negative{$gene}}[0] and $pos1<=@{$hash_negative{$gene}}[1]) and (int((@{$hash_negative{$gene}}[1])-$pos1)/$seq_len)<$OL )
|
|
224 and not($pos2>=@{$hash_negative{$gene}}[0] and $pos2<=@{$hash_negative{$gene}}[1] ))
|
|
225 ) {
|
|
226 $pvalue_neg=$actual_pvalue_neg;
|
|
227 $hash_negative{$gene}=\@list;
|
|
228 push @output_neg, $line;
|
|
229 }
|
|
230 elsif (
|
|
231 ((($pos1>=@{$hash_negative{$gene}}[0] and $pos1<=@{$hash_negative{$gene}}[1]) and
|
|
232 (int((@{$hash_negative{$gene}}[1])-$pos1)/$seq_len)>$OL )
|
|
233 and not($pos2>=@{$hash_negative{$gene}}[0] and $pos2<=@{$hash_negative{$gene}}[1] )and
|
|
234 $actual_pvalue_neg<$pvalue_neg)
|
|
235 ) {
|
|
236 $pvalue_neg=$actual_pvalue_neg;
|
|
237 $hash_negative{$gene}=\@list;
|
|
238 pop @output_neg;
|
|
239 push @output_neg, $line;
|
|
240 }
|
|
241
|
|
242 elsif (
|
|
243 ((($pos1>=@{$hash_negative{$gene}}[0] and $pos1<=@{$hash_negative{$gene}}[1]) )
|
|
244 and ($pos2>=@{$hash_negative{$gene}}[0] and $pos2<=@{$hash_negative{$gene}}[1] )and
|
|
245 $actual_pvalue_neg<$pvalue_neg)
|
|
246 ) {
|
|
247 $pvalue_neg=$actual_pvalue_neg;
|
|
248 $hash_negative{$gene}=\@list;
|
|
249 pop @output_neg;
|
|
250 push @output_neg, $line;
|
|
251 }
|
|
252
|
|
253
|
|
254 }
|
|
255 }
|
|
256 foreach my $lines_pos (@output_pos){
|
|
257 printf POSITIVE"%s\n", $lines_pos;
|
|
258
|
|
259 }
|
|
260 foreach my $lines_neg (@output_neg){
|
|
261 printf NEGATIVE"%s\n", $lines_neg;
|
|
262 } |