comparison Tools/First_version/rm_overlap_motifs_galaxy.pl @ 3:b30ba2b06326 draft

Uploaded
author amadeo
date Mon, 05 Sep 2016 06:01:48 -0400
parents 229d36377838
children
comparison
equal deleted inserted replaced
2:607d7bdfb07f 3:b30ba2b06326
1 #!/usr/bin/perl -w
2
3 $|=1;
4 use warnings;
5 use strict;
6
7
8 my $line;
9 my @cols;
10 my %hash;
11 my %hash_negative;
12 my $gene;
13 my $TF;
14 my @sequences;
15 my $seq_len;
16 my $OL;
17 my @output_pos;
18 my @output_neg;
19 my $actual_pvalue;
20 my $pvalue;
21 my $pvalue_neg;
22
23
24 #$ARGV[3]=<STDIN>;
25
26 if(@ARGV < 4){
27 print "\nUsage: rm_overlap_motifs_posneg.pl fimo-test-sue.gff fimo-nol-pos.gff fimo-nol-neg.gff overlap_percentage\n\n";
28 exit(0);
29 }
30
31
32
33 open(FIMO, "<$ARGV[0]") ||
34 die "File '$ARGV[0]' not found\n";
35 open(POSITIVE, ">$ARGV[1]") ||
36 die "File '>$ARGV[1]' not found\n";
37 open(NEGATIVE, ">$ARGV[2]") ||
38 die "File '>$ARGV[2]' not found\n";
39
40 # Getting overlap value form user and testing to see if its 0-100 and
41 # converting to 0-1 scale.
42 if ($ARGV[3] >0.0 && $ARGV[3] <=100){
43 $OL=$ARGV[3]/100;
44 }
45 else{
46 print" ERROR: overlap is a value 0-100\n";
47 exit(0);
48 }
49 #print "OL is $OL\n";
50
51 while (<FIMO>) {
52 $line=$_;
53 chomp $line;
54 @cols=split;
55 my $pos1;
56 my $pos2;
57 my $scalar;
58 my $decimal;
59 my $e;
60
61 my @list=();
62 if ($line=~/^#/){
63 printf POSITIVE"%s\n", $line;
64 printf NEGATIVE"%s\n", $line;
65 }
66 elsif ($line!~/^##/ and $cols[6]eq"+") {
67 @cols=split;
68 $TF= substr $cols[8],5,8;
69 $gene=substr $cols[0],0,21;
70 $pos1 = $cols[3];
71 $pos2=$cols[4];
72 @list=();
73 @list=($pos1,$pos2);
74 @sequences= split( "=", $cols[9]);
75 $seq_len = int(length (substr $sequences[1],0,-1));
76 $decimal= substr $cols[8],-16,4;
77 $e=substr $cols[8],-11,3;
78 $decimal =~ s/[^.\d]//g; #This removes all nondigit characters from the string.
79 $actual_pvalue=$decimal*(10**$e); #it will take the p value of the current line
80
81 if (not exists $hash{$gene}{$TF}) { #Every time that a block of a GENE-MOTIF starts, it will register
82 #the GENE-MOTIF in a hash: GENE-MOTIF as a key and pos1 and pos2 as values.
83 $hash{$gene}{$TF}=\@list;
84 $pvalue=$actual_pvalue; #p value of the current line that it will be compared in the next loop
85 push @output_pos, $line; #it saves the information of the gene motif in the array
86 }
87
88 elsif (not($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1])
89 and not($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1])) {#if the gene exists and the motif is not overlaped
90 #with the previous one
91 #then it will take the line in the list and it will
92 #consider the p value in the next loop
93 $hash{$gene}{$TF}=\@list;
94 $pvalue=$actual_pvalue;
95 push @output_pos, $line;
96 }
97
98
99 elsif (
100
101 (not($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1])and
102 ($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1]) and (int($pos2-(@{$hash{$gene}{$TF}}[0]))/$seq_len)<$OL)
103
104 ) {#If the actual motif overlaps with the previous motif and the overlaping sequence includes the second position
105 #position and not the first one of the actual motif AND it doesn't surpass the threshold $OL then it will consider the line.
106 #It will store it in the array and its p value it will consider in the next loop.
107 $hash{$gene}{$TF}=\@list;
108 $pvalue=$actual_pvalue;
109 push @output_pos, $line;
110 #print $pvalue , "\n";
111 }
112 elsif (
113
114 (not($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1])and
115 ($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1]) and (int($pos2-(@{$hash{$gene}{$TF}}[0]))/$seq_len)>$OL)
116 and $actual_pvalue<$pvalue
117
118
119 ) { #If the actual motif overlaps with the previous motif and the overlaping sequence includes the second
120 #position and not the first one of the actual motif AND it DOES surpass the threshold $OL but the actual motif has a lower p value
121 #than the last considered;then it will consider the line and it will remove the previous motif from the array; considering the motif
122 #with the lowest p value. This p value will consider in the next loop.
123 pop @output_pos;
124 $hash{$gene}{$TF}=\@list;
125 $pvalue=$actual_pvalue;
126 push @output_pos, $line;
127 #print $pvalue , "\n";
128 }
129 elsif (
130
131 ((($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1]) and (int((@{$hash{$gene}{$TF}}[1])-$pos1)/$seq_len)<$OL )
132 and not($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1]))
133
134 ) {#If the actual motif overlaps with the previous motif and the overlaping sequence includes the first position
135 #position and not the first one of the actual motif AND it doesn't surpass the threshold $OL then it will consider the line.
136 #It will store it in the array and its p value it will consider in the next loop.
137
138 $hash{$gene}{$TF}=\@list;
139 $pvalue=$actual_pvalue;
140 push @output_pos, $line;
141 }
142 elsif (
143
144 ((($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1]) and (int((@{$hash{$gene}{$TF}}[1])-$pos1)/$seq_len)>$OL )
145 and not($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1])) and $actual_pvalue<$pvalue
146 #If the actual motif overlaps with the previous motif and the overlaping sequence includes the first
147 #position and not the second one of the actual motif AND it DOES surpass the threshold $OL but the actual motif has a lower p value
148 #than the last considered;then it will consider the line and it will remove the previous motif from the array; considering the motif
149 #with the lowest p value. This p value will consider in the next loop.
150 ) {
151 $hash{$gene}{$TF}=\@list;
152 $pvalue=$actual_pvalue;
153 pop @output_pos;
154 push @output_pos, $line;
155 }
156 elsif (
157
158 ((($pos1>=@{$hash{$gene}{$TF}}[0] and $pos1<=@{$hash{$gene}{$TF}}[1]) )
159 and ($pos2>=@{$hash{$gene}{$TF}}[0] and $pos2<=@{$hash{$gene}{$TF}}[1])) and $actual_pvalue<$pvalue
160
161 ) {
162 $hash{$gene}{$TF}=\@list;
163 $pvalue=$actual_pvalue;
164 pop @output_pos;
165 push @output_pos, $line;
166 }
167
168
169 }
170 elsif ($line!~/^##/ and $cols[6]eq"-") { #same strategy for the motifs located in the minus strand
171 @cols=split;
172 #$TF= substr $cols[8],5,8;
173 $gene=substr $cols[0],0,21;
174 $pos1 = $cols[3];
175 $pos2=$cols[4];
176 @list=();
177 @list=($pos1,$pos2);
178 @sequences= split( "=", $cols[9]);
179 $seq_len = int(length (substr $sequences[1],0,-1));
180 $decimal= substr $cols[8],-16,4;
181 $e=substr $cols[8],-11,3;
182 $decimal =~ s/[^.\d]//g; #This removes all nondigit characters from the string.
183 $actual_pvalue=$decimal*(10**$e);
184
185 if (not exists $hash_negative{$gene}{$TF}) {
186 $hash_negative{$gene}{$TF}=\@list;
187 $pvalue_neg=$actual_pvalue;
188 push @output_neg, $line;
189 }
190
191 elsif (not($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1])
192 and not($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1])) {
193 $pvalue_neg=$actual_pvalue;
194 $hash_negative{$gene}{$TF}=\@list;
195 push @output_neg, $line;
196 }
197
198
199 elsif (
200
201 (not($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1])and
202 ($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1]) and (int($pos2-(@{$hash_negative{$gene}{$TF}}[0]))/$seq_len)<$OL )
203 ) {
204 $pvalue_neg=$actual_pvalue;
205 $hash_negative{$gene}{$TF}=\@list;
206 push @output_neg, $line;
207 }
208 elsif (
209
210 (not($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]) and
211 ($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1]) and (int($pos2-(@{$hash_negative{$gene}{$TF}}[0]))/$seq_len)>$OL and
212 $actual_pvalue<$pvalue_neg)
213 ) {
214 $pvalue=$actual_pvalue;
215 $hash_negative{$gene}{$TF}=\@list;
216 pop @output_neg;
217 push @output_neg, $line;
218 }
219 elsif (
220 ((($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]) and (int((@{$hash_negative{$gene}{$TF}}[1])-$pos1)/$seq_len)<$OL )
221 and not($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1] ))
222 ) {
223 $pvalue_neg=$actual_pvalue;
224 $hash_negative{$gene}{$TF}=\@list;
225 push @output_neg, $line;
226 }
227 elsif (
228 ((($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]) and
229 (int((@{$hash_negative{$gene}{$TF}}[1])-$pos1)/$seq_len)>$OL )
230 and not($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1] )and
231 $actual_pvalue<$pvalue_neg)
232 ) {
233 $pvalue_neg=$actual_pvalue;
234 $hash_negative{$gene}{$TF}=\@list;
235 pop @output_neg;
236 push @output_neg, $line;
237 }
238 elsif (
239 ((($pos1>=@{$hash_negative{$gene}{$TF}}[0] and $pos1<=@{$hash_negative{$gene}{$TF}}[1]))
240 and ($pos2>=@{$hash_negative{$gene}{$TF}}[0] and $pos2<=@{$hash_negative{$gene}{$TF}}[1] )and
241 $actual_pvalue<$pvalue_neg)
242 ) {
243 $pvalue_neg=$actual_pvalue;
244 $hash_negative{$gene}{$TF}=\@list;
245 pop @output_neg;
246 push @output_neg, $line;
247 }
248
249
250 }
251 }
252 foreach my $lines_pos (@output_pos){
253 printf POSITIVE"%s\n", $lines_pos;
254
255 }
256 foreach my $lines_neg (@output_neg){
257 printf NEGATIVE"%s\n", $lines_neg;
258 }