0
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 # a program to compute the frequency of each motif at each window in both upstream and downstream sequences flanking indels
|
|
4 # in a chromosome/genome.
|
|
5 # the first input is a TABULAR format file containing the motif names and sequences, such that the file consists of two
|
|
6 # columns: the left column represents the motif names and the right column represents the motif sequence, one line per motif.
|
|
7 # the second input is a TABULAR format file containing the upstream and downstream sequences flanking indels, one line per indel.
|
|
8 # the fourth input is an integer number representing the window size according to which the upstream and downstream sequences
|
|
9 # flanking each indel will be divided.
|
|
10 # the first output is a TABULAR format file containing the windows into which both upstream and downstream sequences flanking
|
|
11 # indels are divided.
|
|
12 # the second output is a TABULAR format file containing the motifs and their corresponding frequencies at each window in both
|
|
13 # upstream and downstream sequences flanking indels, one line per motif.
|
|
14
|
|
15 use strict;
|
|
16 use warnings;
|
|
17
|
|
18 #variable to handle the falnking sequences information
|
|
19 my $sequence = "";
|
|
20 my $upstreamFlankingSequence = "";
|
|
21 my $downstreamFlankingSequence = "";
|
|
22 my $discardedSequenceLength = 0;
|
|
23 my $lengthOfDownstreamFlankingSequenceAfterTrimming = 0;
|
|
24
|
|
25 #variable to handle the window information
|
|
26 my $window = "";
|
|
27 my $windowStartIndex = 0;
|
|
28 my $windowNumber = 0;
|
|
29 my $totalWindowsNumber = 0;
|
|
30 my $totalNumberOfWindowsInUpstreamSequence = 0;
|
|
31 my $totalNumberOfWindowsInDownstreamSequence = 0;
|
|
32 my $totalWindowsNumberInBothFlankingSequences = 0;
|
|
33 my $totalWindowsNumberInMotifCountersTwoDimArray = 0;
|
|
34 my $upstreamAndDownstreamFlankingSequencesWindows = "";
|
|
35
|
|
36 #variable to handle the motif information
|
|
37 my $motif = "";
|
|
38 my $motifSequence = "";
|
|
39 my $motifNumber = 0;
|
|
40 my $totalMotifsNumber = 0;
|
|
41
|
|
42 #arrays to sotre window and motif data
|
|
43 my @windowsArray = ();
|
|
44 my @motifNamesArray = ();
|
|
45 my @motifSequencesArray = ();
|
|
46 my @motifCountersTwoDimArray = ();
|
|
47
|
|
48 #variables to store line counter values
|
|
49 my $lineCounter1 = 0;
|
|
50 my $lineCounter2 = 0;
|
|
51
|
|
52 # check to make sure having correct files
|
|
53 my $usage = "usage: compute_motifs_frequency.pl [TABULAR.in] [TABULAR.in] [windowSize] [TABULAR.out] [TABULAR.out]\n";
|
|
54 die $usage unless @ARGV == 5;
|
|
55
|
|
56 #get the input and output arguments
|
|
57 my $motifsInputFile = $ARGV[0];
|
|
58 my $indelFlankingSequencesInputFile = $ARGV[1];
|
|
59 my $windowSize = $ARGV[2];
|
|
60 my $indelFlankingSequencesWindowsOutputFile = $ARGV[3];
|
|
61 my $motifFrequenciesOutputFile = $ARGV[4];
|
|
62
|
|
63 #open the input and output files
|
|
64 open (INPUT1, "<", $motifsInputFile) || die("Could not open file $motifsInputFile \n");
|
|
65 open (INPUT2, "<", $indelFlankingSequencesInputFile) || die("Could not open file $indelFlankingSequencesInputFile \n");
|
|
66 open (OUTPUT1, ">", $indelFlankingSequencesWindowsOutputFile) || die("Could not open file $indelFlankingSequencesWindowsOutputFile \n");
|
|
67 open (OUTPUT2, ">", $motifFrequenciesOutputFile) || die("Could not open file $motifFrequenciesOutputFile \n");
|
|
68
|
|
69 #store the motifs input file in the array @motifsData
|
|
70 my @motifsData = <INPUT1>;
|
|
71
|
|
72 #iterated through the motifs (lines) of the motifs input file
|
|
73 foreach $motif (@motifsData){
|
|
74 chomp ($motif);
|
|
75 #print ($motif . "\n");
|
|
76
|
|
77 #split the motif data into its name and its sequence
|
|
78 my @motifNameAndSequenceArray = split(/\t/, $motif);
|
|
79
|
|
80 #store the name of the motif into the array @motifNamesArray
|
|
81 push @motifNamesArray, $motifNameAndSequenceArray[0];
|
|
82
|
|
83 #store the sequence of the motif into the array @motifSequencesArray
|
|
84 push @motifSequencesArray, $motifNameAndSequenceArray[1];
|
|
85 }
|
|
86
|
|
87 #compute the size of the motif names array
|
|
88 $totalMotifsNumber = @motifNamesArray;
|
|
89
|
|
90 #store the input file in the array @sequencesData
|
|
91 my @sequencesData = <INPUT2>;
|
|
92
|
|
93 #iterated through the sequences of the second input file in order to create windwos file
|
|
94 foreach $sequence (@sequencesData){
|
|
95 chomp ($sequence);
|
|
96 $lineCounter1++;
|
|
97
|
|
98 my @indelAndSequenceArray = split(/\t/, $sequence);
|
|
99
|
|
100 #get the upstream falnking sequence
|
|
101 $upstreamFlankingSequence = $indelAndSequenceArray[3];
|
|
102
|
|
103 #if the window size is 0, then the whole upstream will be one window only
|
|
104 if ($windowSize == 0){
|
|
105 $totalNumberOfWindowsInUpstreamSequence = 1;
|
|
106 $windowSize = length ($upstreamFlankingSequence);
|
|
107 }
|
|
108 else{
|
|
109 #compute the total number of windows into which the upstream flanking sequence will be divided
|
|
110 $totalNumberOfWindowsInUpstreamSequence = length ($upstreamFlankingSequence) / $windowSize;
|
|
111
|
|
112 #compute the length of the subsequence to be discared from the upstream flanking sequence if any
|
|
113 $discardedSequenceLength = length ($upstreamFlankingSequence) % $windowSize;
|
|
114
|
|
115 #check if the sequence could be split into windows of equal sizes
|
|
116 if ($discardedSequenceLength != 0) {
|
|
117 #trim the upstream flanking sequence
|
|
118 $upstreamFlankingSequence = substr($upstreamFlankingSequence, $discardedSequenceLength);
|
|
119 }
|
|
120 }
|
|
121
|
|
122 #split the upstream flanking sequence into windows
|
|
123 for ($windowNumber = 0; $windowNumber < $totalNumberOfWindowsInUpstreamSequence; $windowNumber++){
|
|
124 $windowStartIndex = $windowNumber * $windowSize;
|
|
125 print OUTPUT1 (substr($upstreamFlankingSequence, $windowStartIndex, $windowSize) . "\t");
|
|
126 }
|
|
127
|
|
128 #add a column representing the indel
|
|
129 print OUTPUT1 ("indel" . "\t");
|
|
130
|
|
131 #get the downstream falnking sequence
|
|
132 $downstreamFlankingSequence = $indelAndSequenceArray[4];
|
|
133
|
|
134 #if the window size is 0, then the whole upstream will be one window only
|
|
135 if ($windowSize == 0){
|
|
136 $totalNumberOfWindowsInDownstreamSequence = 1;
|
|
137 $windowSize = length ($downstreamFlankingSequence);
|
|
138 }
|
|
139 else{
|
|
140 #compute the total number of windows into which the downstream flanking sequence will be divided
|
|
141 $totalNumberOfWindowsInDownstreamSequence = length ($downstreamFlankingSequence) / $windowSize;
|
|
142
|
|
143 #compute the length of the subsequence to be discared from the upstream flanking sequence if any
|
|
144 $discardedSequenceLength = length ($downstreamFlankingSequence) % $windowSize;
|
|
145
|
|
146 #check if the sequence could be split into windows of equal sizes
|
|
147 if ($discardedSequenceLength != 0) {
|
|
148 #compute the length of the sequence to be discarded
|
|
149 $lengthOfDownstreamFlankingSequenceAfterTrimming = length ($downstreamFlankingSequence) - $discardedSequenceLength;
|
|
150
|
|
151 #trim the downstream flanking sequence
|
|
152 $downstreamFlankingSequence = substr($downstreamFlankingSequence, 0, $lengthOfDownstreamFlankingSequenceAfterTrimming);
|
|
153 }
|
|
154 }
|
|
155
|
|
156 #split the downstream flanking sequence into windows
|
|
157 for ($windowNumber = 0; $windowNumber < $totalNumberOfWindowsInDownstreamSequence; $windowNumber++){
|
|
158 $windowStartIndex = $windowNumber * $windowSize;
|
|
159 print OUTPUT1 (substr($downstreamFlankingSequence, $windowStartIndex, $windowSize) . "\t");
|
|
160 }
|
|
161
|
|
162 print OUTPUT1 ("\n");
|
|
163 }
|
|
164
|
|
165 #compute the total number of windows on both upstream and downstream sequences flanking the indel
|
|
166 $totalWindowsNumberInBothFlankingSequences = $totalNumberOfWindowsInUpstreamSequence + $totalNumberOfWindowsInDownstreamSequence;
|
|
167
|
|
168 #add an additional cell to store the name of the motif and another one for the indel itself
|
|
169 $totalWindowsNumberInMotifCountersTwoDimArray = $totalWindowsNumberInBothFlankingSequences + 1 + 1;
|
|
170
|
|
171 #initialize the two dimensional array $motifCountersTwoDimArray. the first column will be initialized with motif names
|
|
172 for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){
|
|
173
|
|
174 for ($windowNumber = 0; $windowNumber < $totalWindowsNumberInMotifCountersTwoDimArray; $windowNumber++){
|
|
175
|
|
176 if ($windowNumber == 0){
|
|
177 $motifCountersTwoDimArray [$motifNumber] [0] = $motifNamesArray[$motifNumber];
|
|
178 }
|
|
179 elsif ($windowNumber == $totalNumberOfWindowsInUpstreamSequence + 1){
|
|
180 $motifCountersTwoDimArray [$motifNumber] [$windowNumber] = "indel";
|
|
181 }
|
|
182 else{
|
|
183 $motifCountersTwoDimArray [$motifNumber] [$windowNumber] = 0;
|
|
184 }
|
|
185 }
|
|
186 }
|
|
187
|
|
188 close(OUTPUT1);
|
|
189
|
|
190 #open the file the contains the windows of the upstream and downstream flanking sequences, which is actually the first output file
|
|
191 open (INPUT3, "<", $indelFlankingSequencesWindowsOutputFile) || die("Could not open file $indelFlankingSequencesWindowsOutputFile \n");
|
|
192
|
|
193 #store the first output file containing the windows of both upstream and downstream flanking sequences in the array @windowsData
|
|
194 my @windowsData = <INPUT3>;
|
|
195
|
|
196 #iterated through the lines of the first output file. Each line represents
|
|
197 #the windows of the upstream and downstream flanking sequences of an indel
|
|
198 foreach $upstreamAndDownstreamFlankingSequencesWindows (@windowsData){
|
|
199
|
|
200 chomp ($upstreamAndDownstreamFlankingSequencesWindows);
|
|
201 $lineCounter2++;
|
|
202
|
|
203 #split both upstream and downstream flanking sequences into their windows
|
|
204 my @windowsArray = split(/\t/, $upstreamAndDownstreamFlankingSequencesWindows);
|
|
205
|
|
206 $totalWindowsNumber = @windowsArray;
|
|
207
|
|
208 #iterate through the windows to search for matched motifs and increment their corresponding counters accordingly
|
|
209 WINDOWS:
|
|
210 for ($windowNumber = 0; $windowNumber < $totalWindowsNumber; $windowNumber++){
|
|
211
|
|
212 #get the window
|
|
213 $window = $windowsArray[$windowNumber];
|
|
214
|
|
215 #if the window is the one that contains the indel, then skip the indel window
|
|
216 if ($window eq "indel") {
|
|
217 next WINDOWS;
|
|
218 }
|
|
219 else{ #iterated through the motif sequences to check their occurrences in the winodw
|
|
220 #and increment their corresponding counters accordingly
|
|
221
|
|
222 for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){
|
|
223 #get the motif sequence
|
|
224 $motifSequence = $motifSequencesArray[$motifNumber];
|
|
225
|
|
226 #if the motif is found in the window, then increment its corresponding counter
|
|
227 if ($window =~ m/$motifSequence/i){
|
|
228 $motifCountersTwoDimArray [$motifNumber] [$windowNumber + 1]++;
|
|
229 }
|
|
230 }
|
|
231 }
|
|
232 }
|
|
233 }
|
|
234
|
|
235 #store the motif counters values in the second output file
|
|
236 for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){
|
|
237
|
|
238 for ($windowNumber = 0; $windowNumber <= $totalWindowsNumber; $windowNumber++){
|
|
239
|
|
240 print OUTPUT2 $motifCountersTwoDimArray [$motifNumber] [$windowNumber] . "\t";
|
|
241 #print ($motifCountersTwoDimArray [$motifNumber] [$windowNumber] . " ");
|
|
242 }
|
|
243 print OUTPUT2 "\n";
|
|
244 #print ("\n");
|
|
245 }
|
|
246
|
|
247 #close the input and output files
|
|
248 close(OUTPUT2);
|
|
249 close(OUTPUT1);
|
|
250 close(INPUT3);
|
|
251 close(INPUT2);
|
|
252 close(INPUT1); |