Mercurial > repos > xuebing > sharplabtool
diff tools/regVariation/compute_motifs_frequency.pl @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/regVariation/compute_motifs_frequency.pl Fri Mar 09 19:37:19 2012 -0500 @@ -0,0 +1,252 @@ +#!/usr/bin/perl -w + +# a program to compute the frequency of each motif at each window in both upstream and downstream sequences flanking indels +# in a chromosome/genome. +# the first input is a TABULAR format file containing the motif names and sequences, such that the file consists of two +# columns: the left column represents the motif names and the right column represents the motif sequence, one line per motif. +# the second input is a TABULAR format file containing the upstream and downstream sequences flanking indels, one line per indel. +# the fourth input is an integer number representing the window size according to which the upstream and downstream sequences +# flanking each indel will be divided. +# the first output is a TABULAR format file containing the windows into which both upstream and downstream sequences flanking +# indels are divided. +# the second output is a TABULAR format file containing the motifs and their corresponding frequencies at each window in both +# upstream and downstream sequences flanking indels, one line per motif. + +use strict; +use warnings; + +#variable to handle the falnking sequences information +my $sequence = ""; +my $upstreamFlankingSequence = ""; +my $downstreamFlankingSequence = ""; +my $discardedSequenceLength = 0; +my $lengthOfDownstreamFlankingSequenceAfterTrimming = 0; + +#variable to handle the window information +my $window = ""; +my $windowStartIndex = 0; +my $windowNumber = 0; +my $totalWindowsNumber = 0; +my $totalNumberOfWindowsInUpstreamSequence = 0; +my $totalNumberOfWindowsInDownstreamSequence = 0; +my $totalWindowsNumberInBothFlankingSequences = 0; +my $totalWindowsNumberInMotifCountersTwoDimArray = 0; +my $upstreamAndDownstreamFlankingSequencesWindows = ""; + +#variable to handle the motif information +my $motif = ""; +my $motifSequence = ""; +my $motifNumber = 0; +my $totalMotifsNumber = 0; + +#arrays to sotre window and motif data +my @windowsArray = (); +my @motifNamesArray = (); +my @motifSequencesArray = (); +my @motifCountersTwoDimArray = (); + +#variables to store line counter values +my $lineCounter1 = 0; +my $lineCounter2 = 0; + +# check to make sure having correct files +my $usage = "usage: compute_motifs_frequency.pl [TABULAR.in] [TABULAR.in] [windowSize] [TABULAR.out] [TABULAR.out]\n"; +die $usage unless @ARGV == 5; + +#get the input and output arguments +my $motifsInputFile = $ARGV[0]; +my $indelFlankingSequencesInputFile = $ARGV[1]; +my $windowSize = $ARGV[2]; +my $indelFlankingSequencesWindowsOutputFile = $ARGV[3]; +my $motifFrequenciesOutputFile = $ARGV[4]; + +#open the input and output files +open (INPUT1, "<", $motifsInputFile) || die("Could not open file $motifsInputFile \n"); +open (INPUT2, "<", $indelFlankingSequencesInputFile) || die("Could not open file $indelFlankingSequencesInputFile \n"); +open (OUTPUT1, ">", $indelFlankingSequencesWindowsOutputFile) || die("Could not open file $indelFlankingSequencesWindowsOutputFile \n"); +open (OUTPUT2, ">", $motifFrequenciesOutputFile) || die("Could not open file $motifFrequenciesOutputFile \n"); + +#store the motifs input file in the array @motifsData +my @motifsData = <INPUT1>; + +#iterated through the motifs (lines) of the motifs input file +foreach $motif (@motifsData){ + chomp ($motif); + #print ($motif . "\n"); + + #split the motif data into its name and its sequence + my @motifNameAndSequenceArray = split(/\t/, $motif); + + #store the name of the motif into the array @motifNamesArray + push @motifNamesArray, $motifNameAndSequenceArray[0]; + + #store the sequence of the motif into the array @motifSequencesArray + push @motifSequencesArray, $motifNameAndSequenceArray[1]; +} + +#compute the size of the motif names array +$totalMotifsNumber = @motifNamesArray; + +#store the input file in the array @sequencesData +my @sequencesData = <INPUT2>; + +#iterated through the sequences of the second input file in order to create windwos file +foreach $sequence (@sequencesData){ + chomp ($sequence); + $lineCounter1++; + + my @indelAndSequenceArray = split(/\t/, $sequence); + + #get the upstream falnking sequence + $upstreamFlankingSequence = $indelAndSequenceArray[3]; + + #if the window size is 0, then the whole upstream will be one window only + if ($windowSize == 0){ + $totalNumberOfWindowsInUpstreamSequence = 1; + $windowSize = length ($upstreamFlankingSequence); + } + else{ + #compute the total number of windows into which the upstream flanking sequence will be divided + $totalNumberOfWindowsInUpstreamSequence = length ($upstreamFlankingSequence) / $windowSize; + + #compute the length of the subsequence to be discared from the upstream flanking sequence if any + $discardedSequenceLength = length ($upstreamFlankingSequence) % $windowSize; + + #check if the sequence could be split into windows of equal sizes + if ($discardedSequenceLength != 0) { + #trim the upstream flanking sequence + $upstreamFlankingSequence = substr($upstreamFlankingSequence, $discardedSequenceLength); + } + } + + #split the upstream flanking sequence into windows + for ($windowNumber = 0; $windowNumber < $totalNumberOfWindowsInUpstreamSequence; $windowNumber++){ + $windowStartIndex = $windowNumber * $windowSize; + print OUTPUT1 (substr($upstreamFlankingSequence, $windowStartIndex, $windowSize) . "\t"); + } + + #add a column representing the indel + print OUTPUT1 ("indel" . "\t"); + + #get the downstream falnking sequence + $downstreamFlankingSequence = $indelAndSequenceArray[4]; + + #if the window size is 0, then the whole upstream will be one window only + if ($windowSize == 0){ + $totalNumberOfWindowsInDownstreamSequence = 1; + $windowSize = length ($downstreamFlankingSequence); + } + else{ + #compute the total number of windows into which the downstream flanking sequence will be divided + $totalNumberOfWindowsInDownstreamSequence = length ($downstreamFlankingSequence) / $windowSize; + + #compute the length of the subsequence to be discared from the upstream flanking sequence if any + $discardedSequenceLength = length ($downstreamFlankingSequence) % $windowSize; + + #check if the sequence could be split into windows of equal sizes + if ($discardedSequenceLength != 0) { + #compute the length of the sequence to be discarded + $lengthOfDownstreamFlankingSequenceAfterTrimming = length ($downstreamFlankingSequence) - $discardedSequenceLength; + + #trim the downstream flanking sequence + $downstreamFlankingSequence = substr($downstreamFlankingSequence, 0, $lengthOfDownstreamFlankingSequenceAfterTrimming); + } + } + + #split the downstream flanking sequence into windows + for ($windowNumber = 0; $windowNumber < $totalNumberOfWindowsInDownstreamSequence; $windowNumber++){ + $windowStartIndex = $windowNumber * $windowSize; + print OUTPUT1 (substr($downstreamFlankingSequence, $windowStartIndex, $windowSize) . "\t"); + } + + print OUTPUT1 ("\n"); +} + +#compute the total number of windows on both upstream and downstream sequences flanking the indel +$totalWindowsNumberInBothFlankingSequences = $totalNumberOfWindowsInUpstreamSequence + $totalNumberOfWindowsInDownstreamSequence; + +#add an additional cell to store the name of the motif and another one for the indel itself +$totalWindowsNumberInMotifCountersTwoDimArray = $totalWindowsNumberInBothFlankingSequences + 1 + 1; + +#initialize the two dimensional array $motifCountersTwoDimArray. the first column will be initialized with motif names +for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){ + + for ($windowNumber = 0; $windowNumber < $totalWindowsNumberInMotifCountersTwoDimArray; $windowNumber++){ + + if ($windowNumber == 0){ + $motifCountersTwoDimArray [$motifNumber] [0] = $motifNamesArray[$motifNumber]; + } + elsif ($windowNumber == $totalNumberOfWindowsInUpstreamSequence + 1){ + $motifCountersTwoDimArray [$motifNumber] [$windowNumber] = "indel"; + } + else{ + $motifCountersTwoDimArray [$motifNumber] [$windowNumber] = 0; + } + } +} + +close(OUTPUT1); + +#open the file the contains the windows of the upstream and downstream flanking sequences, which is actually the first output file +open (INPUT3, "<", $indelFlankingSequencesWindowsOutputFile) || die("Could not open file $indelFlankingSequencesWindowsOutputFile \n"); + +#store the first output file containing the windows of both upstream and downstream flanking sequences in the array @windowsData +my @windowsData = <INPUT3>; + +#iterated through the lines of the first output file. Each line represents +#the windows of the upstream and downstream flanking sequences of an indel +foreach $upstreamAndDownstreamFlankingSequencesWindows (@windowsData){ + + chomp ($upstreamAndDownstreamFlankingSequencesWindows); + $lineCounter2++; + + #split both upstream and downstream flanking sequences into their windows + my @windowsArray = split(/\t/, $upstreamAndDownstreamFlankingSequencesWindows); + + $totalWindowsNumber = @windowsArray; + + #iterate through the windows to search for matched motifs and increment their corresponding counters accordingly + WINDOWS: + for ($windowNumber = 0; $windowNumber < $totalWindowsNumber; $windowNumber++){ + + #get the window + $window = $windowsArray[$windowNumber]; + + #if the window is the one that contains the indel, then skip the indel window + if ($window eq "indel") { + next WINDOWS; + } + else{ #iterated through the motif sequences to check their occurrences in the winodw + #and increment their corresponding counters accordingly + + for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){ + #get the motif sequence + $motifSequence = $motifSequencesArray[$motifNumber]; + + #if the motif is found in the window, then increment its corresponding counter + if ($window =~ m/$motifSequence/i){ + $motifCountersTwoDimArray [$motifNumber] [$windowNumber + 1]++; + } + } + } + } +} + +#store the motif counters values in the second output file +for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){ + + for ($windowNumber = 0; $windowNumber <= $totalWindowsNumber; $windowNumber++){ + + print OUTPUT2 $motifCountersTwoDimArray [$motifNumber] [$windowNumber] . "\t"; + #print ($motifCountersTwoDimArray [$motifNumber] [$windowNumber] . " "); + } + print OUTPUT2 "\n"; + #print ("\n"); +} + +#close the input and output files +close(OUTPUT2); +close(OUTPUT1); +close(INPUT3); +close(INPUT2); +close(INPUT1); \ No newline at end of file