Mercurial > repos > xuebing > sharplabtool
diff tools/regVariation/compute_motif_frequencies_for_all_motifs.pl @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/perl -w

# Compute, for every motif, in how many of the windows flanking each indel the
# motif occurs, separately for the upstream and the downstream flanking sequence.
#
# Command-line arguments (all four are required, in this order):
#   1. TABULAR motifs file: one motif per line, two tab-separated columns,
#      the motif name followed by the motif sequence. The sequence is used as
#      a case-insensitive regular expression.
#   2. TABULAR windows file: one line per indel, holding the tab-separated
#      windows into which the upstream and downstream flanking sequences have
#      been divided. The middle column (as determined from the first line) is
#      treated as the indel itself and is never searched.
#   3. An integer: the number of windows to consider on each side of the
#      indel. Zero or a negative value is treated as 1. When fewer windows
#      exist on a side, only the existing ones are searched.
#   4. Path of the TABULAR output file.
#
# Output: three tab-separated columns per line: the motif name, the number of
# considered upstream windows containing the motif, and the number of
# considered downstream windows containing the motif. A window is counted once
# no matter how many times the motif occurs inside it. Lines are grouped by
# motif, one line per indel, so the total number of lines is
# number of motifs x number of indels.

use strict;
use warnings;

# Entry point. Takes the four command-line arguments described above, writes
# the output file and returns 0. Dies with a message on bad arguments or on
# any file that cannot be opened (or, for the output file, closed).
sub main {
    my @arguments = @_;

    my $usage = "usage: compute_motif_frequencies_for_all_motifs.pl [TABULAR.in] [TABULAR.in] [windowSize] [TABULAR.out] \n";
    die $usage unless @arguments == 4;

    my ($motifs_file, $windows_file, $considered_windows, $output_file) = @arguments;

    # A non-numeric count would otherwise be silently coerced to 0 (and then 1).
    die "windowSize must be an integer\n" . $usage
        unless $considered_windows =~ /\A\s*[+-]?\d+\s*\z/;
    $considered_windows = int($considered_windows);
    $considered_windows = 1 if $considered_windows <= 0;

    my @motifs = read_motifs($motifs_file);
    my @flanks = read_flanking_windows($windows_file, $considered_windows);

    open(my $output, ">", $output_file)
        or die("Could not open file $output_file: $!\n");

    for my $motif (@motifs) {
        my ($motif_name, $motif_sequence) = @{$motif};

        # Compile once per motif instead of once per window.
        my $motif_regex = qr/$motif_sequence/i;

        for my $flank (@flanks) {
            my ($upstream_windows, $downstream_windows) = @{$flank};
            print {$output} join("\t",
                $motif_name,
                count_matching_windows($motif_regex, $upstream_windows),
                count_matching_windows($motif_regex, $downstream_windows)), "\n";
        }
    }

    # Buffered write errors only surface when the handle is closed.
    close($output) or die("Could not close file $output_file: $!\n");

    return 0;
}

# Read the motifs file and return a list of [name, sequence] pairs in file
# order. Blank lines are skipped silently; lines without a non-empty sequence
# column are skipped with a warning, because an empty pattern would make Perl
# reuse the last successfully matched regular expression.
sub read_motifs {
    my ($path) = @_;

    open(my $input, "<", $path) or die("Could not open file $path: $!\n");

    my @motifs;
    while (my $line = <$input>) {
        $line =~ s/\r?\n\z//;
        next if $line =~ /\A\s*\z/;

        my ($name, $sequence) = split(/\t/, $line);
        unless (defined $sequence && length $sequence) {
            warn "Skipping malformed motif line $.: expected name<TAB>sequence\n";
            next;
        }
        push @motifs, [$name, $sequence];
    }
    close($input);

    return @motifs;
}

# Read the windows file and return, for every line, a pair
# [\@upstream_windows, \@downstream_windows] restricted to the considered
# windows. The index of the indel column is derived from the first line only
# and reused for all following lines.
sub read_flanking_windows {
    my ($path, $considered_windows) = @_;

    open(my $input, "<", $path) or die("Could not open file $path: $!\n");

    my @flanks;
    my $indel_index;
    while (my $line = <$input>) {
        $line =~ s/\r?\n\z//;
        my @windows = split(/\t/, $line);

        $indel_index = indel_index_for(scalar @windows)
            unless defined $indel_index;

        push @flanks,
            [ select_flanking_windows(\@windows, $indel_index, $considered_windows) ];
    }
    close($input);

    return @flanks;
}

# Return the zero-based index of the middle (indel) column for a line with the
# given number of columns. The result is always a non-negative integer.
sub indel_index_for {
    my ($total_windows) = @_;

    return 0 if $total_windows < 1;
    return int(($total_windows - 1) / 2);
}

# Return (\@upstream, \@downstream): up to $considered_windows windows on each
# side of $indel_index. Both ranges are clamped to the bounds of the line, so
# a negative subscript can never wrap around to the other end of the array and
# a subscript past the end can never yield an undefined window.
sub select_flanking_windows {
    my ($windows, $indel_index, $considered_windows) = @_;

    my $last_index = $#{$windows};

    my $upstream_first = $indel_index - $considered_windows;
    $upstream_first = 0 if $upstream_first < 0;
    my $upstream_last = $indel_index - 1;
    $upstream_last = $last_index if $upstream_last > $last_index;

    my $downstream_first = $indel_index + 1;
    my $downstream_last  = $indel_index + $considered_windows;
    $downstream_last = $last_index if $downstream_last > $last_index;

    # A range whose start exceeds its end is empty, giving an empty slice.
    my @upstream   = @{$windows}[$upstream_first .. $upstream_last];
    my @downstream = @{$windows}[$downstream_first .. $downstream_last];

    return (\@upstream, \@downstream);
}

# Return the number of windows in @$windows that match $motif_regex.
sub count_matching_windows {
    my ($motif_regex, $windows) = @_;

    return scalar grep { $_ =~ $motif_regex } @{$windows};
}

# Run as a program only when executed directly, not when loaded by other code.
exit main(@ARGV) unless caller;

1;