Mercurial > repos > xuebing > sharplabtool
diff tools/regVariation/categorize_elements_satisfying_criteria.pl @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/regVariation/categorize_elements_satisfying_criteria.pl Fri Mar 09 19:37:19 2012 -0500 @@ -0,0 +1,172 @@ +#!/usr/bin/perl -w + +# The program takes as input a set of categories, such that each category contains many elements. +# It also takes a table relating elements with criteria, such that each element is assigned a number +# representing the number of times the element satisfies a certain criterion. +# The first input is a TABULAR format file, such that the left column represents the name of categories and, +# all other columns represent the names of elements. +# The second input is a TABULAR format file relating elements with criteria, such that the first line +# represents the names of criteria and the left column represents the names of elements. +# The output is a TABULAR format file relating catergories with criteria, such that each categoy is +# assigned a number representing the total number of times its elements satisfies a certain criterion. +# Each category is assigned as many numbers as criteria. + +use strict; +use warnings; + +#variables to handle information of the categories input file +my @categoryElementsArray = (); +my @categoriesArray = (); +my $categoryMemberNames; +my $categoryName; +my %categoryMembersHash = (); +my $memberNumber = 0; +my $totalMembersNumber = 0; +my $totalCategoriesNumber = 0; +my @categoryCountersTwoDimArray = (); +my $lineCounter1 = 0; + +#variables to handle information of the criteria and elements data input file +my $elementLine; +my @elementDataArray = (); +my $elementName; +my @criteriaArray = (); +my $criteriaNumber = 0; +my $totalCriteriaNumber = 0; +my $lineCounter2 = 0; + +#variable representing the row and column indices used to store results into a two-dimensional array +my $row = 0; +my $column = 0; + +# check to make sure having correct files +my $usage = "usage: categorize_motifs_significance.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] \n"; +die $usage unless @ARGV == 3; + +#get the categories input file +my $categories_inputFile = $ARGV[0]; + +#get the criteria and data input file +my $elements_data_inputFile = $ARGV[1]; + +#get the output file +my $categorized_data_outputFile = $ARGV[2]; + +#open the input and output files +open (INPUT1, "<", $categories_inputFile) || die("Could not open file $categories_inputFile \n"); +open (INPUT2, "<", $elements_data_inputFile ) || die("Could not open file $elements_data_inputFile \n"); +open (OUTPUT, ">", $categorized_data_outputFile) || die("Could not open file $categorized_data_outputFile \n"); + +#store the first input file into an array +my @categoriesData = <INPUT1>; + +#reset the value of $lineCounter1 to 0 +$lineCounter1 = 0; + +#iterate through the first input file to get the names of categories and their corresponding elements +foreach $categoryMemberNames (@categoriesData){ + chomp ($categoryMemberNames); + + @categoryElementsArray = split(/\t/, $categoryMemberNames); + + #store the name of the current category into an array + $categoriesArray [$lineCounter1] = $categoryElementsArray[0]; + + #store the name of the current category into a two-dimensional array + $categoryCountersTwoDimArray [$lineCounter1] [0] = $categoryElementsArray[0]; + + #get the total number of elements in the current category + $totalMembersNumber = @categoryElementsArray; + + #store the names of categories and their corresponding elements into a hash + for ($memberNumber = 1; $memberNumber < $totalMembersNumber; $memberNumber++) { + + $categoryMembersHash{$categoryElementsArray[$memberNumber]} = $categoriesArray[$lineCounter1]; + } + + $lineCounter1++; +} + +#store the second input file into an array +my @elementsData = <INPUT2>; + +#reset the value of $lineCounter2 to 0 +$lineCounter2 = 0; + +#iterate through the second input file in order to count the number of elements +#in each category that satisfy each criterion +foreach $elementLine (@elementsData){ + chomp ($elementLine); + + $lineCounter2++; + + @elementDataArray = split(/\t/, $elementLine); + + #if at the first line, get the total number of criteria and the total + #number of catergories and initialize the two-dimensional array + if ($lineCounter2 == 1){ + @criteriaArray = @elementDataArray; + $totalCriteriaNumber = @elementDataArray; + + $totalCategoriesNumber = @categoriesArray; + + #initialize the two-dimensional array + for ($row = 0; $row < $totalCategoriesNumber; $row++) { + + for ($column = 1; $column <= $totalCriteriaNumber; $column++) { + + $categoryCountersTwoDimArray [$row][$column] = 0; + } + } + } + else{ + #get the element data + $elementName = $elementDataArray[0]; + + #do the counting and store the result in the two-dimensional array + for ($criteriaNumber = 0; $criteriaNumber < $totalCriteriaNumber; $criteriaNumber++) { + + if ($elementDataArray[$criteriaNumber + 1] > 0){ + + $categoryName = $categoryMembersHash{$elementName}; + + my ($categoryIndex) = grep $categoriesArray[$_] eq $categoryName, 0 .. $#categoriesArray; + + $categoryCountersTwoDimArray [$categoryIndex] [$criteriaNumber + 1] = $categoryCountersTwoDimArray [$categoryIndex] [$criteriaNumber + 1] + $elementDataArray[$criteriaNumber + 1]; + } + } + } +} + +print OUTPUT "\t"; + +#store the criteria names into the output file +for ($column = 1; $column <= $totalCriteriaNumber; $column++) { + + if ($column < $totalCriteriaNumber){ + print OUTPUT $criteriaArray[$column - 1] . "\t"; + } + else{ + print OUTPUT $criteriaArray[$column - 1] . "\n"; + } +} + +#store the category names and their corresponding number of elements satisfying criteria into the output file +for ($row = 0; $row < $totalCategoriesNumber; $row++) { + + for ($column = 0; $column <= $totalCriteriaNumber; $column++) { + + if ($column < $totalCriteriaNumber){ + print OUTPUT $categoryCountersTwoDimArray [$row][$column] . "\t"; + } + else{ + print OUTPUT $categoryCountersTwoDimArray [$row][$column] . "\n"; + } + } +} + +#close the input and output file +close(OUTPUT); +close(INPUT2); +close(INPUT1); +