annotate tools/regVariation/categorize_elements_satisfying_criteria.pl @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
#!/usr/bin/perl -w

# The program takes as input a set of categories, such that each category
# contains many elements. It also takes a table relating elements with
# criteria, such that each element is assigned a number representing the
# number of times the element satisfies a certain criterion.
#
# Inputs and output (all TABULAR, i.e. tab-separated):
#   1. Categories file: the left column holds the category name and all other
#      columns hold the names of the elements belonging to that category.
#   2. Elements file: the first line holds the names of the criteria (one per
#      column, with no leading cell); every following line holds an element
#      name in the left column followed by one number per criterion.
#   3. Output file: one header line (an empty cell followed by the criteria
#      names) and then one line per category, holding the category name
#      followed by, for each criterion, the sum of the positive values of the
#      elements that belong to that category.
#
# Notes on behavior:
#   - If an element is listed under several categories, the category that
#     appears last in the categories file wins.
#   - Elements that do not belong to any category are ignored.
#   - Missing, empty or non-positive cells contribute nothing to the totals.
#   - The program dies if it is not given exactly three arguments, if a file
#     cannot be opened, if the elements file has no header line, or if the
#     output file cannot be written.

use strict;
use warnings;

# check to make sure having correct files
my $usage = "usage: categorize_elements_satisfying_criteria.pl [TABULAR.in] [TABULAR.in] [TABULAR.out]\n";
die $usage unless @ARGV == 3;

my ($categories_file, $elements_file, $output_file) = @ARGV;

# open the input and output files
open(my $categories_fh, '<', $categories_file)
    or die "Could not open file $categories_file: $!\n";
open(my $elements_fh, '<', $elements_file)
    or die "Could not open file $elements_file: $!\n";
open(my $output_fh, '>', $output_file)
    or die "Could not open file $output_file: $!\n";

# ---------------------------------------------------------------------------
# First pass: read the categories and the elements assigned to each of them.
# ---------------------------------------------------------------------------
my @category_names;    # category names, in file order (one output row each)
my %row_index_of;      # category name => index of its row in @category_names
my %category_of;       # element name  => name of the category it belongs to

while (my $line = <$categories_fh>) {

    # strip the line terminator, tolerating CR/LF files so that the last
    # element name of a line does not carry a stray "\r"
    $line =~ s/\r?\n\z//;

    my ($category, @members) = split /\t/, $line;

    # a blank line yields no fields at all; it does not describe a category
    next unless defined $category;

    # when a category name is repeated, counts go to its first row
    $row_index_of{$category} = scalar @category_names
        unless exists $row_index_of{$category};
    push @category_names, $category;

    for my $member (@members) {
        $category_of{$member} = $category;
    }
}

# ---------------------------------------------------------------------------
# Second pass: read the criteria names, then accumulate the element values
# into the row of the category each element belongs to.
# ---------------------------------------------------------------------------
my @criteria;          # criteria names taken from the header line
my @counts;            # $counts[$row][$col]: total for category/criterion
my $header_seen = 0;

while (my $line = <$elements_fh>) {
    $line =~ s/\r?\n\z//;

    my @fields = split /\t/, $line;

    # the first line names the criteria; start every counter at zero
    if (!$header_seen) {
        $header_seen = 1;
        @criteria    = @fields;
        @counts      = map { [ (0) x @criteria ] } @category_names;
        next;
    }

    my ($element, @values) = @fields;

    # skip blank lines
    next unless defined $element;

    # skip elements that are not assigned to any category; without this
    # check the undefined row index would be used as index 0 and the values
    # would be added to the first category
    my $category = $category_of{$element};
    next unless defined $category;

    my $row = $row_index_of{$category};

    for my $col (0 .. $#criteria) {
        my $value = $values[$col];

        # short rows and empty cells count as "criterion not satisfied"
        next unless defined $value && $value ne '';
        next unless $value > 0;

        $counts[$row][$col] += $value;
    }
}

die "File $elements_file is empty: expected a first line holding the names of the criteria\n"
    unless $header_seen;

# ---------------------------------------------------------------------------
# Write the result table.
# ---------------------------------------------------------------------------

# header line: an empty cell above the category names, then the criteria
print {$output_fh} "\t" . join("\t", @criteria) . "\n";

# one line per category: its name followed by its total for each criterion
for my $row (0 .. $#category_names) {
    print {$output_fh} join("\t", $category_names[$row], @{ $counts[$row] }) . "\n";
}

# close the input and output files; buffered write errors surface at close
close($output_fh) or die "Could not write file $output_file: $!\n";
close($elements_fh);
close($categories_fh);