#!/usr/bin/perl -w

# The program takes as input a set of categories, such that each category contains many elements.
# It also takes a table relating elements with criteria, such that each element is assigned a number
# representing the number of times the element satisfies a certain criterion.
# The first input is a TABULAR format file, such that the left column represents the names of
# categories and all other columns represent the names of elements.
# The second input is a TABULAR format file relating elements with criteria, such that the first line
# represents the names of criteria and the left column represents the names of elements.
# The output is a TABULAR format file relating categories with criteria, such that each category is
# assigned a number representing the total number of times its elements satisfy a certain criterion.
# Each category is assigned as many numbers as criteria.
#
# How the inputs are interpreted:
#   * The i-th field of the header line of the second file is paired with field i+1 of every data
#     line (field 0 of a data line is the element name), i.e. the header line is read as a plain
#     list of criteria names with no leading cell above the element-name column.
#   * Only values greater than 0 contribute to a total; missing or empty cells contribute nothing.
#   * If an element is listed under several categories, the last listing wins.
#   * If a category name occurs on several lines, every such line produces an output row, but all
#     counts are accumulated in the row of the first occurrence.
#   * Elements of the second file that belong to no category are ignored.
#   * Blank lines in either input are ignored (except that the very first line of the second file
#     is always taken as the header line).

use strict;
use warnings;

# check to make sure having correct files
my $usage = "usage: categorize_motifs_significance.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] \n";
die $usage unless @ARGV == 3;

# categories input file, criteria/data input file, output file (in that order)
my ($categories_inputFile, $elements_data_inputFile, $categorized_data_outputFile) = @ARGV;

# open the input and output files
open(my $categoriesIn, "<", $categories_inputFile)
    or die("Could not open file $categories_inputFile: $!\n");
open(my $elementsIn, "<", $elements_data_inputFile)
    or die("Could not open file $elements_data_inputFile: $!\n");
open(my $out, ">", $categorized_data_outputFile)
    or die("Could not open file $categorized_data_outputFile: $!\n");

#--------------------------------------------------------------------------------------------------
# Read the categories file.
#--------------------------------------------------------------------------------------------------
my @categoriesArray     = ();    # category names, one per input line, in input order
my %categoryIndexOf     = ();    # category name -> output row index of its first occurrence
my %categoryMembersHash = ();    # element name  -> name of the category it belongs to

while (my $categoryLine = <$categoriesIn>) {
    # strip LF as well as CRLF line endings so the last element name stays clean
    $categoryLine =~ s/[\r\n]+\z//;

    my ($categoryName, @members) = split(/\t/, $categoryLine);
    next unless defined $categoryName;    # blank line: nothing to record

    # remember only the first row of a repeated category name; all its counts go there
    $categoryIndexOf{$categoryName} = scalar @categoriesArray
        unless exists $categoryIndexOf{$categoryName};
    push @categoriesArray, $categoryName;

    foreach my $member (@members) {
        $categoryMembersHash{$member} = $categoryName;
    }
}

#--------------------------------------------------------------------------------------------------
# Read the criteria header line of the elements file.
#--------------------------------------------------------------------------------------------------
my @criteriaArray = ();
my $headerLine    = <$elementsIn>;
if (defined $headerLine) {
    $headerLine =~ s/[\r\n]+\z//;
    @criteriaArray = split(/\t/, $headerLine);
}
my $totalCriteriaNumber = scalar @criteriaArray;

# Result table: one row per category line; column 0 holds the category name and
# columns 1..N hold the running totals for the N criteria, all starting at 0.
my @categoryCountersTwoDimArray =
    map { [ $_, (0) x $totalCriteriaNumber ] } @categoriesArray;

#--------------------------------------------------------------------------------------------------
# Accumulate the per-element counts into the per-category totals.
#--------------------------------------------------------------------------------------------------
while (my $elementLine = <$elementsIn>) {
    $elementLine =~ s/[\r\n]+\z//;

    my ($elementName, @values) = split(/\t/, $elementLine);
    next unless defined $elementName;    # blank line

    # An element that is not a member of any category must not be counted at all.
    # (Indexing the result table with an undefined row would silently hit row 0.)
    my $categoryName = $categoryMembersHash{$elementName};
    next unless defined $categoryName;

    my $categoryIndex = $categoryIndexOf{$categoryName};

    foreach my $criteriaNumber (0 .. $totalCriteriaNumber - 1) {
        my $value = $values[$criteriaNumber];

        # split() drops trailing empty fields, so short rows yield undefined cells;
        # treat missing and empty cells like zero.
        next unless defined $value && $value ne '' && $value > 0;

        $categoryCountersTwoDimArray[$categoryIndex][$criteriaNumber + 1] += $value;
    }
}

#--------------------------------------------------------------------------------------------------
# Write the result table.
#--------------------------------------------------------------------------------------------------
# header line: an empty corner cell followed by the criteria names
print {$out} join("\t", "", @criteriaArray), "\n";

# one line per category: its name followed by one total per criterion
foreach my $categoryRow (@categoryCountersTwoDimArray) {
    print {$out} join("\t", @{$categoryRow}), "\n";
}

# close the input and output files; buffered write errors only surface at close
close($out) or die("Could not close file $categorized_data_outputFile: $!\n");
close($elementsIn);
close($categoriesIn);