| 3 | 1 #!/usr/bin/perl | 
|  | 2 | 
|  | 3 ####### | 
|  | 4 # POD # | 
|  | 5 ####### | 
|  | 6 | 
|  | 7 =pod | 
|  | 8 | 
|  | 9 =head1 NAME | 
|  | 10 | 
|  | 11 C<binary_group_stats.pl> - categorize binary matrix rows according to | 
|  | 12 column groups | 
|  | 13 | 
|  | 14 =head1 SYNOPSIS | 
|  | 15 | 
|  | 16 C<perl binary_group_stats.pl -i binary_matrix.tsv -g group_file.tsv | 
|  | 17 -p E<gt> overall_stats.tsv> | 
|  | 18 | 
|  | 19 =head1 DESCRIPTION | 
|  | 20 | 
|  | 21 Categorize rows of a delimited TEXT input B<binary> matrix (option | 
|  | 22 B<-i>) according to column group affiliations. All fields of the binary | 
|  | 23 matrix need to be filled with either a B<0> indicating absence or a | 
|  | 24 B<1> indicating presence, i.e. all rows need to have the same number of | 
|  | 25 columns. Use option B<-d> to set the delimiter of the input matrix, | 
|  | 26 default is set to tab-delimited/separated matrices. | 
|  | 27 | 
|  | 28 The group affiliations of the columns are intended to get overall | 
|  | 29 presence/absence statistics for groups of columns and not simply | 
|  | 30 single columns of the matrix. Percentage inclusion (option | 
|  | 31 B<-cut_i>) and exclusion (option B<-cut_e>) cutoffs can be set to | 
|  | 32 define how strict the presence/absence of column groups within a row | 
|  | 33 are defined. Of course groups can also hold only single column | 
|  | 34 headers to get single column statistics. Group affiliations are | 
|  | 35 defined in a mandatory B<tab-delimited> group input file (option | 
|  | 36 B<-g>), including the column headers from the input binary matrix, | 
|  | 37 with B<minimal two> and B<maximal four> groups. | 
|  | 38 | 
|  | 39 The script was designed to handle a presence/absence query protein | 
|  | 40 binary matrix from a C<prot_finder.pl> protein homolog search in | 
|  | 41 bacterial genomes with a subsequent C<prot_binary_matrix.pl> run to | 
|  | 42 create the matrix. But, any binary matrix can be used for | 
|  | 43 C<binary_group_stats.pl>, optionally beforehand transposed with | 
|  | 44 C<transpose_matrix.pl>. However, column headers in the first row and | 
|  | 45 row headers in the first column are B<mandatory> for the input | 
|  | 46 binary matrix. Only alphanumeric (a-z, A-Z, 0-9), underscore (_), | 
|  | 47 dash (-), and period (.) characters are allowed for the B<column | 
|  | 48 headers> and B<group names> in the group file (option B<-g>) to | 
|  | 49 avoid downstream problems with the operating/file system. As a | 
|  | 50 consequence, also no whitespaces are allowed in these! Additionally, | 
|  | 51 B<column headers>, B<row headers>, and B<group names> need to be | 
|  | 52 B<unique>. | 
|  | 53 | 
|  | 54 <binary_group_stats.pl> is based upon | 
|  | 55 L<C<po2group_stats.pl>|/po2group_stats>, which does the same thing | 
|  | 56 for genomes in an ortholog/paralog output matrix from a | 
|  | 57 L<B<Proteinortho5>|http://www.bioinf.uni-leipzig.de/Software/proteinortho/> | 
|  | 58 calculation. | 
|  | 59 | 
|  | 60 To explain the logic behind the categorization, the following | 
|  | 61 annotation for example groups will be used. A following '1' | 
|  | 62 exemplifies a group column presence count in a respective row E<gt>= | 
|  | 63 the rounded inclusion cutoff, a '0' a group column presence count | 
|  | 64 E<lt>= the rounded exclusion cutoff. The presence and absence of | 
|  | 65 rows for the column group affiliations are structured in different | 
|  | 66 categories depending on the number of groups. For B<two groups> | 
|  | 67 (e.g. A and B) there are five categories: 'A specific' (A:B = 1:0), | 
|  | 68 'B specific' (0:1), 'cutoff core' (1:1), 'underrepresented' (0:0), | 
|  | 69 and 'unspecific'. Unspecific rows have a column presence count for | 
|  | 70 at least B<one> group outside the cutoffs (exclusion cutoff E<lt> | 
|  | 71 column presence count E<lt> inclusion cutoff) and thus cannot be | 
|  | 72 categorized. The respective row headers of these 'unspecific' rows | 
|  | 73 will only be printed to a final result file with option B<-u>. | 
|  | 74 Overall stats for all categories are printed to C<STDOUT> in a final | 
|  | 75 tab-delimited output matrix. | 
|  | 76 | 
|  | 77 B<Three groups> (A, B, and C) have the following nine categories: 'A | 
|  | 78 specific' (A:B:C = 1:0:0), 'B specific' (0:1:0), 'C specific' | 
|  | 79 (0:0:1), 'A absent' (0:1:1), 'B absent' (1:0:1), 'C absent' (1:1:0), | 
|  | 80 'cutoff core' (1:1:1), 'underrepresented' (0:0:0), and 'unspecific'. | 
|  | 81 | 
|  | 82 B<Four groups> (A, B, C, and D) are classified in 17 categories: 'A | 
|  | 83 specific' (A:B:C:D = 1:0:0:0), 'B specific' (0:1:0:0), 'C specific' | 
|  | 84 (0:0:1:0), 'D specific' (0:0:0:1), 'A-B specific' (1:1:0:0), 'A-C | 
|  | 85 specific' (1:0:1:0), 'A-D specific' (1:0:0:1), 'B-C specific' | 
|  | 86 (0:1:1:0), 'B-D specific' (0:1:0:1), 'C-D specific' (0:0:1:1), 'A | 
|  | 87 absent' (0:1:1:1), 'B absent' (1:0:1:1), 'C absent' (1:1:0:1), 'D | 
|  | 88 absent' (1:1:1:0), 'cutoff core' (1:1:1:1), 'underrepresented' | 
|  | 89 (0:0:0:0), and 'unspecific'. | 
|  | 90 | 
|  | 91 The resulting B<group> presence/absence (according to the cutoffs) | 
|  | 92 can also be printed to a group binary matrix (option B<-b>) in the | 
|  | 93 result directory (option B<-r>), excluding the 'unspecific' | 
|  | 94 category. Since the categories are the logics underlying venn | 
|  | 95 diagrams, you can also plot the results in a venn diagram using the | 
|  | 96 group binary matrix (option B<-p>). The 'underrepresented' category | 
|  | 97 is exempt from the venn diagram, because it is outside of venn | 
|  | 98 diagram logics. | 
|  | 99 | 
|  | 100 For an illustration of the logics have a look at the example venn | 
|  | 101 diagrams in the L<C<po2group_stats.pl>|/po2group_stats> folder | 
|  | 102 F</po2group_stats/pics/venn_diagram_logics.[svg|png]>. | 
|  | 103 | 
|  | 104 There are two optional categories (which are only considered for the | 
|  | 105 final print outs and in the final stats matrix, not for the group | 
|  | 106 binary matrix and the venn diagram): 'strict core' (option B<-co>) | 
|  | 107 for rows where B<all> columns have a presence '1'. Of course all the | 
|  | 108 'strict core' rows are also included in the 'cutoff core' category | 
|  | 109 ('strict core' is identical to 'cutoff core' with B<-cut_i> 1 and | 
|  | 110 B<-cut_e> 0). Option B<-s> activates the detection of | 
|  | 111 'singleton/column-specific' rows with a '1' in only B<one> column. | 
|  | 112 Depending on the cutoffs and number of columns in the groups, | 
|  | 113 category 'underrepresented' includes most of these singletons. | 
|  | 114 | 
|  | 115 Each row header is printed in the respective category output file in | 
|  | 116 the result directory. The number of row headers in the category | 
|  | 117 result files are the same as listed in the venn diagram and the | 
|  | 118 final stats matrix. Groups with only a single column header will | 
|  | 119 have the same result as the respective 'singleton' category (with | 
|  | 120 option B<-s>). | 
|  | 121 | 
|  | 122 =head1 OPTIONS | 
|  | 123 | 
|  | 124 =head2 Mandatory options | 
|  | 125 | 
|  | 126 =over 20 | 
|  | 127 | 
|  | 128 =item B<-i>=I<str>, B<-input>=I<str> | 
|  | 129 | 
|  | 130 Input delimited TEXT binary matrix (e.g. *.tsv, *.csv, or *.txt), | 
|  | 131 see option B<-d> | 
|  | 132 | 
|  | 133 =item B<-g>=I<str>, B<-groups_file>=I<str> | 
|  | 134 | 
|  | 135 Tab-delimited file with group affiliation for the columns from the | 
|  | 136 input binary matrix with B<minimal two> and B<maximal four> groups | 
|  | 137 (easiest to create in a spreadsheet software and save in | 
|  | 138 tab-separated format). B<All> column headers from the input binary | 
|  | 139 matrix need to be included. Column headers and group names can only | 
|  | 140 include alphanumeric (a-z, A-Z, 0-9), underscore (_), dash (-), and | 
|  | 141 period (.) characters (no whitespaces allowed either). Example | 
|  | 142 format with two column headers in group A, three in group B and D, | 
|  | 143 and one in group C: | 
|  | 144 | 
|  | 145 group_A\tgroup_B\tgroup_C\tgroup_D | 
|  | 146 column_header1\tcolumn_header9\tcolumn_header3\tcolumn_header8 | 
|  | 147 column_header7\tcolumn_header6\t\tcolumn_header5 | 
|  | 148 \tcolumn_header4\t\tcolumn_header2 | 
|  | 149 | 
|  | 150 =back | 
|  | 151 | 
|  | 152 =head2 Optional options | 
|  | 153 | 
|  | 154 =over 20 | 
|  | 155 | 
|  | 156 =item B<-h>, B<-help> | 
|  | 157 | 
|  | 158 Help (perldoc POD) | 
|  | 159 | 
|  | 160 =item B<-d>=I<str>, B<-delimiter>=I<str> | 
|  | 161 | 
|  | 162 Set delimiter of input binary matrix (e.g. comma ',', single | 
|  | 163 space ' ' etc.) [default = tab-delimited/separated] | 
|  | 164 | 
|  | 165 =item B<-r>=I<str>, B<-result_dir>=I<str> | 
|  | 166 | 
|  | 167 Path to result folder [default = inclusion and exclusion percentage | 
|  | 168 cutoff, './results_i#_e#'] | 
|  | 169 | 
|  | 170 =item B<-cut_i>=I<float>, B<-cut_inclusion>=I<float> | 
|  | 171 | 
|  | 172 Percentage inclusion cutoff for column presence counts in a group | 
|  | 173 per row, has to be E<gt> 0 and E<lt>= 1. Cutoff will be rounded | 
|  | 174 according to the column header number in each group and has to be | 
|  | 175 E<gt> the rounded exclusion cutoff in this group. [default = 0.9] | 
|  | 176 | 
|  | 177 =item B<-cut_e>=I<float>, B<-cut_exclusion>=I<float> | 
|  | 178 | 
|  | 179 Percentage exclusion cutoff, has to be E<gt>= 0 and E<lt> 1. Rounded | 
|  | 180 cutoff has to be E<lt> rounded inclusion cutoff. [default = 0.1] | 
|  | 181 | 
|  | 182 =item B<-b>, B<-binary_group_matrix> | 
|  | 183 | 
|  | 184 Print a group binary matrix with the presence/absence column group | 
|  | 185 results according to the cutoffs (excluding 'unspecific' category | 
|  | 186 rows) | 
|  | 187 | 
|  | 188 =item B<-p>, B<-plot_venn> | 
|  | 189 | 
|  | 190 Plot venn diagram from the group binary matrix (except 'unspecific' | 
|  | 191 and 'underrepresented' categories) with function C<venn> from B<R> | 
|  | 192 package B<gplots>, requires option B<-b> | 
|  | 193 | 
|  | 194 =item B<-co>, B<-core_strict> | 
|  | 195 | 
|  | 196 Include 'strict core' category in output for rows where B<all> | 
|  | 197 columns have a '1' | 
|  | 198 | 
|  | 199 =item B<-s>, B<-singletons> | 
|  | 200 | 
|  | 201 Include singleton/column-specific rows for each column header in the | 
|  | 202 output, activates also overall column header presence ('1') counts | 
|  | 203 in final stats matrix for columns with singletons | 
|  | 204 | 
|  | 205 =item B<-u>, B<-unspecific> | 
|  | 206 | 
|  | 207 Include 'unspecific' category in output | 
|  | 208 | 
|  | 209 =item B<-a>, B<-all_column_presence_overall> | 
|  | 210 | 
|  | 211 Report overall presence counts for all column headers (appended to | 
|  | 212 the final stats matrix), also those without singletons; will include | 
|  | 213 all overall column header presence counts without option B<-s> | 
|  | 214 | 
|  | 215 =item B<-v>, B<-version> | 
|  | 216 | 
|  | 217 Print version number to C<STDERR> | 
|  | 218 | 
|  | 219 =back | 
|  | 220 | 
|  | 221 =head1 OUTPUT | 
|  | 222 | 
|  | 223 =over 20 | 
|  | 224 | 
|  | 225 =item C<STDOUT> | 
|  | 226 | 
|  | 227 The tab-delimited final stats matrix is printed to C<STDOUT>. | 
|  | 228 Redirect or pipe into another tool as needed. | 
|  | 229 | 
|  | 230 =item F<./results_i#_e#> | 
|  | 231 | 
|  | 232 All output files are stored in a results folder | 
|  | 233 | 
|  | 234 =item F<./results_i#_e#/[*_specific|*_absent|cutoff_core|underrepresented]_rows.txt> | 
|  | 235 | 
|  | 236 Files including the row headers for rows in non-optional categories | 
|  | 237 | 
|  | 238 =item (F<./results_i#_e#/[*_singletons|strict_core|unspecific]_rows.txt>) | 
|  | 239 | 
|  | 240 Optional category output files with the respective row headers | 
|  | 241 | 
|  | 242 =item (F<./results_i#_e#/binary_matrix.tsv>) | 
|  | 243 | 
|  | 244 Tab-delimited binary matrix of group presence/absence results | 
|  | 245 according to cutoffs (excluding 'unspecific' rows) | 
|  | 246 | 
|  | 247 =item (F<./results_i#_e#/venn_diagram.pdf>) | 
|  | 248 | 
|  | 249 Venn diagram for non-optional categories (except 'unspecific' and | 
|  | 250 'underrepresented' categories) | 
|  | 251 | 
|  | 252 =back | 
|  | 253 | 
|  | 254 =head1 EXAMPLES | 
|  | 255 | 
|  | 256 =over | 
|  | 257 | 
|  | 258 =item C<perl binary_group_stats.pl -i binary_matrix_transposed.csv -g group_file.tsv -d , -r result_dir -cut_i 0.7 -cut_e 0.2 -b -p -co -s -u -a E<gt> overall_stats.tsv> | 
|  | 259 | 
|  | 260 =back | 
|  | 261 | 
|  | 262 =head1 DEPENDENCIES | 
|  | 263 | 
|  | 264 =over | 
|  | 265 | 
|  | 266 =item B<Statistical computing language L<R|http://www.r-project.org/>> | 
|  | 267 | 
|  | 268 C<Rscript> is needed to plot the venn diagram with option B<-p>, | 
|  | 269 tested with version 3.2.2 | 
|  | 270 | 
|  | 271 =item B<L<gplots|https://cran.r-project.org/web/packages/gplots/index.html>> | 
|  | 272 | 
|  | 273 Package needed for B<R> to plot the venn diagram with function | 
|  | 274 C<venn>. Tested with B<gplots> version 2.17.0. | 
|  | 275 | 
|  | 276 =back | 
|  | 277 | 
|  | 278 =head1 VERSION | 
|  | 279 | 
|  | 280  0.1                                                       06-06-2016 | 
|  | 281 | 
|  | 282 =head1 AUTHOR | 
|  | 283 | 
|  | 284  Andreas Leimbach                               aleimba[at]gmx[dot]de | 
|  | 285 | 
|  | 286 =head1 LICENSE | 
|  | 287 | 
|  | 288 This program is free software: you can redistribute it and/or modify | 
|  | 289 it under the terms of the GNU General Public License as published by | 
|  | 290 the Free Software Foundation; either version 3 (GPLv3) of the | 
|  | 291 License, or (at your option) any later version. | 
|  | 292 | 
|  | 293 This program is distributed in the hope that it will be useful, but | 
|  | 294 WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 295 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 
|  | 296 General Public License for more details. | 
|  | 297 | 
|  | 298 You should have received a copy of the GNU General Public License | 
|  | 299 along with this program. If not, see L<http://www.gnu.org/licenses/>. | 
|  | 300 | 
|  | 301 =cut | 
|  | 302 | 
|  | 303 | 
|  | 304 ######## | 
|  | 305 # MAIN # | 
|  | 306 ######## | 
|  | 307 | 
|  | 308 use strict; | 
|  | 309 use warnings; | 
|  | 310 use autodie; | 
|  | 311 use Getopt::Long; | 
|  | 312 use Pod::Usage; | 
|  | 313 | 
|  | 314 my $Cmdline = "$0 @ARGV"; # used call command | 
|  | 315 | 
|  | 316 ### Get the options with Getopt::Long | 
|  | 317 my $Binary_Matrix_File; # binary input matrix | 
|  | 318 my $Groups_File; # tab-separated file with group affiliation for the input binary matrix column headers | 
|  | 319 my $Delimiter = "\t"; # set delimiter/separator of input matrix | 
|  | 320 my $Result_Dir; # path to results folder; default is set below to '"./results_i".$Inclusion_Cutoff."c".$Exclusion_Cutoff' | 
|  | 321 my $Inclusion_Cutoff = 0.9; # inclusion percentage cutoff for column presence ('1') count per row (floating point number >0 && <=1) | 
|  | 322 my $Exclusion_Cutoff = 0.1; # exclusion percentage cutoff (floating point number >=0 && <1) | 
|  | 323 my $Opt_Group_Binary; # print resulting group binary matrix according to the group inclusion/exclusion results | 
|  | 324 my $Opt_Venn; # create venn diagram with R package 'gplots' function 'venn'; requires $Opt_Group_Binary | 
|  | 325 my $Opt_Strict_Core; # report also strict core, i.e. rows with presence in ALL columns of input binary matrix (identical to 'cutoff_core' with inclusion and exclusion cutoffs of 1 and 0 resp.) | 
|  | 326 my $Opt_Singletons; # report also singletons for each column header (outside of the group classifications) | 
|  | 327 my $Opt_Unspecific; # report also group-unspecific rows ('exclusion < row_group_column-presence_count < inclusion' for any group) | 
|  | 328 my $Opt_Columns_Overall; # report overall row presence counts also for column headers without singletons | 
|  | 329 my $VERSION = 0.1; | 
|  | 330 my ($Opt_Version, $Opt_Help); | 
|  | 331 GetOptions ('input=s' => \$Binary_Matrix_File, | 
|  | 332             'groups_file=s' => \$Groups_File, | 
|  | 333             'delimiter=s' => \$Delimiter, | 
|  | 334             'result_dir=s' => \$Result_Dir, | 
|  | 335             'cut_inclusion=f' => \$Inclusion_Cutoff, | 
|  | 336             'cut_exclusion=f' => \$Exclusion_Cutoff, | 
|  | 337             'binary_group_matrix' => \$Opt_Group_Binary, | 
|  | 338             'plot_venn' => \$Opt_Venn, | 
|  | 339             'core_strict' => \$Opt_Strict_Core, | 
|  | 340             'singletons' => \$Opt_Singletons, | 
|  | 341             'unspecific' => \$Opt_Unspecific, | 
|  | 342             'all_column_presence_overall' => \$Opt_Columns_Overall, | 
|  | 343             'version' => \$Opt_Version, | 
|  | 344             'help|?' => \$Opt_Help) | 
|  | 345             or pod2usage(-verbose => 1, -exitval => 2); | 
|  | 346 | 
|  | 347 | 
|  | 348 | 
|  | 349 ### Run perldoc on POD and enforce mandatory options | 
|  | 350 pod2usage(-verbose => 2) if ($Opt_Help); | 
|  | 351 die "$0 $VERSION\n" if ($Opt_Version); | 
|  | 352 | 
|  | 353 if (!$Binary_Matrix_File || !$Groups_File) { | 
|  | 354     my $warning = "\n### Fatal error: Mandatory options '-i' or '-g' or their arguments are missing!\n"; | 
|  | 355     pod2usage(-verbose => 1, -message => $warning, -exitval => 2); | 
|  | 356 } | 
|  | 357 | 
|  | 358 if (($Inclusion_Cutoff <= 0 || $Inclusion_Cutoff > 1) || ($Exclusion_Cutoff < 0 || $Exclusion_Cutoff >= 1)) { | 
|  | 359     my $warning = "\n### Fatal error:\nInclusion (0 < '-cut_i' <= 1) or exclusion (0 <= '-cut_e' < 1) cutoff(s) not chosen correctly!\n"; | 
|  | 360     pod2usage(-verbose => 1, -message => $warning, -exitval => 2); | 
|  | 361 } | 
|  | 362 | 
|  | 363 print STDERR "Script call command: $Cmdline\n"; # print call command after '-h|-v' | 
|  | 364 | 
|  | 365 if ($Opt_Venn && !$Opt_Group_Binary) { | 
|  | 366     warn "Option '-p' to draw the venn diagram set, but not it's required option '-b' for the output group binary matrix. Forcing option '-b'!\n"; | 
|  | 367     $Opt_Group_Binary = 1; | 
|  | 368 } | 
|  | 369 | 
|  | 370 | 
|  | 371 | 
|  | 372 ### Parse column headers (from the input binary matrix) in groups file | 
|  | 373 print STDERR "Parsing group file '$Groups_File' "; # run status of script | 
|  | 374 open (my $Groups_File_Fh, "<", "$Groups_File"); | 
|  | 375 | 
|  | 376 my @Group_Names; # store group name column order | 
|  | 377 my %Column_Header_Groups; # hash of hash with group name as key, internal hash with resp. input binary matrix column header anonymous array (key 'column_headers'), or calculated integer inclusion/exclusion cutoffs (key 'inclusion' etc.) | 
|  | 378 while (<$Groups_File_Fh>) { | 
|  | 379     chomp; | 
|  | 380     next if (/^\s*$/); # skip empty lines | 
|  | 381 | 
|  | 382     # check groups file header and get group names | 
|  | 383     if ($. == 1) { # header of groups file (first line) | 
|  | 384         die "\n### Fatal error:\nGroups file '$Groups_File' does not have a correctly formatted header line with at least TWO and maximal FOUR tab-separated group name columns (without other whitespaces)!\n" if (!/^\S+\t\S+\t?\S*\t?\S*$/); | 
|  | 385         foreach my $group_name (split(/\t/)) { | 
|  | 386             check_file_system_conformity($group_name, 'group names'); # subroutine to check string only contains alphanumeric '0-9a-zA-Z', underscore '_', dash '-', and period '.' characters | 
|  | 387             die "\n### Fatal error:\nGroup name '$group_name' is not unique in the group file '$Groups_File'! Please choose unique names for the groups in the group file!\n" if (grep {/$group_name/} @Group_Names); | 
|  | 388             push (@Group_Names, $group_name); # store groups array | 
|  | 389         } | 
|  | 390         next; # skip header line for subsequent code | 
|  | 391     } | 
|  | 392 | 
|  | 393     # parse input binary matrix column headers in groups file | 
|  | 394     die "\n### Fatal error:\nLine '$.' in the groups file '$Groups_File' does not have the correct format with maximal four tab-separated column header names from the input binary matrix '$Binary_Matrix_File' according to the groups (without other whitespaces):\n$_\n" if (!/^\S*\t?\S*\t?\S*\t?\S*$/); | 
|  | 395     my $column = -1; # group file column counter to associate input binary matrix column header to correct group (@Group_Names array zero-based, thus start with '-1') | 
|  | 396     foreach my $column_header (split(/\t/)) { | 
|  | 397         $column++; | 
|  | 398         next if ($column_header =~ /^$/); # skip empty columns in group file, in case the groups have uneven column header numbers | 
|  | 399         warn "\n### Warning:\nInput binary matrix column header '$column_header' in the group file has a binary ('0' or '1') instead of an alphanumeric string, sure this is a column header? The matrix needs to have mandatory row and column headers, see help with '-h'!\n" if ($column_header =~ /^(0|1)$/); | 
|  | 400         check_file_system_conformity($column_header, 'column headers'); # subroutine | 
|  | 401         die "\n### Fatal error:\nInput binary matrix column header '$column_header' is not unique in the group file '$Groups_File', but needs to be in the group file and the input binary matrix '$Binary_Matrix_File'! The group file needs to contain all of the column headers from the input binary matrix and vice versa. Make sure the strings correspond exactly (also lower/uppercase) to the header of the matrix!\n\n" if (check_column_header_in_groups($column_header)); # subroutine to check if a column header is present in %Column_Header_Groups | 
|  | 402         push (@{ $Column_Header_Groups{$Group_Names[$column]}->{'column_headers'} }, $column_header); # push into anonymous array in hash of hash | 
|  | 403     } | 
|  | 404 } | 
|  | 405 close $Groups_File_Fh; | 
|  | 406 print STDERR "with ", scalar map(@{ $Column_Header_Groups{$_}->{'column_headers'} }, keys %Column_Header_Groups), " input binary matrix column headers ...\n"; # run status of script | 
|  | 407 | 
|  | 408 foreach (@Group_Names) { | 
|  | 409     die "\n### Fatal error:\nGroup '$_' does not contain an element, please add a suitable input binary matrix column header or remove the group from the group file '$Groups_File'!\n" if (!$Column_Header_Groups{$_}); | 
|  | 410 } | 
|  | 411 | 
|  | 412 | 
|  | 413 | 
|  | 414 ### Round inclusion and exclusion count cutoffs for each group | 
|  | 415 foreach (sort keys %Column_Header_Groups) { | 
|  | 416     $Column_Header_Groups{$_}->{'inclusion'} = int((scalar @{ $Column_Header_Groups{$_}->{'column_headers'} } * $Inclusion_Cutoff) + 0.5); # round positive number half up to integer ((s)printf rounds half to even), see e.g. https://stackoverflow.com/questions/178539/how-do-you-round-a-floating-point-number-in-perl | 
|  | 417     print STDERR "Group '$_' with ", scalar @{ $Column_Header_Groups{$_}->{'column_headers'} }, " input binary matrix column headers has a rounded inclusion cutoff of $Column_Header_Groups{$_}->{'inclusion'} column header(s) and "; | 
|  | 418 | 
|  | 419     $Column_Header_Groups{$_}->{'exclusion'} = int((scalar @{ $Column_Header_Groups{$_}->{'column_headers'} } * $Exclusion_Cutoff) + 0.5); | 
|  | 420     print STDERR "an exclusion cutoff of $Column_Header_Groups{$_}->{'exclusion'} column header(s)!\n"; | 
|  | 421     die "\n### Fatal error:\nThe rounded inclusion cutoff has to be greater than the exclusion cutoff for all groups, please choose appropriate cutoffs!\n" if ($Column_Header_Groups{$_}->{'inclusion'} <= $Column_Header_Groups{$_}->{'exclusion'}); | 
|  | 422 } | 
|  | 423 | 
|  | 424 | 
|  | 425 | 
|  | 426 ### Create result folder | 
|  | 427 if (!$Result_Dir) { # can't give default before 'GetOptions' in case cutoffs are set by the user | 
|  | 428     $Result_Dir = "./results_i".$Inclusion_Cutoff."_e".$Exclusion_Cutoff; | 
|  | 429 } else { | 
|  | 430     $Result_Dir =~ s/\/$//; # get rid of a potential '/' at the end of $Result_Dir path | 
|  | 431 } | 
|  | 432 if (-e $Result_Dir) { | 
|  | 433     empty_dir($Result_Dir); # subroutine to empty a directory with user interaction | 
|  | 434 } else { | 
|  | 435     mkdir $Result_Dir; | 
|  | 436 } | 
|  | 437 | 
|  | 438 | 
|  | 439 | 
|  | 440 ### Parse rows in input binary matrix | 
|  | 441 print STDERR "Parsing input binary matrix '$Binary_Matrix_File' ...\n"; # run status of script | 
|  | 442 open (my $Binary_Matrix_Fh, "<", "$Binary_Matrix_File"); | 
|  | 443 | 
|  | 444 my @Column_Headers; # store input binary matrix column header order | 
|  | 445 my %Row_Header_Groups; # hash of hash with row header as key (first column of each row) and internal hash with column header as key and '1' for presence ('0' absence is not stored) | 
|  | 446 while (<$Binary_Matrix_Fh>) { | 
|  | 447     chomp; | 
|  | 448     next if (/^\s*$/); # skip empty lines | 
|  | 449     die "\n### Fatal error:\nSet delimiter/separator '$Delimiter' (option '-d') not found in line '$.' of the input binary matrix '$Binary_Matrix_File', sure the correct one is set?\n" if ($_ !~ /$Delimiter/); | 
|  | 450 | 
|  | 451     # check input binary matrix file header and get column header names | 
|  | 452     if ($. == 1) { # header of input binary matrix file (first line) | 
|  | 453         my (undef, @headers) = split(/$Delimiter/); # get rid of first line row header | 
|  | 454         foreach my $column_header (@headers) { | 
|  | 455             die "\n### Fatal error:\nColumn header '$column_header' is present in the input binary matrix '$Binary_Matrix_File' but not in the groups file '$Groups_File'. However, the group file needs to contain all of the column headers from the matrix and vice versa. Make sure the strings correspond exactly (also lower/uppercase) to the matrix header!\n" unless (check_column_header_in_groups($column_header)); # subroutine to check if a column header is present in %Column_Header_Groups | 
|  | 456             push (@Column_Headers, $column_header); | 
|  | 457         } | 
|  | 458         die "\n### Fatal error:\nNot the same amount of input binary matrix column headers in the group file '$Groups_File' and matrix '$Binary_Matrix_File', but the group file needs to contain all of the column headers in the matrix and vice versa!\n" if (@Column_Headers != map(@{ $Column_Header_Groups{$_}->{'column_headers'} }, keys %Column_Header_Groups)); | 
|  | 459         next; # skip header line of input binary matrix for subsequent code | 
|  | 460     } | 
|  | 461 | 
|  | 462     # parse input binary matrix | 
|  | 463     my ($row_header, @presence_absence) = split(/$Delimiter/); | 
|  | 464     die "\n### Fatal error:\nInput binary matrix '$Binary_Matrix_File' row '$row_header' doesn't have the same amount of columns as overall column headers, however the matrix needs to be filled/consistent in every row!\n" unless (@Column_Headers == @presence_absence); | 
|  | 465     die "\n### Fatal error:\nInput binary matrix row header '$row_header' is not unique in the input binary matrix '$Binary_Matrix_File', however all row headers need to be unique!\n" if ($Row_Header_Groups{$row_header}); | 
|  | 466     warn "\n### Warning:\nLine number '$.' of the binary input matrix '$Binary_Matrix_File' has a binary ('0' or '1') row header instead of an alphanumeric string, sure this is a row header? The matrix needs to have mandatory row and column headers, see help with '-h'!\n\n" if ($row_header =~ /^(0|1)$/); | 
|  | 467     my $column = -1; # column counter to associate presence/absence field to correct column header (@Column_Headers array zero-based, thus start with '-1') | 
|  | 468     foreach (@presence_absence) { | 
|  | 469         $column++; | 
|  | 470         die "\n###Fatal error:\nThere's an empty field in the input binary matrix '$Binary_Matrix_File' in row '$row_header', column '$column+2'. This shouldn't happen, as all the fields of the BINARY matrix should either have a '0' indicating absence or a '1' indicating presence!\n" if (/^$/); | 
|  | 471         die "\n### Fatal error:\nThe input binary matrix '$Binary_Matrix_File' contains a field with '$_' in row/line '$.', column '$column+2', instead of the mandatory binary '0' indicating absence or '1' indicating presence!\n" if (!/^(0|1)$/); | 
|  | 472         next if (!$_); # skip absence columns in binary input matrix (indicated by '0'), have to skip after '$column++' otherwise column won't fit | 
|  | 473         $Row_Header_Groups{$row_header}{$Column_Headers[$column]} = $_; # store presence ('1') in hash of hash | 
|  | 474     } | 
|  | 475 } | 
|  | 476 close $Binary_Matrix_Fh; | 
|  | 477 | 
|  | 478 | 
|  | 479 | 
|  | 480 ### Calculate column stats from input binary matrix according to the group cutoffs, categorize the results and print to result files | 
|  | 481 print STDERR "Calculating column stats from the input binary matrix according to the group cutoffs ...\n"; # run status of script | 
|  | 482 | 
|  | 483 # open result file for group binary matrix and print header | 
|  | 484 my $Group_Binary_Matrix_File; # filename needed also for venn diagram R script below | 
|  | 485 my $Group_Binary_Matrix_Fh; | 
|  | 486 if ($Opt_Group_Binary) { | 
|  | 487     $Group_Binary_Matrix_File = "$Result_Dir/group_binary_matrix.tsv"; | 
|  | 488     open ($Group_Binary_Matrix_Fh, ">", "$Group_Binary_Matrix_File"); | 
|  | 489     print $Group_Binary_Matrix_Fh join("\t", @Group_Names), "\n"; # print header for group binary matrix | 
|  | 490 } | 
|  | 491 | 
|  | 492 my %Count_Group_Pres_Abs; # hash of hash to store presence count and output filehandle for each category ('core', 'group-specific' etc.) | 
|  | 493 foreach my $row_header (sort {lc($a) cmp lc($b)} keys %Row_Header_Groups) { | 
|  | 494     my %present_row_columns; # hash of array to store input binary matrix columns of a group with PRESENCE ('1') in the current row for cutoff tests and print outs | 
|  | 495 | 
|  | 496     # 'strict_core' category row, if ALL columns present in row (with option '-co') | 
|  | 497     if ($Opt_Strict_Core && keys %{$Row_Header_Groups{$row_header}} == @Column_Headers) { | 
|  | 498         $Count_Group_Pres_Abs{'strict_core'}->{'presence_count'}++; | 
|  | 499         open_result_file("$Result_Dir/strict_core_rows.txt", 'strict_core') if (!$Count_Group_Pres_Abs{'strict_core'}->{'fh'}); # subroutine to open result file and store fh (if not opened yet) | 
|  | 500         print { $Count_Group_Pres_Abs{'strict_core'}->{'fh'} } "$row_header\n"; # block around fh needed if not a simple string variable (or glob) | 
|  | 501     } | 
|  | 502 | 
|  | 503     # count and save number of columns with presence ('1') for the current $row_header with their corresponding group affiliation | 
|  | 504     foreach my $group (@Group_Names) { | 
|  | 505         foreach my $column_header (@{ $Column_Header_Groups{$group}->{'column_headers'} }) { | 
|  | 506             if ($Row_Header_Groups{$row_header}->{$column_header}) { # $column_header with presence in current $row_header | 
|  | 507                 push(@{ $present_row_columns{$group} }, $column_header); # push into anonymous array in hash | 
|  | 508 | 
|  | 509                 # 'singleton' category row if only ONE column with presence (with option '-s') | 
|  | 510                 if ($Opt_Singletons && keys %{$Row_Header_Groups{$row_header}} == 1) { | 
|  | 511                     $Count_Group_Pres_Abs{$column_header}->{'presence_count'}++; # here: category = $column_header | 
|  | 512                     open_result_file("$Result_Dir/$column_header\_singletons.txt", $column_header) if (!$Count_Group_Pres_Abs{$column_header}->{'fh'}); # subroutine | 
|  | 513                     print { $Count_Group_Pres_Abs{$column_header}->{'fh'} } "$row_header\n"; | 
|  | 514                 } | 
|  | 515             } | 
|  | 516         } | 
|  | 517     } | 
|  | 518 | 
|  | 519     # filter column presence counts of the groups in this row according to inclusion-/exclusion-cutoffs | 
|  | 520     my @row_included_groups; # array for groups with number of present column in this row >= the inclusion cutoff | 
|  | 521     my @row_excluded_groups; # array for groups with column presence <= the exclusion cutoff | 
|  | 522     my @row_unspec_groups; # array for unspecific groups with 'exclusion < column_presence_count < inclusion' | 
|  | 523     foreach my $group (@Group_Names) { | 
|  | 524         if ($present_row_columns{$group}) { # only run if group has a column presence in this row (otherwise undefined ARRAY reference) | 
|  | 525             if (@{ $present_row_columns{$group} } >= $Column_Header_Groups{$group}->{'inclusion'}) { | 
|  | 526                 push(@row_included_groups, $group); | 
|  | 527             } elsif (@{ $present_row_columns{$group} } <= $Column_Header_Groups{$group}->{'exclusion'}) { | 
|  | 528                 push(@row_excluded_groups, $group); | 
|  | 529             } elsif (@{ $present_row_columns{$group} } > $Column_Header_Groups{$group}->{'exclusion'} && @{ $present_row_columns{$group} } < $Column_Header_Groups{$group}->{'inclusion'}) { | 
|  | 530                 push(@row_unspec_groups, $group); | 
|  | 531             } | 
|  | 532         } else { # $present_row_columns{$group} == 0 for group (undef anonymous array), i.e. always <= exclusion | 
|  | 533             push(@row_excluded_groups, $group); | 
|  | 534         } | 
|  | 535     } | 
|  | 536 | 
|  | 537     # 'unspecific' category row if at least ONE group present in '@row_unspec_groups' | 
|  | 538     # no further categorization, i.e. skip rest of logics | 
|  | 539     if (@row_unspec_groups) { | 
|  | 540         $Count_Group_Pres_Abs{'unspecific'}->{'presence_count'}++; | 
|  | 541         if ($Opt_Unspecific) { # print only to output file with option '-u' | 
|  | 542             open_result_file("$Result_Dir/unspecific_rows.txt", 'unspecific') if (!$Count_Group_Pres_Abs{'unspecific'}->{'fh'}); # subroutine | 
|  | 543             print { $Count_Group_Pres_Abs{'unspecific'}->{'fh'} } "$row_header\n"; | 
|  | 544         } | 
|  | 545         next; # skip to next $row_header, because now unspec and not suitable for any further categories | 
|  | 546     } | 
|  | 547 | 
|  | 548     # print group binary matrix (in contrast to rows in category 'unspecific' above, 'underrepresented' rows below not skipped because package 'venn' can handle a row with only zeros in the group binary matrix) | 
|  | 549     if ($Opt_Group_Binary) { | 
|  | 550         my $i = 0; | 
|  | 551         foreach my $group (@Group_Names) { | 
|  | 552             $i++; | 
|  | 553             print $Group_Binary_Matrix_Fh "1" if (grep {/$group/} @row_included_groups); | 
|  | 554             print $Group_Binary_Matrix_Fh "0" if (grep {/$group/} @row_excluded_groups); | 
|  | 555             print $Group_Binary_Matrix_Fh "\t" if ($i < @Group_Names); | 
|  | 556         } | 
|  | 557         print $Group_Binary_Matrix_Fh "\n"; | 
|  | 558     } | 
|  | 559 | 
|  | 560     # 'cutoff_core' category row if ALL groups >= inclusion (includes 'strict_core' rows where ALL columns are present in the row) | 
|  | 561     # 2 groups A/B [2 inclus :0 exclus], 3 groups A/B/C [3:0], 4 groups A/B/C/D [4:0] | 
|  | 562     if (@row_included_groups == @Group_Names && @row_excluded_groups == 0) { | 
|  | 563         $Count_Group_Pres_Abs{'cutoff_core'}->{'presence_count'}++; | 
|  | 564         open_result_file("$Result_Dir/cutoff_core_rows.txt", 'cutoff_core') if (!$Count_Group_Pres_Abs{'cutoff_core'}->{'fh'}); # subroutine | 
|  | 565         print { $Count_Group_Pres_Abs{'cutoff_core'}->{'fh'} } "$row_header\n"; | 
|  | 566 | 
|  | 567     # 'underrepresented' category row if ALL groups <= exclusion_cutoff (depending on the cutoffs and number of columns in the groups will include most singletons) | 
|  | 568     # 2 groups [0:2], 3 [0:3], 4 [0:4] | 
|  | 569     } elsif (@row_included_groups == 0 && @row_excluded_groups == @Group_Names) { | 
|  | 570         $Count_Group_Pres_Abs{'underrepresented'}->{'presence_count'}++; | 
|  | 571         open_result_file("$Result_Dir/underrepresented_rows.txt", 'underrepresented') if (!$Count_Group_Pres_Abs{'underrepresented'}->{'fh'}); # subroutine | 
|  | 572         print { $Count_Group_Pres_Abs{'underrepresented'}->{'fh'} } "$row_header\n"; | 
|  | 573 | 
|  | 574     # 'group-specific' category row if only ONE group >= inclusion_cutoff | 
|  | 575     # 2 groups [1:1], 3 [1:2], 4 [1:3] | 
|  | 576     } elsif (@row_included_groups == 1 && @row_excluded_groups == @Group_Names - 1) { | 
|  | 577         $Count_Group_Pres_Abs{"$row_included_groups[0]_specific"}->{'presence_count'}++; # only one element in array thus [0] | 
|  | 578         open_result_file("$Result_Dir/$row_included_groups[0]_specific_rows.txt", "$row_included_groups[0]_specific") if (!$Count_Group_Pres_Abs{"$row_included_groups[0]_specific"}->{'fh'}); # subroutine | 
|  | 579         print { $Count_Group_Pres_Abs{"$row_included_groups[0]_specific"}->{'fh'} } "$row_header\n"; | 
|  | 580 | 
|  | 581     # 'group-absent' category row if ONE group <= exclusion_cutoff (only for 3 and 4 groups) | 
|  | 582     # 3 groups (A+B not C [2:1], A+C not B, and B+C not A) and 4 groups (A+B+C not D [3:1], A+B+D not C, A+C+D not B, B+C+D not A) | 
|  | 583     } elsif (@Group_Names >= 3 && (@row_included_groups == @Group_Names - 1 && @row_excluded_groups == 1)) { | 
|  | 584         $Count_Group_Pres_Abs{"$row_excluded_groups[0]_absent"}->{'presence_count'}++; | 
|  | 585         open_result_file("$Result_Dir/$row_excluded_groups[0]_absent_rows.txt", "$row_excluded_groups[0]_absent") if (!$Count_Group_Pres_Abs{"$row_excluded_groups[0]_absent"}->{'fh'}); # subroutine | 
|  | 586         print { $Count_Group_Pres_Abs{"$row_excluded_groups[0]_absent"}->{'fh'} } "$row_header\n"; | 
|  | 587 | 
|  | 588     # 'two group-specific' category row if TWO groups >= inclusion_cutoff (only for 4 groups) | 
|  | 589     # 4 groups (B+D not A or C [2:2], B+C not A or D, A+D not B or C, A+C not B or D, A+B not C or D, C+D not A or B) | 
|  | 590     } elsif (@Group_Names == 4 && (@row_included_groups == @row_excluded_groups)) { | 
|  | 591         $Count_Group_Pres_Abs{"$row_included_groups[0]-$row_included_groups[1]_specific"}->{'presence_count'}++; | 
|  | 592         open_result_file("$Result_Dir/$row_included_groups[0]-$row_included_groups[1]_specific_rows.txt", "$row_included_groups[0]-$row_included_groups[1]_specific") if (!$Count_Group_Pres_Abs{"$row_included_groups[0]-$row_included_groups[1]_specific"}->{'fh'}); # subroutine | 
|  | 593         print { $Count_Group_Pres_Abs{"$row_included_groups[0]-$row_included_groups[1]_specific"}->{'fh'} } "$row_header\n"; | 
|  | 594 | 
|  | 595     } else { | 
|  | 596         die "\n### Fatal error:\nNo category logic test fits for row '$row_header', which shouldn't happen. Have you smuggled only one or five groups into the groups file '$Groups_File' (which also shouldn't be possible)? The script can handle only two, three, or four groups please edit the input!\n"; # shouldn't happen, since regex in group file parse checks for minimally two and maximally four tab-separated group names and all category logics should be covered | 
|  | 597     } | 
|  | 598 } | 
|  | 599 | 
|  | 600 | 
|  | 601 | 
|  | 602 ### Close all output files | 
|  | 603 close $Group_Binary_Matrix_Fh if ($Opt_Group_Binary); | 
|  | 604 map { close $Count_Group_Pres_Abs{$_}->{'fh'} } keys %Count_Group_Pres_Abs; | 
|  | 605 | 
|  | 606 | 
|  | 607 | 
|  | 608 ### Plot the venn diagram with R package 'gplots' function 'venn' | 
|  | 609 if ($Opt_Venn) { | 
|  | 610     my $tmp_r_script = "$Result_Dir/tmp_venn.R"; # filename of the R script | 
|  | 611     my $venn_diagram_name = "$Result_Dir/venn_diagram.pdf"; # filename of the output PDF histogram | 
|  | 612     print STDERR "Drawing venn diagram (option '-p') '$venn_diagram_name' ...\n"; # run status of script | 
|  | 613     open (my $r_fh, ">", "$tmp_r_script"); | 
|  | 614 | 
|  | 615     select $r_fh; # select fh for standard print/f output | 
|  | 616     print "#!/usr/bin/Rscript --vanilla --slave\n"; # header of R script | 
|  | 617     print "suppressMessages(library(gplots))\n"; # load library 'gplots' for function 'venn' and suppress STDOUT message from loading | 
|  | 618     print "group_binary_matrix <- read.table(\"$Group_Binary_Matrix_File\", header=TRUE, sep=\"\\t\")\n"; | 
|  | 619     print "pdf(\"$venn_diagram_name\")\n"; | 
|  | 620     print "venn(group_binary_matrix)\n"; # create the venn diagram | 
|  | 621     print "out <- dev.off()\n"; # write histogram to PDF and suppress STDOUT output by diverting it | 
|  | 622     select STDOUT; # change standard print/f output back to STDOUT | 
|  | 623     close $r_fh; | 
|  | 624 | 
|  | 625     # execute tmp R script with 'Rscript' | 
|  | 626     system("Rscript $tmp_r_script") == 0 or die "### Fatal error:\nPlotting the venn diagram didn't work. Either statistical programming language 'R' or the required R package 'gplots' is not installed, not in global \$PATH, or something wrong with the group binary matrix '$Group_Binary_Matrix_File' or the temporary R script '$tmp_r_script', which runs the drawing of the venn diagram. Function 'venn' of the 'gplots' package is required and needs to be installed!\n"; | 
|  | 627     unlink $tmp_r_script; | 
|  | 628 } | 
|  | 629 | 
|  | 630 | 
|  | 631 | 
|  | 632 ### Count overall category rows, and plausibility check for above logics | 
|  | 633 my $Total_Category_Rows = 0; | 
|  | 634 foreach my $category (keys %Count_Group_Pres_Abs) { # can't use a map in case of optionally strict_core, singletons categories (which overlap non-optional categories) | 
|  | 635     next if ($category =~ /strict_core/ || grep {$category eq $_} @Column_Headers); # skip optional 'strict_core' (option '-co') or singleton ('-s') categories | 
|  | 636 | 
|  | 637     $Total_Category_Rows += $Count_Group_Pres_Abs{$category}->{'presence_count'}; | 
|  | 638 } | 
|  | 639 die "\n### Fatal error:\nResulting categories don't include all present rows, this shouldn't happen!\n" if (keys %Row_Header_Groups != $Total_Category_Rows); # is this really always the case? | 
|  | 640 | 
|  | 641 | 
|  | 642 | 
|  | 643 ### Print final output stats matrix | 
|  | 644 print STDERR "Finished printing output files to result directory '$Result_Dir', printing final stats matrix to STDOUT ...\n"; # run status of script | 
|  | 645 print "# category\tpresence_count\n"; # header for matrix | 
|  | 646 print "total_rows\t", scalar keys %Row_Header_Groups, "\n"; | 
|  | 647 foreach my $category (sort {lc($a) cmp lc($b)} keys %Count_Group_Pres_Abs) { | 
|  | 648     print "$category"; | 
|  | 649     if (grep {$category eq $_} @Column_Headers) { # print overall presence counts for column headers with singletons (option '-s') | 
|  | 650         print " overall_presence\t", scalar grep($Row_Header_Groups{$_}->{$category} , keys %Row_Header_Groups), "\n"; | 
|  | 651         print "$category singletons"; # include category "key word" singleton for category stats below | 
|  | 652     } | 
|  | 653     print "\t$Count_Group_Pres_Abs{$category}->{'presence_count'}\n"; | 
|  | 654 } | 
|  | 655 | 
|  | 656 # print out overall presence counts for all columns even without singletons (option '-a') | 
|  | 657 if ($Opt_Columns_Overall) { | 
|  | 658     foreach my $column_header (sort {lc($a) cmp lc($b)} @Column_Headers) { | 
|  | 659         next if (grep {$column_header eq $_} keys %Count_Group_Pres_Abs); # skip column headers with singletons (option '-s'), already printed above | 
|  | 660         print "$column_header overall_presence\t", scalar grep($Row_Header_Groups{$_}->{$column_header} , keys %Row_Header_Groups), "\n"; | 
|  | 661     } | 
|  | 662 } | 
|  | 663 | 
|  | 664 exit; | 
|  | 665 | 
|  | 666 | 
|  | 667 ############# | 
|  | 668 #Subroutines# | 
|  | 669 ############# | 
|  | 670 | 
|  | 671 ### Subroutine to check if a column header is present in hash %Column_Header_Groups (hash filled from the $Groups_File) | 
|  | 672 sub check_column_header_in_groups { | 
|  | 673     my $column_header = shift; | 
|  | 674     foreach my $group (keys %Column_Header_Groups) { | 
|  | 675         return 1 if (grep {$_ eq $column_header} @{ $Column_Header_Groups{$group}->{'column_headers'} }); | 
|  | 676     } | 
|  | 677     return; # return 'undef' | 
|  | 678 } | 
|  | 679 | 
|  | 680 | 
|  | 681 | 
|  | 682 ### Subroutine to check if a string adheres to file system conformity, esp. for filenames | 
|  | 683 sub check_file_system_conformity { | 
|  | 684     my ($string, $type) = @_; | 
|  | 685     die "\n\n### Fatal error:\nOnly alphanumeric '0-9a-zA-Z', underscore '_', dash '-', and period '.' characters allowed for $type to avoid operating/file system problems. Thus, please adapt '$string' in the group file '$Groups_File' and if necessary also in the input binary matrix '$Binary_Matrix_File' accordingly!\n" if ($string !~ /^[\w_\-\.]+$/); | 
|  | 686     return 1; | 
|  | 687 } | 
|  | 688 | 
|  | 689 | 
|  | 690 | 
|  | 691 ### Subroutine to empty a directory with user interaction | 
|  | 692 sub empty_dir { | 
|  | 693     my $dir = shift; | 
|  | 694     print STDERR "\nDirectory '$dir' already exists! You can use either option '-r' to set a different output result directory name, or do you want to replace the directory and all its contents [y|n]? "; | 
|  | 695     my $user_ask = <STDIN>; | 
|  | 696     if ($user_ask =~ /y/i) { | 
|  | 697         unlink glob "$dir/*"; # remove all files in results directory | 
|  | 698     } else { | 
|  | 699         die "\nScript abborted!\n"; | 
|  | 700     } | 
|  | 701     return 1; | 
|  | 702 } | 
|  | 703 | 
|  | 704 | 
|  | 705 | 
|  | 706 ### Open a result file and print the header | 
|  | 707 sub open_result_file { | 
|  | 708     my ($filename, $category) = @_; | 
|  | 709     open ($Count_Group_Pres_Abs{$category}->{'fh'}, ">", "$filename"); # store filehandle in hash of hash | 
|  | 710     print { $Count_Group_Pres_Abs{$category}->{'fh'} } "# row_header\n"; # block around fh needed if not a simple string variable (or glob) | 
|  | 711     return 1; | 
|  | 712 } |