| 
3
 | 
     1 #!/usr/bin/perl
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 #######
 | 
| 
 | 
     4 # POD #
 | 
| 
 | 
     5 #######
 | 
| 
 | 
     6 
 | 
| 
 | 
     7 =pod
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 =head1 NAME
 | 
| 
 | 
    10 
 | 
| 
 | 
    11 C<tbl2tab.pl> - convert tbl to tab-separated format and back
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 =head1 SYNOPSIS
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 C<perl tbl2tab.pl -m tbl2tab -i feature_table.tbl -s -l locus_prefix>
 | 
| 
 | 
    16 
 | 
| 
 | 
    17 B<or>
 | 
| 
 | 
    18 
 | 
| 
 | 
    19 C<perl tbl2tab.pl -m tab2tbl -i feature_table.tab -g -l locus_prefix
 | 
| 
 | 
    20 -p "gnl|dbname|">
 | 
| 
 | 
    21 
 | 
| 
 | 
    22 =head1 DESCRIPTION
 | 
| 
 | 
    23 
 | 
| 
 | 
    24 NCBI's feature table (B<tbl>) format is needed for the submission of
 | 
| 
 | 
    25 genomic data to GenBank with the NCBI tools
 | 
| 
 | 
    26 L<Sequin|http://www.ncbi.nlm.nih.gov/Sequin/> or
 | 
| 
 | 
    27 L<tbl2asn|http://www.ncbi.nlm.nih.gov/genbank/tbl2asn2>. tbl files
 | 
| 
 | 
    28 can be created with automatic annotation systems like
 | 
| 
 | 
    29 L<Prokka|http://www.vicbioinformatics.com/software.prokka.shtml>.
 | 
| 
 | 
    30 C<tbl2tab.pl> can convert a tbl file to a tab-separated format (tab)
 | 
| 
 | 
    31 and back to the tbl format. The tab-delimited format is useful to
 | 
| 
 | 
    32 manipulate the data more comfortably in a spreadsheet software (e.g.
 | 
| 
 | 
    33 LibreOffice or MS Excel). For a conversion back to tbl format save
 | 
| 
 | 
    34 the file in the spreadsheet software as a tab-delimited text file.
 | 
| 
 | 
    35 The script is intended for microbial genomes, but might also be
 | 
| 
 | 
    36 useful for eukaryotes.
 | 
| 
 | 
    37 
 | 
| 
 | 
    38 Regular expressions are applied in mode B<tbl2tab> to correct gene
 | 
| 
 | 
    39 names and words in '/product' values  to lowercase initials (with
 | 
| 
 | 
    40 the exception of 'Rossman' and 'Willebrand'). The resulting tab file
 | 
| 
 | 
    41 can then be used to check for possible errors.
 | 
| 
 | 
    42 
 | 
| 
 | 
    43 The first four header columns of the B<tab> format are mandatory,
 | 
| 
 | 
    44 'seq_id' for the SeqID, and for each primary tag/feature (e.g. CDS,
 | 
| 
 | 
    45 RNAs, repeat_region etc.), 'start', 'stop', and 'primary_tag'. These
 | 
| 
 | 
    46 mandatory columns have to be filled in every row in the tab file.
 | 
| 
 | 
    47 All the following columns will be included as tags/qualifiers (e.g.
 | 
| 
 | 
    48 '/locus_tag', '/product', '/EC_number', '/note' etc.) in the
 | 
| 
 | 
    49 conversion to the tbl file if a value is present.
 | 
| 
 | 
    50 
 | 
| 
 | 
    51 There are three special cases:
 | 
| 
 | 
    52 
 | 
| 
 | 
    53 B<First>, '/pseudo' will be included as a tag if I<any> value (the
 | 
| 
 | 
    54 script uses 'T' for true) is present in the B<tab> format. If a
 | 
| 
 | 
    55 primary tag is indicated as pseudo both the primary tag and the
 | 
| 
 | 
    56 accessory 'gene' primary tag (for CDS/RNA features with option
 | 
| 
 | 
    57 B<-g>) will include a '/pseudo' qualifier in the resulting B<tbl>
 | 
| 
 | 
    58 file. B<Pseudo-genes> are indicated by 'pseudo' in the 'primary_tag'
 | 
| 
 | 
    59 column, thus the 'pseudo' column is ignored in these cases.
 | 
| 
 | 
    60 
 | 
| 
 | 
    61 B<Second>, tag '/gene_desc' is reserved for the 'product' values of
 | 
| 
 | 
    62 pseudo-genes, thus a 'gene_desc' column in a tab file will be
 | 
| 
 | 
    63 ignored in the conversion to tbl.
 | 
| 
 | 
    64 
 | 
| 
 | 
    65 B<Third>, column 'protein_id' in a tab file will also be ignored in
 | 
| 
 | 
    66 the conversion. '/protein_id' values are created from option B<-p>
 | 
| 
 | 
    67 and the locus_tag for each CDS primary feature.
 | 
| 
 | 
    68 
 | 
| 
 | 
    69 Furthermore, with option B<-s> G2L-style spreadsheet formulas
 | 
| 
 | 
    70 (L<Goettingen Genomics
 | 
| 
 | 
    71 Laboratory|http://appmibio.uni-goettingen.de/>) can be included with
 | 
| 
 | 
    72 additional columns, 'spreadsheet_locus_tag', 'position', 'distance',
 | 
| 
 | 
    73 'gene_number', and 'contig_order'. These columns will not be
 | 
| 
 | 
    74 included in a conversion to the tbl format. Thus, if you want to
 | 
| 
 | 
    75 include e.g. the locus_tags from the formula in column
 | 
| 
 | 
    76 'spreadsheet_locus_tag' in the resulting tbl file copy the B<values>
 | 
| 
 | 
    77 to the column 'locus_tag'!
 | 
| 
 | 
    78 
 | 
| 
 | 
    79 To illustrate the process two example files are included in the
 | 
| 
 | 
    80 repository, F<example.tbl> and F<example2.tab>, which are
 | 
| 
 | 
    81 interconvertible (see L</"EXAMPLES"> below).
 | 
| 
 | 
    82 
 | 
| 
 | 
    83 B<Warning>, be aware of possible errors introduced by automatic
 | 
| 
 | 
    84 format conversions using a spreadsheet software like MS Excel, see
 | 
| 
 | 
    85 e.g. Zeeberg et al. 2004
 | 
| 
 | 
    86 (L<http://www.ncbi.nlm.nih.gov/pubmed/15214961>).
 | 
| 
 | 
    87 
 | 
| 
 | 
    88 For more information regarding the feature table and the submission
 | 
| 
 | 
    89 process see NCBI's L<prokaryotic annotation
 | 
| 
 | 
    90 guide|http://www.ncbi.nlm.nih.gov/genbank/genomesubmit> and the
 | 
| 
 | 
    91 L<bacterial genome submission
 | 
| 
 | 
    92 guide|http://www.ncbi.nlm.nih.gov/genbank/genomesubmit_annotation>.
 | 
| 
 | 
    93 
 | 
| 
 | 
    94 =head1 OPTIONS
 | 
| 
 | 
    95 
 | 
| 
 | 
    96 =head2 Mandatory options
 | 
| 
 | 
    97 
 | 
| 
 | 
    98 =over 20
 | 
| 
 | 
    99 
 | 
| 
 | 
   100 =item B<-m>=I<tbl2tab|tab2tbl>, B<-mode>=I<tbl2tab|tab2tbl>
 | 
| 
 | 
   101 
 | 
| 
 | 
   102 Conversion mode, either 'tbl2tab' or 'tab2tbl' [default = 'tbl2tab']
 | 
| 
 | 
   103 
 | 
| 
 | 
   104 =item B<-i>=I<str>, B<-input>=I<str>
 | 
| 
 | 
   105 
 | 
| 
 | 
   106 Input tbl or tab file to be converted to the other format
 | 
| 
 | 
   107 
 | 
| 
 | 
   108 =back
 | 
| 
 | 
   109 
 | 
| 
 | 
   110 =head2 Optional options
 | 
| 
 | 
   111 
 | 
| 
 | 
   112 =over 20
 | 
| 
 | 
   113 
 | 
| 
 | 
   114 =item B<-h>, B<-help>
 | 
| 
 | 
   115 
 | 
| 
 | 
   116 Help (perldoc POD)
 | 
| 
 | 
   117 
 | 
| 
 | 
   118 =item B<-v>, B<-version>
 | 
| 
 | 
   119 
 | 
| 
 | 
   120 Print version number to C<STDERR>
 | 
| 
 | 
   121 
 | 
| 
 | 
   122 =back
 | 
| 
 | 
   123 
 | 
| 
 | 
   124 =head3 Mode B<tbl2tab>
 | 
| 
 | 
   125 
 | 
| 
 | 
   126 =over 20
 | 
| 
 | 
   127 
 | 
| 
 | 
   128 =item B<-l>=I<str>, B<-locus_prefix>=I<str>
 | 
| 
 | 
   129 
 | 
| 
 | 
   130 Only in combination with option B<-s> and there mandatory to include
 | 
| 
 | 
   131 the locus_tag prefix in the formula for column 'spreadsheet_locus_tag'
 | 
| 
 | 
   132 
 | 
| 
 | 
   133 =item B<-c>, B<-concat>
 | 
| 
 | 
   134 
 | 
| 
 | 
   135 Concatenate values of identical tags within one primary tag with '~'
 | 
| 
 | 
   136 (e.g. several '/EC_number' or '/inference' tags)
 | 
| 
 | 
   137 
 | 
| 
 | 
   138 =item B<-e>=I<str>, B<-empty>=I<str>
 | 
| 
 | 
   139 
 | 
| 
 | 
   140 String used for primary features without value for a tag [default = '']
 | 
| 
 | 
   141 
 | 
| 
 | 
   142 =item B<-s>, B<-spreadsheet>
 | 
| 
 | 
   143 
 | 
| 
 | 
   144 Include formulas for spreadsheet editing
 | 
| 
 | 
   145 
 | 
| 
 | 
   146 =item B<-f>=I<e|g>, B<-formula_lang>=I<e|g>
 | 
| 
 | 
   147 
 | 
| 
 | 
   148 Syntax language of the spreadsheet formulas, either 'English' or
 | 
| 
 | 
   149 'German'. If you're still encountering problems with the formulas
 | 
| 
 | 
   150 set the decimal and thousands separator manually in the options of
 | 
| 
 | 
   151 the spreadsheet software (instead of using the operating system
 | 
| 
 | 
   152 separators). [default = 'e']
 | 
| 
 | 
   153 
 | 
| 
 | 
   154 =back
 | 
| 
 | 
   155 
 | 
| 
 | 
   156 =head3 Mode B<tab2tbl>
 | 
| 
 | 
   157 
 | 
| 
 | 
   158 =over 20
 | 
| 
 | 
   159 
 | 
| 
 | 
   160 =item B<-l>=I<str>, B<-locus_prefix>=I<str>
 | 
| 
 | 
   161 
 | 
| 
 | 
   162 Prefix to the SeqID if not present already in the SeqID
 | 
| 
 | 
   163 
 | 
| 
 | 
   164 =item B<-g>, B<-gene>
 | 
| 
 | 
   165 
 | 
| 
 | 
   166 Include accessory 'gene' primary tags (with '/gene', '/locus_tag'
 | 
| 
 | 
   167 and possibly '/pseudo' tags) for 'CDS/RNA' primary tags; NCBI standard
 | 
| 
 | 
   168 
 | 
| 
 | 
   169 =item B<-t>, B<-tags_full>
 | 
| 
 | 
   170 
 | 
| 
 | 
   171 Only in combination with option B<-g>, include '/gene' and
 | 
| 
 | 
   172 '/locus_tag' tags additionally in primary tag, not only in accessory
 | 
| 
 | 
   173 'gene' primary tag
 | 
| 
 | 
   174 
 | 
| 
 | 
   175 =item B<-p>=I<str>, B<-protein_id_prefix>=I<str>
 | 
| 
 | 
   176 
 | 
| 
 | 
   177 Prefix for '/protein_id' tags; don't forget the double quotes for
 | 
| 
 | 
   178 the string, otherwise the shell will intepret as pipe [default =
 | 
| 
 | 
   179 'gnl|goetting|']
 | 
| 
 | 
   180 
 | 
| 
 | 
   181 =back
 | 
| 
 | 
   182 
 | 
| 
 | 
   183 =head1 OUTPUT
 | 
| 
 | 
   184 
 | 
| 
 | 
   185 =over 20
 | 
| 
 | 
   186 
 | 
| 
 | 
   187 =item F<*.tab|tbl>
 | 
| 
 | 
   188 
 | 
| 
 | 
   189 Result file in the opposite format
 | 
| 
 | 
   190 
 | 
| 
 | 
   191 =item (F<hypo_putative_genes.txt>)
 | 
| 
 | 
   192 
 | 
| 
 | 
   193 Created in mode 'tab2tbl', indicates if CDSs are annotated as
 | 
| 
 | 
   194 'hypothetical/putative/predicted protein' but still have a gene name
 | 
| 
 | 
   195 
 | 
| 
 | 
   196 =back
 | 
| 
 | 
   197 
 | 
| 
 | 
   198 =head1 EXAMPLES
 | 
| 
 | 
   199 
 | 
| 
 | 
   200 =over
 | 
| 
 | 
   201 
 | 
| 
 | 
   202 =item C<perl tbl2tab.pl -m tbl2tab -i example.tbl -s -l EPE>
 | 
| 
 | 
   203 
 | 
| 
 | 
   204 =item C<perl tbl2tab.pl -m tab2tbl -i example2.tab -g -l EPE>
 | 
| 
 | 
   205 
 | 
| 
 | 
   206 =back
 | 
| 
 | 
   207 
 | 
| 
 | 
   208 =head1 VERSION
 | 
| 
 | 
   209 
 | 
| 
 | 
   210  0.2                                               update: 29-10-2014
 | 
| 
 | 
   211  0.1                                                       24-06-2014
 | 
| 
 | 
   212 
 | 
| 
 | 
   213 =head1 AUTHOR
 | 
| 
 | 
   214 
 | 
| 
 | 
   215  Andreas Leimbach                               aleimba[at]gmx[dot]de
 | 
| 
 | 
   216 
 | 
| 
 | 
   217 =head1 LICENSE
 | 
| 
 | 
   218 
 | 
| 
 | 
   219 This program is free software: you can redistribute it and/or modify
 | 
| 
 | 
   220 it under the terms of the GNU General Public License as published by
 | 
| 
 | 
   221 the Free Software Foundation; either version 3 (GPLv3) of the License,
 | 
| 
 | 
   222 or (at your option) any later version.
 | 
| 
 | 
   223 
 | 
| 
 | 
   224 This program is distributed in the hope that it will be useful, but
 | 
| 
 | 
   225 WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
| 
 | 
   226 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 | 
| 
 | 
   227 General Public License for more details.
 | 
| 
 | 
   228 
 | 
| 
 | 
   229 You should have received a copy of the GNU General Public License
 | 
| 
 | 
   230 along with this program. If not, see L<http://www.gnu.org/licenses/>.
 | 
| 
 | 
   231 
 | 
| 
 | 
   232 =cut
 | 
| 
 | 
   233 
 | 
| 
 | 
   234 
 | 
| 
 | 
   235 ########
 | 
| 
 | 
   236 # MAIN #
 | 
| 
 | 
   237 ########
 | 
| 
 | 
   238 
 | 
| 
 | 
   239 use strict;
 | 
| 
 | 
   240 use warnings;
 | 
| 
 | 
   241 use autodie;
 | 
| 
 | 
   242 use Getopt::Long;
 | 
| 
 | 
   243 use Pod::Usage;
 | 
| 
 | 
   244 
 | 
| 
 | 
   245 
 | 
| 
 | 
   246 
 | 
| 
 | 
   247 ### Get options with Getopt::Long, works also abbreviated and with two "--": -i, --i, -input ...
 | 
| 
 | 
   248 my $Input_File; # input file
 | 
| 
 | 
   249 my $Mode = 'tbl2tab'; # mode of script, i.e. either convert from tbl2tab or from tab2tbl; default 'tbl2tab'
 | 
| 
 | 
   250 my $Locus_Prefix = ''; # required for option 'spreadsheet' in mode 'tbl2tab', in mode 'tab2tbl' optional
 | 
| 
 | 
   251 my $Opt_Concat; # optionally, concatenate values of the same tag within one primary tag in a tbl file in one column in the resulting tab file with '~' (e.g. several 'EC_number' tags etc.)
 | 
| 
 | 
   252 my $Empty = ''; # optionally, set what should be used for tags without a value in resulting tab file; default is nothing
 | 
| 
 | 
   253 my $Opt_Spreadsheet; # optionally, include formulas for spreadsheet editing (e.g. Libre Office, MS Excel)
 | 
| 
 | 
   254 my $Formula_Lang_Spreadsheet = 'e'; # optionally, either German or English formulas in Spreadsheet option; default 'e' for English
 | 
| 
 | 
   255 my $Opt_Gene; # optionally, include accessory gene primary tags (with '/gene' and '/locus_tag' [and '/pseudo'] tags) for CDS|RNA primary tags
 | 
| 
 | 
   256 my $Opt_Tags_Full; # optionally, include '/gene' and '/locus_tag' additionally in primary tag not only accessory 'gene' primary tag
 | 
| 
 | 
   257 my $Protein_Id_Prefix = 'gnl|goetting|'; # optionally give a different string to prefix the '/protein_id' tags
 | 
| 
 | 
   258 my $VERSION = 0.2;
 | 
| 
 | 
   259 my ($Opt_Version, $Opt_Help);
 | 
| 
 | 
   260 GetOptions ('input=s' => \$Input_File,
 | 
| 
 | 
   261             'mode=s' => \$Mode,
 | 
| 
 | 
   262             'locus_prefix:s' => \$Locus_Prefix,
 | 
| 
 | 
   263             'concat' => \$Opt_Concat,
 | 
| 
 | 
   264             'empty:s' => \$Empty,
 | 
| 
 | 
   265             'spreadsheet' => \$Opt_Spreadsheet,
 | 
| 
 | 
   266             'formula_lang:s' => \$Formula_Lang_Spreadsheet,
 | 
| 
 | 
   267             'gene' => \$Opt_Gene,
 | 
| 
 | 
   268             'tags_full' => \$Opt_Tags_Full,
 | 
| 
 | 
   269             'protein_id_prefix:s' => \$Protein_Id_Prefix,
 | 
| 
 | 
   270             'version' => \$Opt_Version,
 | 
| 
 | 
   271             'help|?' => \$Opt_Help);
 | 
| 
 | 
   272 
 | 
| 
 | 
   273 
 | 
| 
 | 
   274 
 | 
| 
 | 
   275 ### Run perldoc on POD
 | 
| 
 | 
   276 pod2usage(-verbose => 2) if ($Opt_Help);
 | 
| 
 | 
   277 die "$0 $VERSION\n" if ($Opt_Version);
 | 
| 
 | 
   278 if (!$Input_File) {
 | 
| 
 | 
   279     my $warning = "\n### Fatal error: Option '-i' or its argument is missing!\n";
 | 
| 
 | 
   280     pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
 | 
| 
 | 
   281 } elsif (!($Mode =~ /tbl2tab/i || $Mode =~ /tab2tbl/i)) { # case-insensitive
 | 
| 
 | 
   282     my $warning = "\n### Fatal error: Incorrect run mode with option '-m' given! Please choose either 'tbl2tab' or 'tab2tbl' for '-m'!\n";
 | 
| 
 | 
   283     pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
 | 
| 
 | 
   284 }
 | 
| 
 | 
   285 
 | 
| 
 | 
   286 
 | 
| 
 | 
   287 
 | 
| 
 | 
   288 ### Enforce mandatory or optional options
 | 
| 
 | 
   289 if ($Mode =~ /tbl2tab/i && ($Opt_Gene || $Opt_Tags_Full || $Protein_Id_Prefix ne 'gnl|goetting|')) {
 | 
| 
 | 
   290     warn "\nIncompatible option(s) '-g', '-p', or '-t' set with mode 'tbl2tab'. Ignoring the option(s)!\n";
 | 
| 
 | 
   291 } elsif ($Mode =~ /tab2tbl/i && ($Opt_Concat || $Empty || $Opt_Spreadsheet || $Formula_Lang_Spreadsheet ne 'e')) {
 | 
| 
 | 
   292     warn "\nIncompatible option(s) '-c', '-e', '-f', or '-s' set with mode 'tab2tbl'. Ignoring the option(s)!\n";
 | 
| 
 | 
   293     $Formula_Lang_Spreadsheet = 'e' if ($Formula_Lang_Spreadsheet ne 'e'); # avoid die with error below if option not set correctly
 | 
| 
 | 
   294 }
 | 
| 
 | 
   295 
 | 
| 
 | 
   296 if ($Mode =~ /tbl2tab/i && ($Locus_Prefix || $Formula_Lang_Spreadsheet ne 'e') && !$Opt_Spreadsheet) {
 | 
| 
 | 
   297     warn "\nOption(s) '-l' or '-f' set, but not '-s'. Forcing option '-s'!\n";
 | 
| 
 | 
   298     $Opt_Spreadsheet = 1;
 | 
| 
 | 
   299 }
 | 
| 
 | 
   300 if ($Mode =~ /tbl2tab/i && !$Locus_Prefix && $Opt_Spreadsheet) {
 | 
| 
 | 
   301     warn "\nOption '-s' set, but not '-l'. Please give a prefix for the locus tags: ";
 | 
| 
 | 
   302     chomp($Locus_Prefix = <>);
 | 
| 
 | 
   303 }
 | 
| 
 | 
   304 
 | 
| 
 | 
   305 if ($Formula_Lang_Spreadsheet !~ /^(e|g)/i) {
 | 
| 
 | 
   306     die "\n### Fatal error: Incorrect language for option '-f' given! Please choose either 'e|eng' or 'g|ger' for '-f'!\n";
 | 
| 
 | 
   307 }
 | 
| 
 | 
   308 
 | 
| 
 | 
   309 if ($Mode =~ /tab2tbl/i && !$Opt_Gene && $Opt_Tags_Full) {
 | 
| 
 | 
   310     warn "\nOption '-t' set, but not '-g'. Forcing option '-g'!\n";
 | 
| 
 | 
   311     $Opt_Gene = 1;
 | 
| 
 | 
   312 }
 | 
| 
 | 
   313 
 | 
| 
 | 
   314 
 | 
| 
 | 
   315 
 | 
| 
 | 
   316 ### Read in tbl or tab-separated data and write to result file in the opposite format
 | 
| 
 | 
   317 my $Out_File = $Input_File;
 | 
| 
 | 
   318 $Out_File =~ s/^(.+)\.\w+$/$1/; # strip filename extension
 | 
| 
 | 
   319 my $Error_File = 'hypo_putative_genes.txt';
 | 
| 
 | 
   320 if ($Mode =~ /tbl2tab/i) {
 | 
| 
 | 
   321     my ($data_hash_ref, $tags_max_count_hash_ref) = read_tbl(); # subroutine
 | 
| 
 | 
   322     $Out_File .= '.tab';
 | 
| 
 | 
   323     write_tab($data_hash_ref, $tags_max_count_hash_ref); # subroutine
 | 
| 
 | 
   324 
 | 
| 
 | 
   325 } elsif ($Mode =~ /tab2tbl/i) {
 | 
| 
 | 
   326     $Out_File .= '.tbl';
 | 
| 
 | 
   327     read_tab_write_tbl(); # subroutine
 | 
| 
 | 
   328 }
 | 
| 
 | 
   329 
 | 
| 
 | 
   330 
 | 
| 
 | 
   331 
 | 
| 
 | 
   332 ### Message which file was created
 | 
| 
 | 
   333 if ($Mode =~ /tbl2tab/i) {
 | 
| 
 | 
   334     print "Input tbl file '$Input_File' was converted to tab output file '$Out_File'!\n";
 | 
| 
 | 
   335 } elsif ($Mode =~ /tab2tbl/i) {
 | 
| 
 | 
   336     print "Input tab file '$Input_File' was converted to tbl output file '$Out_File'!\n";
 | 
| 
 | 
   337 }
 | 
| 
 | 
   338 
 | 
| 
 | 
   339 if (-e $Error_File) {
 | 
| 
 | 
   340     if (-s $Error_File >= 40) { # smaller than just the header, which should be 27 bytes
 | 
| 
 | 
   341         warn "\n### Warning: CDSs found that are annotated with 'hypothetical|putative|predicted protein' but still include a '/gene' tag, see file '$Error_File'!\n";
 | 
| 
 | 
   342     } elsif (-s $Error_File < 40) {
 | 
| 
 | 
   343         unlink $Error_File;
 | 
| 
 | 
   344     }
 | 
| 
 | 
   345 }
 | 
| 
 | 
   346 
 | 
| 
 | 
   347 
 | 
| 
 | 
   348 exit;
 | 
| 
 | 
   349 
 | 
| 
 | 
   350 ###############
 | 
| 
 | 
   351 # Subroutines #
 | 
| 
 | 
   352 ###############
 | 
| 
 | 
   353 
 | 
| 
 | 
   354 ### Subroutine to test for file existence and give warning to STDERR
 | 
| 
 | 
   355 sub file_exist {
 | 
| 
 | 
   356     my $file = shift;
 | 
| 
 | 
   357     if (-e $file) {
 | 
| 
 | 
   358         warn "\nThe result file \'$file\' exists already and will be overwritten!\n\n";
 | 
| 
 | 
   359         return 1;
 | 
| 
 | 
   360     }
 | 
| 
 | 
   361     return 0;
 | 
| 
 | 
   362 }
 | 
| 
 | 
   363 
 | 
| 
 | 
   364 
 | 
| 
 | 
   365 
 | 
| 
 | 
   366 ### Print tag values to tab result file
 | 
| 
 | 
   367 sub print_tag2tab {
 | 
| 
 | 
   368     my ($tag, $data_hash_ref, $seq_id, $pos, $tag_max_count) = @_;
 | 
| 
 | 
   369 
 | 
| 
 | 
   370     if ($Opt_Concat) { # values concatenated by '~' in $data_hash_ref
 | 
| 
 | 
   371         if ($data_hash_ref->{$seq_id}->{$pos}->{$tag}) {
 | 
| 
 | 
   372             print "\t$data_hash_ref->{$seq_id}->{$pos}->{$tag}";
 | 
| 
 | 
   373             return 1;
 | 
| 
 | 
   374         } else {
 | 
| 
 | 
   375             print "\t$Empty";
 | 
| 
 | 
   376             return 1;
 | 
| 
 | 
   377         }
 | 
| 
 | 
   378 
 | 
| 
 | 
   379     } elsif (!$Opt_Concat) { # split concatenated values in individual values
 | 
| 
 | 
   380         my @values = split(/~/, $data_hash_ref->{$seq_id}->{$pos}->{$tag}) if ($data_hash_ref->{$seq_id}->{$pos}->{$tag});
 | 
| 
 | 
   381         if (@values) {
 | 
| 
 | 
   382             foreach (@values) {
 | 
| 
 | 
   383                 print "\t$_";
 | 
| 
 | 
   384             }
 | 
| 
 | 
   385             print "\t$Empty" x ($tag_max_count - @values); # fill residual columns till maximum occurrence
 | 
| 
 | 
   386         } else {
 | 
| 
 | 
   387             print "\t$Empty" x $tag_max_count;
 | 
| 
 | 
   388         }
 | 
| 
 | 
   389         return 1;
 | 
| 
 | 
   390     }
 | 
| 
 | 
   391 
 | 
| 
 | 
   392     return 0;
 | 
| 
 | 
   393 }
 | 
| 
 | 
   394 
 | 
| 
 | 
   395 
 | 
| 
 | 
   396 
 | 
| 
 | 
   397 ### Print tag values to tbl result file
 | 
| 
 | 
   398 sub print_tag2tbl {
 | 
| 
 | 
   399     my ($tag, $value) = @_;
 | 
| 
 | 
   400     return 0 if ($value =~ /^$Empty$/);
 | 
| 
 | 
   401     if ($tag =~ /pseudo/) {
 | 
| 
 | 
   402         print "\t\t\t$tag\n";
 | 
| 
 | 
   403         return 1;
 | 
| 
 | 
   404     }
 | 
| 
 | 
   405 
 | 
| 
 | 
   406     ### remove quotations from values introduced by Excel by saving as tab-separated file:
 | 
| 
 | 
   407     ### https://office.microsoft.com/en-001/excel-help/excel-formatting-and-features-that-are-not-transferred-to-other-file-formats-HP010014105.aspx
 | 
| 
 | 
   408     ### - if a cell contains a comma, the cell contents are enclosed in double quotation marks
 | 
| 
 | 
   409     ### - if the data contains a quotation mark, double quotation marks will replace the quotation mark, and the cell contents are also enclosed in double quotation marks
 | 
| 
 | 
   410     $value =~ s/""/"/g;
 | 
| 
 | 
   411     $value =~ s/^"//;
 | 
| 
 | 
   412     $value =~ s/"$//;
 | 
| 
 | 
   413 
 | 
| 
 | 
   414     foreach (split(/~/, $value)) {
 | 
| 
 | 
   415         print "\t\t\t$tag\t$_\n";
 | 
| 
 | 
   416     }
 | 
| 
 | 
   417     return 1;
 | 
| 
 | 
   418 }
 | 
| 
 | 
   419 
 | 
| 
 | 
   420 
 | 
| 
 | 
   421 ### Read in data from tab input file and write it to tbl output file
 | 
| 
 | 
   422 sub read_tab_write_tbl {
 | 
| 
 | 
   423     file_exist($Error_File); # subroutine
 | 
| 
 | 
   424     open (my $error_file_fh, ">", $Error_File);
 | 
| 
 | 
   425     print $error_file_fh "row\tlocus_tag\tgene\tproduct\n";
 | 
| 
 | 
   426 
 | 
| 
 | 
   427     open (my $input_file_fh, "<", $Input_File);
 | 
| 
 | 
   428     my $header = <$input_file_fh>;
 | 
| 
 | 
   429     $header =~ s/\R/\012/; # convert line to unix-style line endings
 | 
| 
 | 
   430     chomp $header;
 | 
| 
 | 
   431     if ($header !~ /^seq_id\tstart\tstop\tprimary_tag\t/) { # check if tbl file starts with mandatory header fields or quit
 | 
| 
 | 
   432         die "\n### Fatal error: Input tab file '$Input_File' doesn't start with the mandatory 'seq_id', 'start', 'stop', and 'primary_tag' tab-separated header fields. Sure this is a valid tab file?\nExiting program!\n\n";
 | 
| 
 | 
   433     }
 | 
| 
 | 
   434 
 | 
| 
 | 
   435     my @tags;
 | 
| 
 | 
   436     foreach (split(/\t/, $header)) {
 | 
| 
 | 
   437         last if (/spreadsheet_locus_tag/); # skip all optional extra spreadsheet columns
 | 
| 
 | 
   438         push(@tags, $_); # store all header fields/columns to associate with each field in each line
 | 
| 
 | 
   439     }
 | 
| 
 | 
   440 
 | 
| 
 | 
   441     file_exist($Out_File); # subroutine
 | 
| 
 | 
   442     open (my $out_file_fh, ">", $Out_File);
 | 
| 
 | 
   443     select $out_file_fh; # select fh for standard print/f output
 | 
| 
 | 
   444 
 | 
| 
 | 
   445     my $row = 1; # count row numbers of tab input file for $Error_File (start with '1' as header already parsed above)
 | 
| 
 | 
   446     my $seq_id = ''; # store previous SeqID for multi-contig/replicon tab files
 | 
| 
 | 
   447     while (<$input_file_fh>) {
 | 
| 
 | 
   448         $row++;
 | 
| 
 | 
   449         $_ =~ s/\R/\012/; # convert line to unix-style line ending
 | 
| 
 | 
   450         chomp;
 | 
| 
 | 
   451         next if ($_ =~ /^\s+$/ || $_ =~ /^$/); # skip empty lines
 | 
| 
 | 
   452 
 | 
| 
 | 
   453         my ($locus_tag, $gene, $hypo_putative) = ('', '', ''); # needed for $Error_File
 | 
| 
 | 
   454 
 | 
| 
 | 
   455         my @cells = split(/\t/, $_);
 | 
| 
 | 
   456         for (my $i = 0; $i < 4; $i++) { # check each row for mandatory fields
 | 
| 
 | 
   457             if ($cells[$i] =~ /^$/) {
 | 
| 
 | 
   458                 close $out_file_fh;
 | 
| 
 | 
   459                 unlink $Out_File;
 | 
| 
 | 
   460                 die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a value for one of the mandatory fields 'seq_id', 'start', 'stop', or 'primary_tag'!\nExiting program!\n\n";
 | 
| 
 | 
   461             }
 | 
| 
 | 
   462         }
 | 
| 
 | 
   463 
 | 
| 
 | 
   464         ### print SeqID
 | 
| 
 | 
   465         if ($cells[0] ne $seq_id) { # print new contig for multi-contig/replicon tab files
 | 
| 
 | 
   466             $seq_id = $cells[0];
 | 
| 
 | 
   467             $cells[0] = $Locus_Prefix."_".$cells[0] if ($cells[0] !~ /$Locus_Prefix/ && $Locus_Prefix); # append locus_tag prefix only if not present already
 | 
| 
 | 
   468             print ">Feature $cells[0]\n";
 | 
| 
 | 
   469         }
 | 
| 
 | 
   470 
 | 
| 
 | 
   471         ### print accessory 'gene' primary tags with '/locus_tag', '/gene', and potential '/pseudo' tags
 | 
| 
 | 
   472         if ($Opt_Gene && $cells[3] =~ /CDS|RNA/) { # accessory 'gene' primary tags only for CDS and RNA (rRNA, tRNA ...) features
 | 
| 
 | 
   473             print "$cells[1]\t$cells[2]\tgene\n";
 | 
| 
 | 
   474             my $column_count = 0;
 | 
| 
 | 
   475             foreach my $tag (@tags) {
 | 
| 
 | 
   476                 if ($tag =~ /locus_tag/ && $cells[$column_count] =~ /^$/) { # CDSs|RNAs mandatory need a '/locus_tag' with option '-g'
 | 
| 
 | 
   477                     close $out_file_fh;
 | 
| 
 | 
   478                     unlink $Out_File;
 | 
| 
 | 
   479                     die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'locus_tag' which is mandatory for option '-g'!\nExiting program!\n\n";
 | 
| 
 | 
   480                 }
 | 
| 
 | 
   481                 print_tag2tbl($tag, $cells[$column_count]) if ($tag =~ /locus_tag|^gene$|pseudo/); # subroutine; '^gene$' needed, so 'gene_desc' isn't hit (see below)
 | 
| 
 | 
   482                 $column_count++;
 | 
| 
 | 
   483             }
 | 
| 
 | 
   484         }
 | 
| 
 | 
   485 
 | 
| 
 | 
   486         ### print primary tag
 | 
| 
 | 
   487         print "$cells[1]\t$cells[2]"; # start\tstop
 | 
| 
 | 
   488         if ($cells[3] =~ /pseudo/) { # pseudo-gene should include '/pseudo' tag
 | 
| 
 | 
   489             print "\tgene\n";
 | 
| 
 | 
   490             print "\t\t\tpseudo\n";
 | 
| 
 | 
   491         } else {
 | 
| 
 | 
   492             print "\t$cells[3]\n";
 | 
| 
 | 
   493         }
 | 
| 
 | 
   494 
 | 
| 
 | 
   495         ### print tags with values
 | 
| 
 | 
   496         for (my $i = 4; $i < @tags; $i++) { # start with field 5 of array with header fields/columns (the first 4 are mandatory, see above)
 | 
| 
 | 
   497             next if ($tags[$i] =~ /gene_desc/); # skip 'gene_desc' fields in tab file, reserved for pseudo-genes (see below)
 | 
| 
 | 
   498 
 | 
| 
 | 
   499             ### enforce mandatory tags for CDS primary tags
 | 
| 
 | 
   500             if ($tags[$i] =~ /product/ && $cells[3] =~ /CDS/) {
 | 
| 
 | 
   501                 if ($cells[$i] =~ /(hypothetical|putative|predicted) protein/) { # needed for $Error_File
 | 
| 
 | 
   502                     $hypo_putative = $cells[$i];
 | 
| 
 | 
   503                 } elsif ($cells[$i] =~ /^$/) { # CDSs mandatory need a value for '/product'
 | 
| 
 | 
   504                     close $out_file_fh;
 | 
| 
 | 
   505                     unlink $Out_File;
 | 
| 
 | 
   506                     die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'product' value which is mandatory for CDS primary tags!\nExiting program!\n\n";
 | 
| 
 | 
   507                 }
 | 
| 
 | 
   508             }
 | 
| 
 | 
   509 
 | 
| 
 | 
   510             if ($tags[$i] =~ /locus_tag/ && $cells[3] =~ /CDS/) { # '/protein_id' mandatory for 'CDS' primary tags
 | 
| 
 | 
   511                 $locus_tag = $cells[$i]; # needed for $Error_File
 | 
| 
 | 
   512                 my $protein_id = "$Protein_Id_Prefix".$cells[$i];
 | 
| 
 | 
   513                 print_tag2tbl('protein_id', $protein_id); # subroutine
 | 
| 
 | 
   514             }
 | 
| 
 | 
   515             next if ($tags[$i] =~ /protein_id/); # skip 'protein_id' field in tab file as they should be created from the 'locus_tag' column
 | 
| 
 | 
   516 
 | 
| 
 | 
   517             $gene = $cells[$i] if ($tags[$i] =~ /^gene$/ && $cells[3] =~ /CDS/); # needed for $Error_File
 | 
| 
 | 
   518 
 | 
| 
 | 
   519             ### enforce mandatory tags for CDS/RNA primary tags
 | 
| 
 | 
   520             next if ($tags[$i] =~ /locus_tag|^gene$/ && $Opt_Gene && !$Opt_Tags_Full && $cells[3] =~ /CDS|RNA/); # skip '/locus_tag' and '/gene' tags if accessory gene primary tags are present for CDS|RNA features (except option '-t' is set)
 | 
| 
 | 
   521             if ($tags[$i] =~ /product/ && $cells[3] =~ /RNA/ && $cells[$i] =~ /^$/) { # RNAs mandatory need a value for '/product'
 | 
| 
 | 
   522                 close $out_file_fh;
 | 
| 
 | 
   523                 unlink $Out_File;
 | 
| 
 | 
   524                 die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'product' value which is mandatory for RNA primary tags!\nExiting program!\n\n";
 | 
| 
 | 
   525             }
 | 
| 
 | 
   526 
 | 
| 
 | 
   527             ### enforce mandatory tag for pseudo-genes (have 'pseudo' as primary tag in tab file)
 | 
| 
 | 
   528             if ($tags[$i] =~ /locus_tag/ && $Opt_Gene && $cells[3] =~ /pseudo/ && $cells[$i] =~ /^$/) { # pseudo-genes mandatory need a '/locus_tag' with option '-g'
 | 
| 
 | 
   529                 close $out_file_fh;
 | 
| 
 | 
   530                 unlink $Out_File;
 | 
| 
 | 
   531                 die "\n### Fatal error: Row $row of input tab file '$Input_File' is missing a 'locus_tag' which is mandatory for option '-g'!\nExiting program!\n\n";
 | 
| 
 | 
   532             }
 | 
| 
 | 
   533 
 | 
| 
 | 
   534             ### the rest
 | 
| 
 | 
   535             if ($tags[$i] =~ /product/ && $cells[3] =~ /pseudo/) { # write 'product' values for pseudo-genes to '/gene_desc' tags
 | 
| 
 | 
   536                 print_tag2tbl('gene_desc', $cells[$i]); # subroutine
 | 
| 
 | 
   537             } elsif ($tags[$i] =~ /pseudo/ && $cells[3] =~ /pseudo/) { # skip 'pseudo' tag if pseudo-gene
 | 
| 
 | 
   538                 next;
 | 
| 
 | 
   539             } else {
 | 
| 
 | 
   540                 print_tag2tbl($tags[$i], $cells[$i]); # subroutine
 | 
| 
 | 
   541             }
 | 
| 
 | 
   542 
 | 
| 
 | 
   543         }
 | 
| 
 | 
   544         print $error_file_fh "$row\t$locus_tag\t$gene\t$hypo_putative\n" if ($hypo_putative && $gene);
 | 
| 
 | 
   545     }
 | 
| 
 | 
   546 
 | 
| 
 | 
   547     select STDOUT;
 | 
| 
 | 
   548     close $input_file_fh;
 | 
| 
 | 
   549     close $error_file_fh;
 | 
| 
 | 
   550     close $out_file_fh;
 | 
| 
 | 
   551     return 1;
 | 
| 
 | 
   552 }
 | 
| 
 | 
   553 
 | 
| 
 | 
   554 
 | 
| 
 | 
   555 
 | 
| 
 | 
   556 ### Read in data from tbl input file
 | 
| 
 | 
   557 sub read_tbl {
 | 
| 
 | 
   558     open (my $input_file_fh, "<", $Input_File);
 | 
| 
 | 
   559 
 | 
| 
 | 
   560     my $seq_id = <$input_file_fh>; # SeqID
 | 
| 
 | 
   561     $seq_id =~ s/\R/\012/; # convert line to unix-style line endings
 | 
| 
 | 
   562     chomp $seq_id;
 | 
| 
 | 
   563     if ($seq_id !~ /^>Feature/) { # check if tbl file starts with mandatory '>Feature' and get first SeqID
 | 
| 
 | 
   564         die "\n### Fatal error: tbl file doesn't start with a '>Feature SeqID' line. Sure this is a valid tbl file?\nExiting program!\n\n";
 | 
| 
 | 
   565     } else {
 | 
| 
 | 
   566         $seq_id =~ s/>Feature (\S+)\s*$/$1/; # only use non-whitespace characters as SeqID
 | 
| 
 | 
   567     }
 | 
| 
 | 
   568 
 | 
| 
 | 
   569     my %data; # hash-in-hash-in-hash to store tbl input data
 | 
| 
 | 
   570     my $pos_key; # store start..stop for each primary tag and use as key in %data
 | 
| 
 | 
   571     my $primary_tag; # store previous primary tag to determine if values for repeatedly occuring tags should be concatenated
 | 
| 
 | 
   572     my %tags_max_count; # hash to store all occuring tags with maximal number of presence (within a single primary tag) in the tbl file for final tab column headers
 | 
| 
 | 
   573     my @tags; # array to store all tags of each primary tag, supplement to %tags_max_count
 | 
| 
 | 
   574 
 | 
| 
 | 
   575     while (<$input_file_fh>) {
 | 
| 
 | 
   576         $_ =~ s/\R/\012/; # convert line to unix-style line ending
 | 
| 
 | 
   577         chomp;
 | 
| 
 | 
   578         next if ($_ =~ /^\s+$/); # skip empty lines
 | 
| 
 | 
   579         my @fields = split(/\t/, $_);
 | 
| 
 | 
   580 
 | 
| 
 | 
   581         ### get next SeqID from '>Feature' line for multi-contig/replicon tbl files
 | 
| 
 | 
   582         if ($fields[0] =~ /^>Feature (\S+)\s*$/) {
 | 
| 
 | 
   583             $seq_id = $1;
 | 
| 
 | 
   584 
 | 
| 
 | 
   585         ### get primary tags/features and fill %tags_max_count
 | 
| 
 | 
   586         } elsif ($fields[0] =~ /^\d+$/) { # $fields[2] with primary tag
 | 
| 
 | 
   587             foreach my $tag (@tags) { # fill %tags_max_count for previous primary tag
 | 
| 
 | 
   588                 if ($tags_max_count{$tag}) {
 | 
| 
 | 
   589                     $tags_max_count{$tag} = grep(/$tag/, @tags) if ($tags_max_count{$tag} < grep(/$tag/, @tags));
 | 
| 
 | 
   590                 } elsif (!$tags_max_count{$tag}) {
 | 
| 
 | 
   591                     $tags_max_count{$tag} = grep(/$tag/, @tags);
 | 
| 
 | 
   592                 }
 | 
| 
 | 
   593             }
 | 
| 
 | 
   594             @tags = (); # empty tags array for new current primary tag
 | 
| 
 | 
   595 
 | 
| 
 | 
   596             $pos_key = "$fields[0]..$fields[1]"; # position of primary tag used as key for %data, "start..stop"
 | 
| 
 | 
   597             $primary_tag = $fields[2];
 | 
| 
 | 
   598             if (!$data{$seq_id}->{$pos_key}->{'primary_tag'} || $data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /gene/) { # if primary tag not present or overwrite accessory 'gene' primary tag
 | 
| 
 | 
   599                 $data{$seq_id}->{$pos_key}->{'primary_tag'} = $primary_tag; # store data in anonymous hash-in-hash-in-hash
 | 
| 
 | 
   600                 $data{$seq_id}->{$pos_key}->{'start'} = $fields[0]; # to be able to sort afterwards via the start position
 | 
| 
 | 
   601             } elsif ($data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /pseudo/) { # 'gene' primary tag with '/pseudo' tag will be replaced by 'pseudo' primary tag for pseudo-genes (see below), however if 'gene' primary tag is ACCESSORY to CDS|RNA primary tag replace by this primary tag and include '/pseudo' tag
 | 
| 
 | 
   602                 $data{$seq_id}->{$pos_key}->{'primary_tag'} = $primary_tag;
 | 
| 
 | 
   603                 $data{$seq_id}->{$pos_key}->{'pseudo'} = 'T'; # value 'T' for true
 | 
| 
 | 
   604             }
 | 
| 
 | 
   605 
 | 
| 
 | 
   606         ### get tags/qualifiers
 | 
| 
 | 
   607         } elsif ($fields[3] =~ /^\w+/) {
 | 
| 
 | 
   608             push(@tags, $fields[3]) if ($fields[3] !~ /gene_desc/); # store tags for current primary tag; skip '/gene_desc' as reserved for pseudo-genes (replaced by '/product' see below)
 | 
| 
 | 
   609             if ($fields[3] =~ /pseudo/) {
 | 
| 
 | 
   610                 if ($data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /gene/) { # change 'gene' primary tag of pseudo-genes to 'pseudo' (if accessory 'gene' primary tag will be replaced by *actual* primary tag, see above)
 | 
| 
 | 
   611                     $data{$seq_id}->{$pos_key}->{'primary_tag'} = 'pseudo';
 | 
| 
 | 
   612                 } else { # else include a '/pseudo' tag with value 'T' for true
 | 
| 
 | 
   613                     $data{$seq_id}->{$pos_key}->{'pseudo'} = 'T';
 | 
| 
 | 
   614                 }
 | 
| 
 | 
   615                 next; # next line
 | 
| 
 | 
   616             }
 | 
| 
 | 
   617 
 | 
| 
 | 
   618             ### remove quotations from values introduced by Excel by saving as tab-separated file (see above)
 | 
| 
 | 
   619             $fields[4] =~ s/""/"/g;
 | 
| 
 | 
   620             $fields[4] =~ s/^"//;
 | 
| 
 | 
   621             $fields[4] =~ s/"$//;
 | 
| 
 | 
   622 
 | 
| 
 | 
   623             ### adjust '/gene' and '/product' values to NCBI standard
 | 
| 
 | 
   624             if ($fields[3] =~ /gene/) {
 | 
| 
 | 
   625                 $fields[4] =~ s/(\w+)/\l$1/; # first character of gene name should be lower case
 | 
| 
 | 
   626                 $fields[4] =~ s/^(\w)$/\u$1/; # one letter phage genes should be upper case
 | 
| 
 | 
   627             } elsif ($fields[3] =~ /product/) {
 | 
| 
 | 
   628                 $fields[4] =~ s/\b([A-Z][a-z]{3,})/\l$1/g; # lower the case for '/protein' value initials
 | 
| 
 | 
   629                 $fields[4] =~ s/(rossman|willebrand)/\u$1/; # exception to the rule
 | 
| 
 | 
   630             }
 | 
| 
 | 
   631 
 | 
| 
 | 
   632             if ($fields[3] =~ /gene_desc/) { # '/gene_desc' tags from pseudo-genes replaced by '/product' for resulting tab file
 | 
| 
 | 
   633                 $data{$seq_id}->{$pos_key}->{'product'} = $fields[4];
 | 
| 
 | 
   634                 next;
 | 
| 
 | 
   635             }
 | 
| 
 | 
   636             if ($data{$seq_id}->{$pos_key}->{$fields[3]} && $data{$seq_id}->{$pos_key}->{'primary_tag'} =~ /$primary_tag/) { # tag already exists for this position (e.g. several EC_numbers), concatenate the additional values with '~' as separator only WITHIN the same primary tag (OTHERWISE overwrite in 'else' below)
 | 
| 
 | 
   637                 $data{$seq_id}->{$pos_key}->{$fields[3]} .= '~'.$fields[4];
 | 
| 
 | 
   638             } else { # tag doesn't exist yet or overwrite if current primary tag at the same position of previous (e.g. accessory 'gene' primary tag to CDS/RNA primary tag)
 | 
| 
 | 
   639                 $data{$seq_id}->{$pos_key}->{$fields[3]} = $fields[4];
 | 
| 
 | 
   640             }
 | 
| 
 | 
   641         }
 | 
| 
 | 
   642     }
 | 
| 
 | 
   643     foreach my $tag (@tags) { # fill %tags_max_count for last primary tag
 | 
| 
 | 
   644         if ($tags_max_count{$tag}) {
 | 
| 
 | 
   645             $tags_max_count{$tag} = grep(/$tag/, @tags) if ($tags_max_count{$tag} < grep(/$tag/, @tags));
 | 
| 
 | 
   646         } elsif (!$tags_max_count{$tag}) {
 | 
| 
 | 
   647             $tags_max_count{$tag} = grep(/$tag/, @tags);
 | 
| 
 | 
   648         }
 | 
| 
 | 
   649     }
 | 
| 
 | 
   650     close $input_file_fh;
 | 
| 
 | 
   651     return \%data, \%tags_max_count;
 | 
| 
 | 
   652 }
 | 
| 
 | 
   653 
 | 
| 
 | 
   654 
 | 
| 
 | 
   655 
 | 
| 
 | 
   656 ### Write data to tab output file
 | 
| 
 | 
   657 sub write_tab {
 | 
| 
 | 
   658     my ($data_hash_ref, $tags_max_count_hash_ref) = @_;
 | 
| 
 | 
   659     file_exist($Out_File); # subroutine
 | 
| 
 | 
   660     open (my $out_file_fh, ">", $Out_File);
 | 
| 
 | 
   661     select $out_file_fh; # select fh for standard print/f output
 | 
| 
 | 
   662 
 | 
| 
 | 
   663     ### print header for tab result file
 | 
| 
 | 
   664     print "seq_id\tstart\tstop\tprimary_tag\tlocus_tag"; # mandatory columns/fields in tab file
 | 
| 
 | 
   665     if ($Opt_Concat) {
 | 
| 
 | 
   666         foreach (sort keys %{$tags_max_count_hash_ref}) { # print residual tags
 | 
| 
 | 
   667             print "\t$_" if (!/locus_tag/);
 | 
| 
 | 
   668         }
 | 
| 
 | 
   669     } elsif (!$Opt_Concat) {
 | 
| 
 | 
   670         foreach (sort keys %{$tags_max_count_hash_ref}) {
 | 
| 
 | 
   671             print "\t$_" x $tags_max_count_hash_ref->{$_} if (!/locus_tag/); # print max occurrence (in tbl) of each residual tag
 | 
| 
 | 
   672         }
 | 
| 
 | 
   673     }
 | 
| 
 | 
   674 
 | 
| 
 | 
   675     print "\tspreadsheet_locus_tag\tposition\tdistance\tgene_number\tcontig_order" if ($Opt_Spreadsheet); # print optional spreadsheet header columns
 | 
| 
 | 
   676     print "\n";
 | 
| 
 | 
   677 
 | 
| 
 | 
   678     ### variables for optional spreadsheet formulas
 | 
| 
 | 
   679     my @spread_columns = ("A".."AZ") if ($Opt_Spreadsheet); # columns in spreadsheet software for formulas
 | 
| 
 | 
   680     my ($tags_column_count, $spread_row_count, $spread_contig_order) = (0, 1, 1) if ($Opt_Spreadsheet);
 | 
| 
 | 
   681     if ($Opt_Spreadsheet) {
 | 
| 
 | 
   682         if ($Opt_Concat) {
 | 
| 
 | 
   683             $tags_column_count = (scalar keys %{$tags_max_count_hash_ref}) - scalar grep($_ =~ /locus_tag/, keys %{$tags_max_count_hash_ref}); # subtract tags for correct spreadsheet formulas
 | 
| 
 | 
   684         } elsif (!$Opt_Concat) {
 | 
| 
 | 
   685             foreach (keys %{$tags_max_count_hash_ref}) {
 | 
| 
 | 
   686                 next if ($_ =~ /locus_tag/);
 | 
| 
 | 
   687                 $tags_column_count += $tags_max_count_hash_ref->{$_};
 | 
| 
 | 
   688             }
 | 
| 
 | 
   689         }
 | 
| 
 | 
   690     }
 | 
| 
 | 
   691 
 | 
| 
 | 
   692     ### print data from hash into tab result file, optional with G2L-style spreadsheet formulas
 | 
| 
 | 
   693     foreach my $seq_id (sort keys %{$data_hash_ref}) {
 | 
| 
 | 
   694         foreach my $pos (sort {$data_hash_ref->{$seq_id}->{$a}->{'start'} <=> $data_hash_ref->{$seq_id}->{$b}->{'start'}} keys $data_hash_ref->{$seq_id}) { # sort each position entry in %data via start position
 | 
| 
 | 
   695             print "$seq_id";
 | 
| 
 | 
   696             my ($start, $stop) = split(/\.\./, $pos);
 | 
| 
 | 
   697             print "\t$start\t$stop";
 | 
| 
 | 
   698             print "\t$data_hash_ref->{$seq_id}->{$pos}->{'primary_tag'}"; # primary_tag should always be present
 | 
| 
 | 
   699             print_tag2tab('locus_tag', $data_hash_ref, $seq_id, $pos, 1); # subroutine; locus_tag should occur always just one time per primary tag
 | 
| 
 | 
   700             foreach (sort keys %{$tags_max_count_hash_ref}) { # print residual tags
 | 
| 
 | 
   701                 print_tag2tab($_, $data_hash_ref, $seq_id, $pos, $tags_max_count_hash_ref->{$_}) if (!/locus_tag/); # subroutine
 | 
| 
 | 
   702             }
 | 
| 
 | 
   703 
 | 
| 
 | 
   704             ### G2L-style spreadsheet formulas
 | 
| 
 | 
   705             if ($Opt_Spreadsheet) {
 | 
| 
 | 
   706                 $spread_row_count++;
 | 
| 
 | 
   707                 if ($Formula_Lang_Spreadsheet =~ /^e/i) {
 | 
| 
 | 
   708                     print "\t=\"$Locus_Prefix\"", '&"_"&A', "$spread_row_count&TEXT(", $spread_columns[$tags_column_count+8], $spread_row_count, ',"0000")&"0"'; # spreadsheet column 'spreadsheet_locus_tag'
 | 
| 
 | 
   709                 } elsif ($Formula_Lang_Spreadsheet =~ /^g/i) {
 | 
| 
 | 
   710                     print "\t=\"$Locus_Prefix\"", '&"_"&A', "$spread_row_count&TEXT(", $spread_columns[$tags_column_count+8], $spread_row_count, ';"0000")&"0"';
 | 
| 
 | 
   711                 }
 | 
| 
 | 
   712                 print "\t=MIN(B$spread_row_count:C$spread_row_count)"; # spreadsheet column 'position'
 | 
| 
 | 
   713                 print "\t=", $spread_columns[$tags_column_count+6], $spread_row_count + 1, "-MAX(B$spread_row_count:C$spread_row_count)"; # spreadsheet column 'distance'
 | 
| 
 | 
   714                 if ($Formula_Lang_Spreadsheet =~ /^e/i) {
 | 
| 
 | 
   715                     print "\t=IF(", $spread_columns[$tags_column_count+9], $spread_row_count, '=', $spread_columns[$tags_column_count+9], $spread_row_count - 1, ",$spread_columns[$tags_column_count+8]", $spread_row_count - 1, '+1,1)'; # spreadsheet column 'gene_number'
 | 
| 
 | 
   716                 } elsif ($Formula_Lang_Spreadsheet =~ /^g/i) {
 | 
| 
 | 
   717                     print "\t=WENN(", $spread_columns[$tags_column_count+9], $spread_row_count, '=', $spread_columns[$tags_column_count+9], $spread_row_count - 1, ";$spread_columns[$tags_column_count+8]", $spread_row_count - 1, '+1;1)';
 | 
| 
 | 
   718                 }
 | 
| 
 | 
   719                 print "\t$spread_contig_order"; # spreadsheet column 'contig_order'
 | 
| 
 | 
   720             }
 | 
| 
 | 
   721 
 | 
| 
 | 
   722             print "\n";
 | 
| 
 | 
   723         }
 | 
| 
 | 
   724         $spread_contig_order++; # next contig/replicon (SeqID) in tbl file
 | 
| 
 | 
   725     }
 | 
| 
 | 
   726 
 | 
| 
 | 
   727     select STDOUT;
 | 
| 
 | 
   728     close $out_file_fh;
 | 
| 
 | 
   729     return 1;
 | 
| 
 | 
   730 }
 |