| 3 | 1 #!/usr/bin/perl | 
|  | 2 | 
|  | 3 ####### | 
|  | 4 # POD # | 
|  | 5 ####### | 
|  | 6 | 
|  | 7 =pod | 
|  | 8 | 
|  | 9 =head1 NAME | 
|  | 10 | 
|  | 11 ecoli_mlst.pl                                       25-10-2011 | 
|  | 12 | 
|  | 13 =head1 SYNOPSIS | 
|  | 14 | 
|  | 15 C<perl ecoli_mlst.pl -a fas -g fasta> | 
|  | 16 | 
|  | 17 =head1 DESCRIPTION | 
|  | 18 | 
|  | 19 The script searches for multilocus sequence type (MLST) alleles in | 
|  | 20 I<E. coli> genomes according to Mark Achtman's scheme with seven | 
|  | 21 house-keeping genes (I<adk>, I<fumC>, I<gyrB>, I<icd>, I<mdh>, | 
|  | 22 I<purA>, and I<recA>) [Wirth et al., 2006]. I<NUCmer> from the | 
|  | 23 L<I<MUMmer package>|http://mummer.sourceforge.net/> is used to | 
|  | 24 compare the given allele sequences to bacterial genomes via | 
|  | 25 nucleotide alignments. | 
|  | 26 | 
|  | 27 Download the allele files (adk.fas ...) and the sequence type file | 
|  | 28 ('publicSTs.txt') from this website: | 
|  | 29                   http://mlst.warwick.ac.uk/mlst/dbs/Ecoli | 
|  | 30 | 
|  | 31 To run C<ecoli_mlst.pl> include all I<E. coli> genome files (file | 
|  | 32 extension e.g. 'fasta'), all allele sequence files (file extension | 
|  | 33 'fas') and 'publicSTs.txt' in the current working directory. The | 
|  | 34 allele profiles are parsed from the created *.coord files and written | 
|  | 35 to a result file, plus additional information from the file | 
|  | 36 'publicSTs.txt'. Also, the corresponding allele sequences (obtained | 
|  | 37 from the allele input files) are concatenated for each I<E. coli> | 
|  | 38 genome into a result multi-fasta file. Option B<-c> can be used to | 
|  | 39 initiate an alignment for this multi-fasta file with | 
|  | 40 L<I<ClustalW>|http://www.clustal.org/clustal2/> (standard alignment | 
|  | 41 parameters; has to be in the C<$PATH> or change variable | 
|  | 42 C<$clustal_call>). The alignment fasta output file can be used | 
|  | 43 directly for L<I<RAxML>|http://sco.h-its.org/exelixis/web/software/raxml/index.html>. CAREFUL the Phylip alignment format from | 
|  | 44 I<ClustalW> allows only 10 characters per strain ID. | 
|  | 45 | 
|  | 46 C<ecoli_mlst.pl> works with complete and draft genomes. However, | 
|  | 47 several genomes cannot be included in a single input file! | 
|  | 48 | 
|  | 49 Obviously, only for those genomes whose allele sequences have been | 
|  | 50 deposited in Achtman's allele database results can be obtained. If an | 
|  | 51 allele is not found in a genome it is marked by a '?' in the result | 
|  | 52 profile file and a place holder 'XXX' in the result fasta file. For | 
|  | 53 these cases a manual I<NUCmer> or I<BLASTN> might be useful to fill the | 
|  | 54 gaps and L<C<run_sub_seq.pl>|/run_sub_seq> to get the corresponding | 
|  | 55 'new' allele sequences. | 
|  | 56 | 
|  | 57 Non-NCBI fasta headers for the genome files have to have a unique ID | 
|  | 58 directly following the '>' (e.g. 'Sakai', '55989' ...). | 
|  | 59 | 
|  | 60 =head1 OPTIONS | 
|  | 61 | 
|  | 62 =head2 Mandatory options | 
|  | 63 | 
|  | 64 =over 22 | 
|  | 65 | 
|  | 66 =item B<-a>=I<str>, B<-alleles>=I<str> | 
|  | 67 | 
|  | 68 File extension of the MLST allele fasta files, e.g. 'fas' (<=> B<-g>). | 
|  | 69 | 
|  | 70 =item B<-g>=I<str>, B<-genomes>=I<str> | 
|  | 71 | 
|  | 72 File extension of the I<E. coli> genome fasta files, e.g. 'fasta' (<=> B<-a>). | 
|  | 73 | 
|  | 74 =back | 
|  | 75 | 
|  | 76 =head2 Optional options | 
|  | 77 | 
|  | 78 =over | 
|  | 79 | 
|  | 80 =item B<-h>, B<-help> | 
|  | 81 | 
|  | 82 Help (perldoc POD) | 
|  | 83 | 
|  | 84 =item B<-c>, B<-clustalw> | 
|  | 85 | 
|  | 86 Call L<I<ClustalW>|http://www.clustal.org/clustal2/> for alignment | 
|  | 87 | 
|  | 88 =back | 
|  | 89 | 
|  | 90 =head1 OUTPUT | 
|  | 91 | 
|  | 92 =over 17 | 
|  | 93 | 
|  | 94 =item F<ecoli_mlst_profile.txt> | 
|  | 95 | 
|  | 96 Tab-separated allele profiles for the I<E. coli> genomes, plus additional info from 'publicSTs.txt' | 
|  | 97 | 
|  | 98 =item F<ecoli_mlst_seq.fasta> | 
|  | 99 | 
|  | 100 Multi-fasta file of all concatenated allele sequences for each genome | 
|  | 101 | 
|  | 102 =item F<*.coord> | 
|  | 103 | 
|  | 104 Text files that contain the coordinates of the I<NUCmer> hits for each genome and allele | 
|  | 105 | 
|  | 106 =item (F<errors.txt>) | 
|  | 107 | 
|  | 108 Error file, summarizing number of not found alleles or unclear I<NUCmer> hits | 
|  | 109 | 
|  | 110 =item (F<ecoli_mlst_seq_aln.fasta>) | 
|  | 111 | 
|  | 112 Optional, L<I<ClustalW>|http://www.clustal.org/clustal2/> alignment in Phylip format | 
|  | 113 | 
|  | 114 =item (F<ecoli_mlst_seq_aln.dnd>) | 
|  | 115 | 
|  | 116 Optional, I<ClustalW> alignment guide tree | 
|  | 117 | 
|  | 118 =back | 
|  | 119 | 
|  | 120 =head1 EXAMPLE | 
|  | 121 | 
|  | 122 =over | 
|  | 123 | 
|  | 124 =item C<perl ecoli_mlst.pl -a fas -g fasta -c> | 
|  | 125 | 
|  | 126 =back | 
|  | 127 | 
|  | 128 =head1 VERSION | 
|  | 129 | 
|  | 130 0.3                                                update: 30-01-2013 | 
|  | 131 | 
|  | 132 =head1 AUTHOR | 
|  | 133 | 
|  | 134 Andreas Leimbach                                aleimba[at]gmx[dot]de | 
|  | 135 | 
|  | 136 =head1 LICENSE | 
|  | 137 | 
|  | 138 This program is free software: you can redistribute it and/or modify | 
|  | 139 it under the terms of the GNU General Public License as published by | 
|  | 140 the Free Software Foundation; either version 3 (GPLv3) of the License, | 
|  | 141 or (at your option) any later version. | 
|  | 142 | 
|  | 143 This program is distributed in the hope that it will be useful, but | 
|  | 144 WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 145 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 
|  | 146 General Public License for more details. | 
|  | 147 | 
|  | 148 You should have received a copy of the GNU General Public License | 
|  | 149 along with this program. If not, see L<http://www.gnu.org/licenses/>. | 
|  | 150 | 
|  | 151 =cut | 
|  | 152 | 
|  | 153 | 
|  | 154 ######## | 
|  | 155 # MAIN # | 
|  | 156 ######## | 
|  | 157 | 
|  | 158 use strict; | 
|  | 159 use warnings; | 
|  | 160 use Getopt::Long; # module to get options from the command line | 
|  | 161 | 
|  | 162 | 
|  | 163 ### Get the options with Getopt::Long, works also abbreviated and with two "--": -g, --g, -genomes ... | 
|  | 164 my $usage = "perldoc $0"; | 
|  | 165 die system($usage) unless @ARGV; | 
|  | 166 my $allele_ext = ''; # file extension of allele fasta files (<=> $genome_ext) | 
|  | 167 my $genome_ext = ''; # file extension of E. coli genome fasta files (<=> allele_ext) | 
|  | 168 my $clustalw = ''; # optionally, call ClustalW for alignment | 
|  | 169 my $help = ''; # run perldoc on the POD | 
|  | 170 GetOptions ('alleles=s' => \$allele_ext, 'genomes=s' => \$genome_ext, 'clustalw' => \$clustalw, 'help' => \$help); | 
|  | 171 | 
|  | 172 | 
|  | 173 ### Run perldoc on POD if no arguments or help | 
|  | 174 if (!$genome_ext || !$allele_ext) { | 
|  | 175     die system($usage); | 
|  | 176 } elsif ($help) { | 
|  | 177     die system($usage); | 
|  | 178 } | 
|  | 179 | 
|  | 180 | 
|  | 181 ### Check if result files already exist, overwrite or exit script after user question | 
|  | 182 my $ecoli_allele_seq = 'ecoli_mlst_seq.fasta'; # multi-fasta result file with concatenated allele seqs for each E. coli genome (sequences are from the MLST allele files) | 
|  | 183 my $clustal_aln = 'ecoli_mlst_seq_aln.fasta'; # fasta alignment file from optional clustalW alignment | 
|  | 184 if (-e $ecoli_allele_seq) { | 
|  | 185     print "A previous analysis exists, overwrite the old result files [y|n]? "; | 
|  | 186     my $stdin = <STDIN>; | 
|  | 187     chomp $stdin; | 
|  | 188     if ($stdin =~ /n/i) { | 
|  | 189          die "Script abborted!\n\n"; | 
|  | 190     } elsif ($stdin =~ /y/i) { | 
|  | 191          if (-e $clustal_aln) { # get rid of the optional clustalW alignment fasta-file before program run | 
|  | 192              unlink $clustal_aln; | 
|  | 193          } | 
|  | 194     } | 
|  | 195 } | 
|  | 196 | 
|  | 197 | 
|  | 198 ### The MLST alleles from the Achtman scheme | 
|  | 199 my @alleles = ('adk', 'fumC', 'gyrB', 'icd', 'mdh', 'purA', 'recA'); | 
|  | 200 | 
|  | 201 | 
|  | 202 ### Read directory and look for corresponding files | 
|  | 203 my $dirname = "."; | 
|  | 204 my @genome_files; # array to save all the fasta genome files | 
|  | 205 my %allele_files; # hash to save all the multi-fasta allele files | 
|  | 206 opendir(DIR, $dirname) or die "Can't opendir $dirname: $!\n"; | 
|  | 207 while (defined(my $file = readdir(DIR))) { # go through each file in the directory | 
|  | 208     if ($file eq 'ecoli_mlst_seq.fasta') { # don't use the result file from a previous analysis, the allele multi fasta file 'ecoli_mlst_multi.fasta' (see below)! | 
|  | 209     next; | 
|  | 210     } elsif ($file =~ /^(\S+)\.$genome_ext$/ && (-s $file < 9000000)) { # don't use files > 10 MB, E. coli fasta file shouldn't be too big (normally around 5 MB), otherwise probably wrong fasta file in folder | 
|  | 211         push (@genome_files, $file); | 
|  | 212     } | 
|  | 213     foreach my $allele (@alleles) { | 
|  | 214     if ($file =~ /^$allele\S*\.$allele_ext$/i) { | 
|  | 215         $allele_files{$allele} = $file; | 
|  | 216     } | 
|  | 217     } | 
|  | 218 } | 
|  | 219 closedir(DIR); | 
|  | 220 die "No E. coli genome fasta-files were found!\n" unless scalar @genome_files; # empty array in scalar context returns 0 | 
|  | 221 die "No allele fasta-files were found!\n" unless scalar %allele_files; # empty hash in scalar context returns 0 | 
|  | 222 | 
|  | 223 | 
|  | 224 ### Look for multi-fasta genome files and concatenate them to a single-fasta entry (e.g. draft genomes). Subsequently concatenate all E. coli genome sequence files to one multi-fasta/-genome file for the subsequent nucmer run. Additionally parse and associate the E. coli accession#s/unique IDs and descriptions (strain names ...) from the genomes | 
|  | 225 my $genome_file = 'ecoli_multi.fasta'; # multi-genome/-fasta file for the nucmer run, used as a temp file, will be deleted at the end of the script | 
|  | 226 open(GENOMES, ">$genome_file") or die "Failed to create file '$genome_file': $!\n"; | 
|  | 227 my $genome_number = 0; # used to control if lines in *.coords files correspond to the overall genome count | 
|  | 228 my %genome_desc; # hash to associate accession#/unique ID with genome descriptions | 
|  | 229 foreach my $file (@genome_files) { | 
|  | 230     $genome_number++; | 
|  | 231     open(MULTI, "<$file") or die "Failed to open E. coli genome file '$file': $!\n"; | 
|  | 232     my @multi = <MULTI>; # read the whole genome file (potential multi-fasta file) | 
|  | 233     my @IDs = grep(/^>/, @multi); # get ID lines in fasta file | 
|  | 234     if (scalar @IDs > 1) { # multi-fasta files | 
|  | 235     my $new_id; # new ID line for the multi-fasta file | 
|  | 236     foreach (@IDs) { # discard plasmid ID lines in complete genomes for new file (name with chromosome ID line) | 
|  | 237         if (!/plasmid/) { | 
|  | 238         chomp; | 
|  | 239         $new_id = $_; | 
|  | 240         last; # jump out of the loop if a non-plasmid ID found | 
|  | 241         } | 
|  | 242     } | 
|  | 243     if (!defined($new_id)) { | 
|  | 244         die "The file '$file' only contains plasmids, program exits!\n"; | 
|  | 245     } | 
|  | 246     my ($acc, $desc) = acc_desc($new_id); # subroutine to fill hash %genome_desc with acc#/unique ID and genome description | 
|  | 247     print GENOMES ">$acc $desc; artificial genome\n"; # print the now shortened ID-line for the new single-fasta entry in the result multi-genome file | 
|  | 248     foreach (@multi) { # print the rest of the new single-fasta file | 
|  | 249         if (/^>/) { # skip ID lines of the old multi-fasta file | 
|  | 250         next; | 
|  | 251         } | 
|  | 252         print GENOMES; | 
|  | 253     } | 
|  | 254     } elsif (scalar @IDs == 1) { # non-multi-fasta files | 
|  | 255     my ($acc, $desc) = acc_desc($IDs[0]); | 
|  | 256     print GENOMES ">$acc $desc\n"; # print the new shortened ID-line | 
|  | 257     foreach (@multi) { | 
|  | 258         if (/^>/) { # skip ID line, since new one already printed | 
|  | 259         next; | 
|  | 260         } | 
|  | 261         print GENOMES; | 
|  | 262     } | 
|  | 263     } else { # wrong fasta files | 
|  | 264     die "File '$file' does not include a fasta ID line, program exits!\n"; | 
|  | 265     } | 
|  | 266     print GENOMES "\n"; | 
|  | 267     close MULTI; | 
|  | 268 } | 
|  | 269 close GENOMES; | 
|  | 270 my @delete; # files to be deleted after program run | 
|  | 271 if (-e $genome_file) { | 
|  | 272     push (@delete, $genome_file); | 
|  | 273 } else { | 
|  | 274     die "The multi-fasta E. coli genome file \'$genome_file\' could not be created for the NUCmer run: $!\n"; | 
|  | 275 } | 
|  | 276 | 
|  | 277 | 
|  | 278 ### Parse file 'publicSTs.txt' to get additional info for each sequence type | 
|  | 279 my $st_file = 'publicSTs.txt'; | 
|  | 280 open(ST, "<$st_file")  or die "Failed to open sequence type file '$st_file': $!\n"; | 
|  | 281 my %seq_type; # hash to save ST info to each allele profile | 
|  | 282 my $header = <ST>; # get rid of file header | 
|  | 283 while (<ST>) { | 
|  | 284     chomp; | 
|  | 285     my @st = split (/\t/, $_); | 
|  | 286     my $profile = "$st[3] $st[4] $st[5] $st[6] $st[7] $st[8] $st[9]"; # allele profile | 
|  | 287     my $info = "$st[0]\t$st[1]\t$st[2]\t$st[10]\t$st[11]"; # associated info to allele profile | 
|  | 288     # $st[0] = ST, 1 = ST_COMPLEX, 2 = ANCESTRAL_GROUP, 3-9 = adk-recA alleles, 10 = SOURCE, 11 = REFSTRAIN | 
|  | 289     $seq_type{$profile} = $info; | 
|  | 290 } | 
|  | 291 close ST; | 
|  | 292 | 
|  | 293 | 
|  | 294 ### Run nucmer with allele sequences as REFERENCES and the created 'ecoli_multi.fasta' file as QUERY | 
|  | 295 my @created_files; # used to print out files that are created at the end of the script | 
|  | 296 my %coord_files; | 
|  | 297 foreach (keys %allele_files) { | 
|  | 298     system ("nucmer --prefix $_-all_ecoli $allele_files{$_} $genome_file"); # system call; seperate args not possible, probably because nucmer not a system command?! | 
|  | 299     system ("show-coords -lT $_-all_ecoli.delta > $_-all_ecoli.coords"); # '-l' include seq length in output, '-T' tab-separated output | 
|  | 300     if (-e "$_-all_ecoli.coords") { | 
|  | 301     $coord_files{$_} = "$_-all_ecoli.coords"; | 
|  | 302     push (@delete, "$_-all_ecoli.delta"); | 
|  | 303     push (@created_files, "\t\t\t$_-all_ecoli.coords\n"); | 
|  | 304     } else { | 
|  | 305     die "Coord file '$_-all_ecoli.coords' could not be created: $!\n"; | 
|  | 306     } | 
|  | 307 } | 
|  | 308 | 
|  | 309 | 
|  | 310 ### Parse *.coords nucmer result files for MLST alleles and corresponding accession#s | 
|  | 311 my @summary_err; # array to store error summary for error file | 
|  | 312 my @detail_err; # array to store the more detailed errors for error file | 
|  | 313 my $error = 0; # switches to '1' if an error is detected | 
|  | 314 my %strain_allele; # declare hash, that's subsequently used as 'hash in hash' in sub parse_coords to associate MLST alleles and accession#s | 
|  | 315 foreach (sort keys %coord_files) { | 
|  | 316     parse_coord($coord_files{$_}, $_); # subroutine to fill %strain_allele | 
|  | 317 } | 
|  | 318 | 
|  | 319 | 
|  | 320 ### Print the corresponding allele sequences and allele profile (plus additional info) for each E. coli genome | 
|  | 321 open(SEQ, ">$ecoli_allele_seq") or die "Failed to create file '$ecoli_allele_seq': $!\n"; | 
|  | 322 my $ecoli_allele_profile = 'ecoli_mlst_profile.txt'; # tab-separated result file for the allele profile of each E. coli genome plus additional info from file 'publicSTs.txt' from the Achtman MLST DB | 
|  | 323 open(PROFILE, ">$ecoli_allele_profile") or die "Failed to create file '$ecoli_allele_profile': $!\n"; | 
|  | 324 print PROFILE "Strain"; # header for profile file | 
|  | 325 foreach (sort @alleles) { | 
|  | 326     print PROFILE "\t$_"; | 
|  | 327 } | 
|  | 328 print PROFILE "\tST\tST_COMPLEX\tANCESTRAL_GROUP\tSOURCE\tREFSTRAIN\n"; # info from file 'publicSTs.txt' | 
|  | 329 foreach my $acc (sort {lc $genome_desc{$a} cmp lc $genome_desc{$b}} keys %genome_desc) { # call hash %genome_desc by keys (acc#s), but sort by values (genome desc.s) of the hash case-insensitively (all in lowercase, because Perl sorts lowercase after uppercase!) | 
|  | 330     print SEQ ">$genome_desc{$acc} $acc\n"; # print fasta ID line for 'ecoli_mlst_seq.fasta' | 
|  | 331     print PROFILE "$genome_desc{$acc}"; # print genome desc for strain in first column for 'ecoli_mlst_profile.txt' | 
|  | 332     my $profile = ''; # allele profile to extract info from %seq_type | 
|  | 333     foreach my $allele (sort keys %strain_allele) { | 
|  | 334     if (defined $strain_allele{$allele}->{$acc}) { | 
|  | 335         open(ALLELE, "<$allele_files{$allele}") or die "Failed to open allele file $allele_files{$allele}: $!\n"; | 
|  | 336         while (my $line = <ALLELE>) { # search for corresponding allele in multi-fasta allele file | 
|  | 337         chomp $line; | 
|  | 338         if ($line =~ /^>$strain_allele{$allele}->{$acc}$/i) { | 
|  | 339             $line = <ALLELE>; # don't need the ID line but the following seq lines | 
|  | 340             while ($line !~ /^>/) { # print sequence in result file until another '>' is found | 
|  | 341             chomp $line; | 
|  | 342             print SEQ "$line"; | 
|  | 343             $line = <ALLELE>; | 
|  | 344             } | 
|  | 345         } | 
|  | 346         } | 
|  | 347         close ALLELE; | 
|  | 348         $strain_allele{$allele}->{$acc} =~ s/^(\D+)(\d+)$/$2/; # only keep allele number | 
|  | 349         print PROFILE "\t$strain_allele{$allele}->{$acc}"; | 
|  | 350         $profile .= "$strain_allele{$allele}->{$acc} "; # concat allele numbers in $profile to extract info from %seq_type | 
|  | 351     } else { | 
|  | 352         print SEQ "XXX"; # place holder if allele of genome not determined | 
|  | 353         print PROFILE "\t?"; # place holder as well | 
|  | 354         unshift (@detail_err, "$genome_desc{$acc} didn't give a hit with $allele, marked by \'XXX\' in allele sequences and \'?\' in allele profile!\n"); | 
|  | 355         next; | 
|  | 356     } | 
|  | 357     } | 
|  | 358     print SEQ "\n\n"; # blank line in front of each ID line | 
|  | 359     chop $profile; # get rid of the last space, so it can be used as the key in %seq_type | 
|  | 360     if (defined $seq_type{$profile}) { # print ST and additional info from file 'publicST.txt' in 'ecoli_mlst_profile.txt' | 
|  | 361     print PROFILE "\t$seq_type{$profile}\n"; | 
|  | 362     } else { | 
|  | 363     print PROFILE "\t?\t?\t?\t?\t?\n"; | 
|  | 364     } | 
|  | 365 } | 
|  | 366 close SEQ; | 
|  | 367 close PROFILE; | 
|  | 368 if (-e $ecoli_allele_seq && $ecoli_allele_profile) { # push new created files in array for print out | 
|  | 369     push (@created_files, "\n\tResult files:\n\t\t\t$ecoli_allele_seq\t-> Allele sequences!\n", "\t\t\t$ecoli_allele_profile\t-> Allele profile plus info from 'publicSTs.txt'!\n"); | 
|  | 370 } else { | 
|  | 371     die "The result files $ecoli_allele_seq and $ecoli_allele_profile could not be created: $!\n"; | 
|  | 372 } | 
|  | 373 | 
|  | 374 | 
|  | 375 ### Print errors in error file, | 
|  | 376 if ($error == 1) { # error(s) occurred | 
|  | 377     my $err_file = 'errors.txt'; | 
|  | 378     open(ERR, ">$err_file") or die "Failed to create file '$err_file': $!\n"; | 
|  | 379     print ERR "Error summary:\n"; | 
|  | 380     print ERR @summary_err; # filled in sub 'parse_coords' | 
|  | 381     print ERR "\nDetailed error output:\n"; | 
|  | 382     print ERR @detail_err; | 
|  | 383     push (@created_files, "\t\t\t$err_file\t\t-> Error file!\n"); | 
|  | 384     close ERR; | 
|  | 385 } | 
|  | 386 | 
|  | 387 | 
|  | 388 ### Delete unneeded files, temp file 'ecoli_multi.fasta' (see above) and the *.delta files from the NUCmer run | 
|  | 389 foreach my $goners (@delete) { | 
|  | 390     unlink $goners or warn "Could not remove file '$goners': $!"; | 
|  | 391 } | 
|  | 392 | 
|  | 393 | 
|  | 394 ### Print newly created files | 
|  | 395 print "\n###########################################################################\n"; | 
|  | 396 print "The following files were created:\n"; | 
|  | 397 print "\tCoordinates of MLST alleles in each genome:\n"; | 
|  | 398 print @created_files; | 
|  | 399 print "\n###########################################################################\n"; | 
|  | 400 | 
|  | 401 | 
|  | 402 ### Align with ClustalW if option '-c' is given | 
|  | 403 if ($clustalw) { | 
|  | 404     print "\nStarting ClustalW alignment with file $ecoli_allele_seq ..."; | 
|  | 405     my $out = $ecoli_allele_seq; | 
|  | 406     $out =~ s/\.fasta$//; | 
|  | 407     my $clustal_call = "clustalw -infile=$ecoli_allele_seq -outfile=$clustal_aln -align -type=DNA -output=phylip -tree -newtree=$out\_aln.dnd -outputtree=phylip"; | 
|  | 408     system ($clustal_call); | 
|  | 409 } | 
|  | 410 | 
|  | 411 exit; | 
|  | 412 | 
|  | 413 | 
|  | 414 ############### | 
|  | 415 # Subroutines # | 
|  | 416 ############### | 
|  | 417 | 
|  | 418 ### Subroutine to associate the acc# to the genome description in hash %genome_desc (see above) | 
|  | 419 sub acc_desc { | 
|  | 420     my $ID = shift; | 
|  | 421     chomp $ID; | 
|  | 422     if ($ID =~ /^>gi\|\d+\|(emb|gb|dbj|ref)\|(\w+\d+\.\d)\|\s(.+)$/) { # NCBI fasta header | 
|  | 423     # e.g.: >gi|387605479|ref|NC_017626.1| Escherichia coli 042, complete genome | 
|  | 424     # $1 = DB (embl, genbank, ddbj, refseq), $2 = acc#, $3 = genome description | 
|  | 425     my $desc = shorten_desc($3); # subroutine to shorten the genome description | 
|  | 426     $genome_desc{$2} = $desc; # associate accession# with genome description | 
|  | 427     return ($2, $desc); | 
|  | 428     } elsif ($ID =~ /^>(\S+)\s*(\S*)\s*(.*)$/) { # headers after EMBOSS's union of multi-fasta files, and other headers with a unique ID after '>' | 
|  | 429     # e.g.: >NC_011748.1 NC_011748.1 Escherichia coli 55989 chromosome, complete genome | 
|  | 430     # $1 = acc#, $2 = genome desc for drafts|duplicated acc# for complete genomes, $3 = genome desc for complete genomes | 
|  | 431     my $desc = $2 . ' ' . $3; | 
|  | 432     if ($1 eq $2) { # complete genomes have acc# double after EMBOSS's union (see above) | 
|  | 433         $desc = $3; | 
|  | 434     } | 
|  | 435     $desc = shorten_desc($desc); | 
|  | 436     $genome_desc{$1} = $desc; | 
|  | 437     return ($1, $desc); | 
|  | 438     } | 
|  | 439     return 0; | 
|  | 440 } | 
|  | 441 | 
|  | 442 | 
|  | 443 ### Subroutine to parse *.coord nucmer result files for MLST alleles and corresponding accession#s | 
|  | 444 sub parse_coord { | 
|  | 445     my ($coord_file, $allele) = @_; | 
|  | 446     my $lines = 0; # lines of respective *.coords file (without header); control if hit is missing or insensitive hits are present in comparison to $genome_number for the error file | 
|  | 447     my $discarded = 0; # number of discarded hits/lines in coord file for error file | 
|  | 448     open (COORD, "$coord_file") or die "Failed to open file '$coord_file': $!\n"; | 
|  | 449     while (<COORD>) { | 
|  | 450         chomp; | 
|  | 451         if ( /\d+\t\d+\t\d+\t\d+\t(\d+)\t(\d+)\t(\d+\.\d+)\t(\d+)\t\d+\t(\D+\d+)\t(\w*\d+\.\d)$/ ) { | 
|  | 452         # since the ID lines of the fastas were shortened in temp file 'ecoli_multi.fasta', the acc#s should be the last element of each line in the coords file | 
|  | 453         # $1 = length of ref alignment, $2 = length of query alignment, $3 = identity percentage, | 
|  | 454         # $4 = length of ref seq, $5 = allele-nr. (e.g. ADK15), $6 = accession-nr. | 
|  | 455         $lines++; # after 'if' to skip headers | 
|  | 456             if ($1 != $4) { # write error to @detail_err for error file | 
|  | 457         push (@detail_err, "$genome_desc{$6}, $5: Reference (allele) alignment doesn't have a correct length, therefore allele is not included in output!\n"); | 
|  | 458         $discarded++; | 
|  | 459         next; | 
|  | 460             } elsif ($2 != $4) { | 
|  | 461         push (@detail_err, "$genome_desc{$6}, $5: Query (genome) alignment doesn't have a correct length, therefore allele is not included in output!\n"); | 
|  | 462         $discarded++; | 
|  | 463         next; | 
|  | 464             } elsif ($3 != '100.00') { | 
|  | 465         push (@detail_err, "$genome_desc{$6}, $5: Identity is not 100\%, therefore is not included in output!\n"); | 
|  | 466         $discarded++; | 
|  | 467         next; | 
|  | 468             } | 
|  | 469         $strain_allele{$allele}{$6} = $5; # associate allele# with acc# and store as hash in hash in %strain_allele | 
|  | 470     } | 
|  | 471     } | 
|  | 472     close COORD; | 
|  | 473     # error identifications for later print out in error.txt | 
|  | 474     if ($discarded == 0) { | 
|  | 475     if ($lines < $genome_number) { | 
|  | 476         $error = 1; # switch $error from 0 to 1 to indicate error was found | 
|  | 477         push (@summary_err, $genome_number - $lines, " $allele allele(s) missing!\n"); | 
|  | 478     } | 
|  | 479     } elsif ($discarded > 0) { | 
|  | 480     $error = 1; | 
|  | 481     if (($lines - $discarded) == $genome_number) { | 
|  | 482         push (@summary_err, "Total number of specific assigned $allele alleles is correct, but $discarded non-specific hit(s) discarded!\n"); | 
|  | 483     } elsif (($lines - $discarded) < $genome_number) { | 
|  | 484         push (@summary_err, $genome_number - ($lines - $discarded), " $allele allele(s) missing and $discarded non-specific hit(s)!\n"); | 
|  | 485     } | 
|  | 486     } | 
|  | 487     return 1; | 
|  | 488 } | 
|  | 489 | 
|  | 490 | 
|  | 491 ### Shorten the genome descriptions of the ID headers | 
|  | 492 sub shorten_desc { | 
|  | 493     my $desc = shift; | 
|  | 494     $desc =~ s/Escherichia coli/Ecoli/; | 
|  | 495     $desc =~ s/\'//g; | 
|  | 496     $desc =~ s/( DNA, complete genome| chromosome, complete genome|, complete genome| chromosome, complete sequence| complete genome|, complete sequence|, strain (\S+)|, whole genome shotgun sequence)$//; | 
|  | 497     $desc =~ s/\s/_/g; | 
|  | 498     return $desc; | 
|  | 499 } |