comparison getdata/genbankstrip.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 #!/usr/bin/perl -w
2 #
3 # Modified for Osiris by THO.
4 #
5 #
6 # GenBankStrip.pl v2.0
7 # Last modified July 25, 2005 14:29
8 # (c) Olaf R.P. Bininda-Emonds
9 #
10 # Input:
11 # 1) A GenBank output file
12 # 2) A file containing gene names, each on a separate line
13 #
14 # Generates:
15 # 1) A Se-Al formatted (and optionally a nexus-formatted) datafile containing stripped sequences
16 # 2) A summary file giving status of each entry in GenBank file
17 #
18 # Usage: GenBankStrip.pl -f<filename> [-g<filename>] [-k<number>] [-l<number>] [-o<n|s>] [-s] [-t] [-h] [-v]
19 # options: -f<filename> = file containing sequences in GenBank format
20 # -g<filename> = file containing specific genes to be stripped
21 # -i<genename> = strip a single gene
22 # -k<number> = number of (longest) sequences to retain per species for a given gene (default = all)
23 # -l<number> = minimum length required for all non-tRNA genes (default = none)
24 # -o<n|s> = provide output in nexus (n) phytab (p) and/or Se-Al (s) format in addition to fasta format
25 # -s = only process sequences with valid species names (i.e., no species with sp. or cf. in names)
26 # -t<g<number>|s<number>> = minimum number of sequences (g; default = 0) or species (s; default = 0) a gene must have to be retained; both can be specified simultaneously
27 # -h = print this message and quit
28 # -v = verbose output
29 # -a = Accession number appears first in stripped Fasta File -- Added by THO
30 use strict;
31
32 # Set default values
33 # Genes and sequences
34 my %synonym;
35 my %complement = ('A' => 'T', 'C' => 'G', 'G' => 'C', 'T' => 'A',
36 'M' => 'K', 'R' => 'Y', 'W' => 'S', 'S' => 'W', 'Y' => 'R', 'K' => 'M',
37 'B' => 'V', 'D' => 'H', 'H' => 'D', 'V' => 'B', 'N' => 'N', '-' => '-', '?' => '?');
38 my %maxLength;
39
40 # I/O
41 my $gbFile;
42 my ($countFile, $rejectFile, $stripFile);
43 my $geneFile;
44 my %userGene;
45 my $singleGene;
46 my $keepGene = 0;
47 my $minLength;
48 my $minLengthTRNA;
49 my $speciesBlock = 0;
50 my $seqThreshold = 0;
51 my $speciesThreshold = 0;
52 my $sealPrint = 0;
53 my $nexusPrint = 0;
54 my $phytabPrint = 0;
55
56 # Global stats
57 my (@globalGeneList, %globalGenePresent, %geneStatus, @rejectList, $stripCount);
58 my (@speciesList, %speciesPresent, %speciesGenePresent, %quickSpeciesCount, %speciesCount, %quickSequenceCount, %sequenceCount);
59 $sequenceCount{"all"} = 0;
60
61 # Miscellaneous
62 my $debug = 0;
63 my $verbose = 0;
64 my $accessionFirst = 0; #Added by THO as option to have accession # first in FASTA output
65
66 for (my $i = 0; $i <= $#ARGV; $i++)
67 {
68 #Original stripped out directory structure, required by Galaxy
69 # if ($ARGV[$i] =~ /^-f([\w+\.?\w+?]*)/)
70 if ($ARGV[$i] =~ /^-f(.+)/)
71 {
72 $gbFile = $1;
73
74 # (my $baseFile = $gbFile) =~ s/.\w+$//;
75 # $countFile .= $baseFile . "_geneCount.txt";
76 # $rejectFile .= $baseFile . "_gbs.rejectlist.txt";
77 # $stripFile .= $baseFile . "_gbs.striplist.txt";
78 # }
79 #for Galaxy, easier to have same file name each time it'r run
80 (my $baseFile = $gbFile) =~ s/.\w+$//;
81 $countFile .= "geneCount.txt";
82 $rejectFile .= "rejectlist.txt";
83 $stripFile .= "striplist.txt";
84 }
85 elsif ($ARGV[$i] =~ /^-g([\w+\.?\w+?]*)/)
86 {
87 $geneFile = $1;
88
89 #This is a hack to write the name of a single gene into a new genefile based on the Galaxy call
90
91 open (GENE,">$geneFile") or die "Cannot open file $geneFile containing gene names.\n";
92 print GENE "$geneFile\n";
93 close GENE;
94 }
95 elsif ($ARGV[$i] =~ /^-i([\w+\.?\w+?]*)/)
96 {
97 $singleGene = $1;
98 }
99 elsif ($ARGV[$i] =~ /^-k(\d+)/)
100 {
101 $keepGene = $1;
102 }
103 elsif ($ARGV[$i] =~ /^-l(\d+)/)
104 {
105 $minLength = $1;
106 $minLengthTRNA = 50;
107 }
108 elsif ($ARGV[$i] =~ /^-on/)
109 {
110 $nexusPrint = 1;
111 }
112 elsif ($ARGV[$i] =~ /^-op/)
113 {
114 $phytabPrint = 1;
115 }
116 elsif ($ARGV[$i] =~ /^-os/)
117 {
118 $sealPrint = 1;
119 }
120 elsif ($ARGV[$i] =~ /^-s/)
121 {
122 $speciesBlock = 1;
123 }
124 elsif ($ARGV[$i] =~ /^-tg(\d+)/)
125 {
126 $seqThreshold = $1;
127 }
128 elsif ($ARGV[$i] =~ /^-ts(\d+)/)
129 {
130 $speciesThreshold = $1;
131 }
132 elsif ($ARGV[$i] =~ /^-v/)
133 {
134 $verbose = 1;
135 }
136 elsif ($ARGV[$i] =~ /^-a/)
137 {
138 $accessionFirst = 1;
139 }
140 elsif ($ARGV[$i] =~ /^-x/)
141 {
142 $debug = 1;
143 $verbose = 1;
144 }
145 elsif ($ARGV[$i] =~ /^-h/)
146 {
147 print "Usage: GenBankStrip.pl -f<filename> [-g<filename>] [-k<number>] [-l<number>] [-o<n|s>] [-s] [-t] [-h] [-v]\n";
148 print "Options: -f<filename> = file containing sequences in GenBank format\n";
149 print " -g<filename> = file containing specific genes to be stripped\n";
150 print " -k<number> = number of (longest) sequences to retain per species for a given gene (default = all)\n";
151 print " -l<number> = minimum length required for all non-tRNA genes (default = none)\n";
152 print " -o<n|s> = provide output in nexus (n) phytab (p) and/or Se-Al (s) format in addition to fasta format\n";
153 print " -s = only process sequences for valid species names (i.e., no species with sp. or cf. in names)\n";
154 print " -t<g<number>|s<number>> = minimum number of sequences (g; default = 0) or species (s; default = 0)\n";
155 print " a gene must have to be retained; both can be specified simultaneously\n";
156 print " -h = print this message and quit\n";
157 print " -v = verbose output\n";
158 exit(0);
159 }
160 else
161 {
162 print "Don't understand argument: $ARGV[$i]\n";
163 print "Usage: GenBankStrip.pl -f<filename> [-g<filename>] [-k<number>] [-l<number>] [-o<n|s>] [-s] [-t] [-h] [-v]\n";
164 exit(1);
165 }
166 }
167
168 die "ERROR: Must supply name of GenBank output file using flag -f.\n" if (not $gbFile);
169
170 # Load in hardwired gene synonyms
171 geneSynonyms();
172
173 # Get list of target genes (if desired)
174 if ($geneFile)
175 {
176 my $userGeneCount = 0;
177 my %userGenePresent;
178 setLineBreak($geneFile);
179 open (GENE,"<$geneFile") or die "Cannot open file $geneFile containing gene names.\n";
180 print "Gene(s) to be stripped:\n";
181 while (<GENE>)
182 {
183 chomp;
184 next unless ($_);
185 my $gene = $_;
186 $gene = $synonym{$gene} if (defined $synonym{$gene});
187 $userGene{$gene} = 1;
188 unless ($userGenePresent{$gene})
189 {
190 $userGeneCount++;
191 $userGenePresent{$gene}++;
192 print "\t$gene\n";
193 }
194 }
195 close GENE;
196
197 # die "ERROR: No genes read in from file $geneFile\n";
198 }
199 #THO added -h command to easilly pass a single gene for Galaxy
200 if($singleGene)
201 {
202 my $userGeneCount = 1;
203 my %userGenePresent;
204 print "Gene(s) to be stripped:\n";
205 my $gene = $singleGene;
206 $gene = $synonym{$gene} if (defined $synonym{$gene});
207 print "\t$gene\n";
208 $geneFile = $singleGene;
209 }
210
211 # Print parameter summary
212 print "The following parameters have been set by the user:\n";
213 print "\tFile containing GenBank sequence data: $gbFile\n";
214 print "\tFile containing target genes to be stripped: $geneFile\n" if ($geneFile);
215 print "\tUser-defined constraints(s):\n";
216 print "\t\tNumber of sequences: $seqThreshold\n" if ($seqThreshold);
217 print "\t\tNumber of species: $speciesThreshold\n" if ($speciesThreshold);
218 print "\t\tMinimum sequence length: global - $minLength bp; tRNAs - $minLengthTRNA bp\n" if (defined $minLength);
219 print "\t\tOnly using species with valid names\n" if ($speciesBlock);
220 print "\t\tNone\n" if (not $seqThreshold and not $speciesThreshold and not defined $minLength and not $speciesBlock);
221 print "\tNumber of sequences to keep per species for each gene: ";
222 if ($keepGene)
223 {
224 print "$keepGene\n";
225 }
226 else
227 {
228 print "all\n";
229 }
230 print "\tOutput format(s): fasta";
231 print ", Se-Al" if ($sealPrint);
232 print ", nexus" if ($nexusPrint);
233 print ", phytab" if ($phytabPrint);
234 print "\n";
235
236 # Do quick gene count if thresholds indicated; takes longer but 1) saves many disk operations and 2) less memory intensive
237 geneCount($gbFile) if (not defined $geneFile and ($seqThreshold or $speciesThreshold)); # Don't bother if user gene list given; will usually be small enough so that benefits won't come into play
238
239 # Read in GenBank file and strip genes
240 my $stripZero = time;
241 print "\nProcessing GenBank file $gbFile ...\n";
242 setLineBreak($gbFile);
243 open (DATA, "<$gbFile") or die "Cannot open GenBank output file $gbFile\n";
244 my @allAccNum;
245 my ($accNum, %accRead, $duplEntry, $organism, %species, $geneName);
246 my $speciesFlag = 0;
247 my (%genePresent, @geneList);
248 my (@startList, @stopList, $complementFlag, $typeStatus, %pseudoStatus, %seqType, $joinLine, @joinSegments, %geneStart, %geneStop, %compStatus, $fullSeq);
249 my $nameFlag = 0;
250 my $joinFlag = 0;
251 my $pseudoFlag = 0;
252 my $readSeqs = 0;
253
254 while (<DATA>)
255 {
256 chomp;
257 my $readLine = $_;
258 next if ($readLine =~ /^\s*LOCUS/ or $readLine =~ /^\s*DEFINITION/ or $readLine =~ /^\s*VERSION/ or $readLine =~ /^\s*KEYWORDS/ or $readLine =~ /^\s*SOURCE/ or $readLine =~ /^\s*ORGANISM/ or $readLine =~ /^\s*REFERENCE/ or $readLine =~ /^\s*AUTHORS/ or $readLine =~ /^\s*TITLE/ or $readLine =~ /^\s*JOURNAL/ or $readLine =~ /^\s*MEDLINE/ or $readLine =~ /^\s*PUBMED/ or $readLine =~ /^\s*FEATURES/ or $readLine =~ /^\s*COMMENT/ or $readLine =~ /^\s*BASE COUNT/ or $readLine =~ /^\s*source/ or $readLine =~ /^\s*\/codon/ or $readLine =~ /^\s*\/transl/ or $readLine =~ /^\s*\/db_/ or $readLine =~ /^\s*CONTIG/);
259
260 # Get accession number
261 if ($readLine =~ /^\s*ACCESSION\s+(.+)/)
262 {
263 $accNum = $1;
264 print "$readLine\n" if ($debug);
265 # Clear variables
266 undef @geneList;
267 undef %genePresent;
268 undef $fullSeq;
269 undef @startList;
270 undef @stopList;
271 undef %geneStart;
272 undef %geneStop;
273 undef %pseudoStatus;
274 undef $geneName;
275 $speciesFlag = $nameFlag = $joinFlag = $pseudoFlag = $readSeqs = $duplEntry = 0;
276
277 if (not $accRead{$accNum}) # Check for duplicate entries
278 {
279 $accRead{$accNum} = 1;
280 push @allAccNum, $accNum;
281 if (scalar(@allAccNum) == int(scalar(@allAccNum)/10000)*10000 and $verbose)
282 {
283 print "\tSequences read in: ".scalar(@allAccNum)."\n";
284 }
285 print "\tAccession number: $accNum\n" if ($debug);
286 }
287 else
288 {
289 print "*****NOTE -- DUPLICATE ENTRY Accession $accNum\n";
290 $duplEntry = 1;
291 }
292 }
293
294 # Get organism name
295 if ($readLine =~ /^\s*\/organism=\"(.+)\"/)
296 {
297 $organism = $1;
298 $organism =~ s/\s/_/g;
299 print "$readLine\n" if ($debug);
300 $species{$accNum} = $organism;
301 $speciesFlag = 1 if ($organism =~ /sp\./ or $organism =~ /cf\./ or $organism =~ /_X_/i);
302 if ($debug)
303 {
304 print "\t\tOrganism: $organism";
305 print " (blocked)" if ($speciesFlag and $speciesBlock);
306 print "\n";
307 }
308 }
309 next if ($speciesFlag and $speciesBlock); # Entry pertains to invalid species name; skip parsing rest of entry
310
311
312 # Get gene boundaries; process previous set of CDs
313 if ($readLine =~ /\<?(\d+)\<?\.\.\>?(\d+)\>?/ or $joinFlag == 1 or $readLine =~ /^\s*ORIGIN/) # ORIGIN will process last set of CDs
314 {
315 next if ($readLine =~ /^\s+\/\w+/); # Prevents spurious matches with lines beginning with "/feature"
316 $readSeqs = 1 if ($readLine =~ /^\s*ORIGIN/); # Indicates that remaining lines will contain sequence information
317
318 # Process previous gene; need to do here to account for a posteriori declarations of pseudogene status
319 if ($geneName and $nameFlag == 2 and @startList and @stopList) # Process complete gene
320 {
321 print "\t\t\t\tParsed name: $geneName (type: $typeStatus)\n" if ($debug);
322
323 # Clean up gene name and misleading punctuation
324 $geneName = geneClean($geneName);
325 $pseudoStatus{$geneName} = 1 if ($pseudoFlag);
326
327 if (defined @ { $geneStart{$geneName} } and ((defined $pseudoStatus{$geneName} and $pseudoStatus{$geneName} == 1) or $typeStatus =~ /intron/i or $typeStatus =~ /UTR/i)) # Gene has previously stored CDs that might not have been recognized as a pseudogene or non-coding region
328 {
329 print "\t\t\t\t\tSubsequent occurrence of now recognized pseudogene or non-coding region; comparing new and stored CDs\n" if ($debug);
330 for (my $i = 0; $i < scalar(@startList); $i++) # Check each occurrence in new CDs for matches in stored ones
331 {
332 my $newStart = $startList[$i];
333 my $newStop = $stopList[$i];
334 print "\t\t\t\t\t\tChecking new CDs $newStart to $newStop\n" if ($debug);
335 for (my $j = 0; $j < scalar(@ { $geneStart{$geneName} }); $j++)
336 {
337 if ($newStart == $geneStart{$geneName}[$j] and $newStop == $geneStop{$geneName}[$j])
338 {
339 print "\t\t\t\t\t\t\tMatch with stored CDs (no. $j); deleted\n" if ($debug);
340 splice(@ { $geneStart{$geneName} }, $j, 1);
341 splice(@ { $geneStop{$geneName} }, $j, 1);
342 }
343 }
344 }
345 if ($debug)
346 {
347 print "\n\t\t\t\t\tCurrent gene boundaries after pseudogene / non-coding check (type = $seqType{$geneName}):\n";
348 if (scalar(@ { $geneStart{$geneName} }) < 1)
349 {
350 print "\t\t\t\t\t\tnone\n";
351 }
352 else
353 {
354 for (my $i = 0; $i < scalar(@ { $geneStart{$geneName} }); $i++)
355 {
356 print "\t\t\t\t\t\t$geneStart{$geneName}[$i]\t$geneStop{$geneName}[$i]\n";
357 }
358 }
359 }
360 }
361
362 # Only process coding regions of user-desired genes (if applicable), genes with sensible CDs, non-blocked genes, and genes that are not obvious singletons or pseudogenes
363 unless (($geneFile and not defined $userGene{$geneName}) or
364 (defined $geneStatus{$geneName} and $geneStatus{$geneName} eq "rejected") or
365 (defined $pseudoStatus{$geneName} and $pseudoStatus{$geneName} == 1) or
366 $geneName =~ /hypothetical/i or $geneName =~ /open reading frame/i or $geneName =~ /similar/i or $geneName =~ /homolog/i or $geneName =~ /putative/i or $geneName =~ /unknown/i or $geneName =~ /unnamed/i or $geneName =~ /\d+rik/i or $geneName =~ /possible/i or $geneName =~ /pseudo/i
367 or $typeStatus =~ /UTR/i or $typeStatus =~ /intron/i or $typeStatus =~ /misc_feature/i)
368 {
369 if (not defined $genePresent{$geneName}) # Process first occurrence of gene in entry
370 {
371 if ($debug)
372 {
373 print "\t\t\t\t\tFirst occurrence of $geneName\n" if ($debug);
374 for (my $i = 0; $i < scalar(@startList); $i++)
375 {
376 print "\t\t\t\t\t\t$startList[$i]\t$stopList[$i]\n";
377 }
378 }
379 $genePresent{$geneName} = 1;
380 push @geneList, $geneName;
381 push @ { $geneStart{$geneName} }, $_ foreach (@startList);
382 push @ { $geneStop{$geneName} }, $_ foreach (@stopList);
383
384 $seqType{$geneName} = $typeStatus;
385 $compStatus{$geneName} = 0; # Note whether gene is complemented or not
386 $compStatus{$geneName} = 1 if ($complementFlag);
387 }
388 else # Attempt to add secondary occurrences
389 {
390 my $storedSegments = scalar(@ { $geneStop{$geneName} }) - 1;
391 my $lastStop = $geneStop{$geneName}[$storedSegments];
392 $lastStop = 0 if (not defined $lastStop);
393 my $newStart = $startList[0];
394
395 if ($debug)
396 {
397 print "\t\t\t\t\tSecondary occurrence of $geneName with boundaries\n" if ($debug);
398 for (my $i = 0; $i < scalar(@startList); $i++)
399 {
400 print "\t\t\t\t\t\t$startList[$i]\t$stopList[$i]\n";
401 }
402 }
403 if ($seqType{$geneName} eq "gene" and $typeStatus ne "gene") # New information probably more precise and better accounts for structure
404 {
405 print "\n\t\t\t\t\t\tNew segment more precisely defined by type; replaced\n" if ($debug);
406 undef @ { $geneStart{$geneName} };
407 undef @ { $geneStop{$geneName} };
408 push @ { $geneStart{$geneName} }, $_ foreach (@startList);
409 push @ { $geneStop{$geneName} }, $_ foreach (@stopList);
410 $seqType{$geneName} = $typeStatus;
411 }
412
413 elsif ($newStart > $lastStop) # New segment occurs distal to last stored segment (could also be contiguous); append to boundaries
414 {
415 print "\n\t\t\t\t\t\tContiguous with or occurs after last stored segment; appended\n" if ($debug);
416 push @ { $geneStart{$geneName} }, $_ foreach (@startList);
417 push @ { $geneStop{$geneName} }, $_ foreach (@stopList);
418 $seqType{$geneName} = "composite";
419 }
420 elsif (scalar(@ { $geneStart{$geneName} }) == 1 and scalar(@startList) > 1) # Replace single stored segment with new segments derived from join statement; probably better accounts for intron/exon structure
421 {
422 print "\n\t\t\t\t\t\tNew segments subdivide single stored segment; replaced\n" if ($debug);
423 undef @ { $geneStart{$geneName} };
424 undef @ { $geneStop{$geneName} };
425 push @ { $geneStart{$geneName} }, $_ foreach (@startList);
426 push @ { $geneStop{$geneName} }, $_ foreach (@stopList);
427 $seqType{$geneName} = "composite";
428 }
429 else
430 {
431 print "\n\t\t\t\t\t\tNew segment overlaps stored segments; rejected\n" if ($debug);
432 }
433 }
434 if ($debug)
435 {
436 print "\n\t\t\t\t\tCurrent gene boundaries after CDS processing (type = $seqType{$geneName}):\n";
437 if (scalar(@ { $geneStart{$geneName} }) < 1)
438 {
439 print "\t\t\t\t\t\tnone\n";
440 }
441 else
442 {
443 for (my $i = 0; $i < scalar(@ { $geneStart{$geneName} }); $i++)
444 {
445 print "\t\t\t\t\t\t$geneStart{$geneName}[$i]\t$geneStop{$geneName}[$i]\n";
446 }
447 }
448 }
449 }
450
451 # Clear entries for gene
452 $nameFlag = $complementFlag = 0;
453 undef @startList; # Prevents double listing of genes if CDS use both /gene and /product tags
454 undef @stopList;
455 undef $typeStatus;
456 $pseudoStatus{$geneName} = 0; # Clear pseudogene status for previous set of CDs
457 }
458 next if ($readLine =~ /^\s*ORIGIN/); # No more CDs to worry about
459
460 print "$readLine\n" if ($debug);
461
462 # Reset gene name and information for current set of CDs
463 undef $geneName;
464 $nameFlag = $pseudoFlag = $complementFlag = 0;
465 $complementFlag = 1 if ($readLine =~ /complement/); #DISABLED BY THO BECAUSE ONLY COMPLEMENTS AND DOESN'T REVERSE
466
467 # Get new CD information
468 if ($readLine =~ /^\s+\S+\s+join\(/i or $readLine =~ /^\s+\S+\s+order\(/)
469 {
470 $joinFlag = 1;
471 undef @startList;
472 undef @stopList;
473 undef $joinLine;
474 }
475 if ($readLine =~ /^\s+(\S+)\s+/i)
476 {
477 $typeStatus = $1;
478 }
479
480 if ($joinFlag)
481 {
482 $joinLine .= $readLine;
483 if ($readLine =~ /\)$/) # Have accounted for multiline join statements; process
484 {
485 $joinFlag = 0;
486 $complementFlag = 1 if ($joinLine =~ /complement/);
487
488 # Clean up join statement
489 $joinLine =~ s/complement\(//;
490 $joinLine =~ s/>//g;
491 $joinLine =~ s/<//g;
492 $joinLine =~ s/^\s+\S+\s+join\(//;
493 $joinLine =~ s/^\s+\S+\s+order\(//;
494 $joinLine =~ s/\)+$//;
495 $joinLine =~ s/\s+//g;
496 print "\t\tJoin boundaries: $joinLine\n" if ($debug);
497 if ($joinLine =~ /[A-Z]+\d+/i) # Avoid join commands referring to separate accessions
498 {
499 undef $joinLine;
500 print "\t\t\tERROR: Joining accessions; skipping\n" if ($debug);
501 next;
502 }
503
504 # Derive pieces of join statement
505 @joinSegments = split(/,/, $joinLine);
506 foreach my $segment (@joinSegments)
507 {
508 my ($start, $stop) = split(/\.\./, $segment);
509 # Check safeties
510 next unless ($start and $stop);
511 next if ($start =~ /\D/ or $stop =~ /\D/);
512 next if ($start > $stop);
513 push @startList, $start;
514 push @stopList, $stop;
515 }
516
517 undef $joinLine;
518 $typeStatus = "join";
519 }
520 }
521 else
522 {
523 my ($start, $stop);
524 if ($readLine =~ /\<?(\d+)\<?\.\.\>?(\d+)\>?/)
525 {
526 $start = $1;
527 $stop = $2;
528 }
529 next unless ($start and $stop);
530 next if ($start > $stop);
531 undef @startList;
532 push @startList, $start;
533 undef @stopList;
534 push @stopList, $stop;
535 print "\t\tParsed boundaries: $1\t$2\n" if ($debug);
536 }
537 }
538
539
540 # Check for pseudogene status
541 if ($readLine =~ /\s+\/pseudo/ or $readLine =~ /pseudogene/ or $readLine =~ /putative/)
542 {
543 print "$readLine\n" if ($debug);
544 $pseudoStatus{$geneName} = 1 if (defined $geneName);
545 $pseudoFlag = 1;
546 print "\t\t\t\tWARNING: suspected pseudogene or of uncertain (\"putative\") status\n" if ($debug);
547 }
548
549 if ($readLine =~ /codon recognized/ and defined $geneName and $geneName =~ /trna/i)
550 {
551 print "$readLine\n" if ($debug);
552 $pseudoStatus{$geneName} = 1;
553 $pseudoFlag = 1;
554 print "\t\t\t\tWARNING: tRNA for an alternative codon; ignored\n" if ($debug);
555 }
556
557 # Get gene name
558 if ($readLine =~ /^\s*(\/gene)|(\/product)=\"(.+)/ and $nameFlag == 0) # Get start of gene name
559 {
560 $geneName = $1 if ($readLine =~ /=\"(.+)/);
561 print "$readLine\n" if ($debug);
562
563 # Check whether name wraps onto a new line
564 if (substr($readLine, -1) ne "\"")
565 {
566 $nameFlag = 1;
567 next;
568 }
569 else
570 {
571 $nameFlag = 2;
572 $geneName =~ s/\"$// if ($geneName =~ /\"$/);
573 }
574 }
575
576 if ($nameFlag == 1) # Get continuation / end of gene name
577 {
578 print "$readLine\n" if ($debug);
579 $geneName .= $readLine;
580 $nameFlag = 2 if ($readLine =~ /\"$/) # Gene name is complete
581 }
582
583 # Read in sequence information and append
584 if ($readLine =~ /^\s*\d*1\s\w+/ and @geneList and $readSeqs)
585 {
586 my $seqFrag = $readLine;
587 $seqFrag =~ s/\s+//g;
588 $seqFrag =~ s/\d+//g;
589 $fullSeq .= uc($seqFrag);
590 }
591
592 # End of entry; process
593 if ($readLine =~ /\/\//)
594 {
595 next if ($duplEntry);
596 next unless (@geneList and defined $fullSeq);
597 foreach my $gene (@geneList)
598 {
599 # Safeties; shouldn't come into play
600 next if ($geneFile and not defined $userGene{$gene});
601 next if (defined $geneStatus{$gene} and $geneStatus{$gene} eq "rejected");
602 next if (defined $pseudoStatus{$gene} and $pseudoStatus{$gene} == 1);
603
604 print "\n\t\tProcessing gene $gene\n" if ($debug);
605
606 # Strip gene out of full sequence and format as fasta
607 next unless (@ { $geneStart{$gene} } and @ { $geneStop{$gene} } );
608
609 my $geneSeq;
610 for (my $i = 0; $i < scalar(@ { $geneStart{$gene} }); $i++)
611 {
612 print "\t\t\tGetting sequence between positions $geneStart{$gene}[$i] and $geneStop{$gene}[$i]\n" if ($debug);
613 next if ($geneStart{$gene}[$i] > length($fullSeq) or $geneStop{$gene}[$i] > length($fullSeq));
614 my $geneSegment = substr($fullSeq,$geneStart{$gene}[$i]-1,$geneStop{$gene}[$i]-$geneStart{$gene}[$i]+1);
615 print "\t\t\t\t$geneSegment\n" if ($debug);
616 $geneSeq .= $geneSegment;
617 }
618
619 next unless ($geneSeq);
620 $geneSeq = complement($geneSeq) if ($compStatus{$gene});
621 $maxLength{$gene} = length($geneSeq) if (not defined $maxLength{$gene} or length($geneSeq) > $maxLength{$gene});
622
623 # Check if sequence matches length threshold (if appropriate)
624 if (defined $minLength)
625 {
626 if ($gene =~ /^trna-\w\w\w/)
627 {
628 if (length($geneSeq) < $minLengthTRNA)
629 {
630 printf "\t\t\tRejected: length (%s bp) does not meet tRNA length threshold ($minLengthTRNA bp)\n", length($geneSeq) if ($debug);
631 next;
632 }
633 }
634 else
635 {
636 if (length($geneSeq) < $minLength)
637 {
638 printf "\t\t\tRejected: length (%s bp) does not meet global threshold ($minLength bp)\n", length($geneSeq) if ($debug);
639 next;
640 }
641 }
642 }
643 if ($gene =~ /trna-\w\w\w/ and length($geneSeq) > 100)
644 {
645 printf "\t\t\tRejected: length of tRNA too long (%s bp); indicates parsing failure\n", length($geneSeq) if ($debug);
646 next;
647 }
648
649 my $breakPoint = 79;
650 until ($breakPoint > length($geneSeq))
651 {
652 my $replaceString = "\n" . substr($geneSeq, $breakPoint, 1);
653 substr($geneSeq, $breakPoint, 1) = $replaceString;
654 $breakPoint += 80;
655 }
656
657 # Append sequence to file
658 $geneStatus{$gene} = "stripped";
659 $sequenceCount{$gene}++;
660 $sequenceCount{$species{$accNum}}++;
661 $sequenceCount{"all"}++;
662 my $fastaFile = $gene."_gbs.fasta.txt";
663
664 my $IOicon = ">";
665 $IOicon .= ">" if ($sequenceCount{$gene} > 1);
666
667 unless ($debug)
668
669
670 {
671 open (GENE, $IOicon, $fastaFile) or die "Cannot open file $fastaFile to write sequence to.";
672 print GENE ">$species{$accNum} $accNum\n";
673 print GENE "$geneSeq\n";
674 close GENE;
675 }
676
677 # Update various statistical counters
678 unless ($globalGenePresent{$gene}) # Gene counter
679 {
680 push @globalGeneList, $gene;
681 $globalGenePresent{$gene} = 1;
682 }
683
684 unless ($speciesPresent{$species{$accNum}}) # Species counter
685 {
686 push @speciesList, $species{$accNum};
687 $speciesPresent{$species{$accNum}} = 1;
688 }
689
690 $speciesGenePresent{$gene}{$species{$accNum}}++; # Species-gene counter
691 $speciesCount{$gene}++ if ($speciesGenePresent{$gene}{$species{$accNum}} == 1);
692
693 # Print out summary stats for gene
694 if ($debug)
695 {
696 print "\t\t\tGene: $gene\n";
697 printf "\t\t\t\tGene boundaries (%s bp)", length($geneSeq) - ($geneSeq =~ tr/\n//);
698 print " (complemented)" if ($compStatus{$gene});
699 print ":\n";
700 for (my $i = 0; $i < scalar(@ { $geneStart{$gene} }); $i++)
701 {
702 print "\t\t\t\t\t$geneStart{$gene}[$i]\t$geneStop{$gene}[$i]\n";
703 }
704 print "\t\t\t\t\t$geneSeq\n";
705 }
706 }
707 }
708 }
709 close DATA;
710
711 # Print out summary stats
712 print "\n\tSummary stats:\n";
713 printf "\t\tTotal number of accessions processed: %s\n", scalar(@allAccNum);
714 printf "\t\tTotal number of unique species: %s\n", scalar(@speciesList);
715 printf "\t\tTotal number of unique sequences: %s\n", $sequenceCount{"all"};
716 printf "\t\tTotal number of unique genes: %s\n", scalar(@globalGeneList);
717
718 my $stripTime = time - $stripZero;
719 print "\n\t\tTime taken to process file: $stripTime seconds\n";
720
721 if (not @globalGeneList)
722 {
723 print "\nNOTE: No genes stripped from file $gbFile; exiting\n";
724 exit(0);
725 }
726
727 # Reprocess results to 1) recheck whether stripped genes actually fall below user-set thresholds (geneCount procedure is a rough maximum), 2) pare down to required number of sequences per species, and 3) produce appropriate output files
728 print "\nPostprocessing results ...\n";
729 my $postZero = time;
730
731 @globalGeneList = sort(@globalGeneList);
732 $stripCount = scalar(@globalGeneList);
733
734 foreach my $gene (@globalGeneList)
735 {
736 next if ($debug);
737 print "\tProcessing gene $gene ($sequenceCount{$gene} sequences for $speciesCount{$gene} species)\n" if ($verbose);
738 if ($sequenceCount{$gene} < $seqThreshold or $speciesCount{$gene} < $speciesThreshold) # Gene does not meet threshold requirements; reject
739 {
740 $geneStatus{$gene} = "rejected";
741 push @rejectList, $gene;
742 $stripCount--;
743 print "\t\tRejected; does not meet threshold requirements\n" if ($verbose);
744
745 # Remove associated file
746 my $rmString = $gene."_gbs.fasta.txt";
747 $rmString =~ s/\s/\\ /g;
748 $rmString =~ s/\(/\\(/g;
749 $rmString =~ s/\)/\\)/g;
750 $rmString =~ s/\'/\\'/g;
751 system ("rm $rmString");
752 }
753 else
754 {
755 my (%infLength, %lengthList, %criticalLength);
756 undef @speciesList;
757 $geneStatus{$gene} = "stripped";
758 if ($verbose)
759 {
760 print "\t\tStripped";
761 print "; meets threshold requirements" if ($seqThreshold or $speciesThreshold);
762 print "\n";
763 }
764
765 # Reload sequences from disk
766 my $inputFile = $gene."_gbs.fasta.txt";
767 setLineBreak($inputFile);
768 open (FASTA, "<$inputFile") or die "Cannot open file $inputFile to read data for gene $gene\n";
769 my (@accList, %speciesName, %fastaSeq, $species, $fastaAcc);
770
771 undef %speciesPresent;
772 while (<FASTA>)
773 {
774 chomp;
775 if ($_ =~ "^>")
776 {
777 ($species, $fastaAcc) = split;
778 $species =~ s/^>//;
779 $fastaAcc =~ s/^\(//;
780 $fastaAcc =~ s/\)$//;
781
782
783 push @accList, $fastaAcc;
784 $speciesName{$fastaAcc} = $species;
785 $speciesPresent{$species}++;
786 push @speciesList, $species if ($speciesPresent{$species} == 1);
787 }
788 else
789 {
790 $fastaSeq{$fastaAcc} .= $_;
791 }
792 }
793 close FASTA;
794 @accList = sort { $speciesName{$a} cmp $speciesName{$b} } keys %speciesName;
795
796 # Pare down to desired number of sequences as needed
797 if ($keepGene)
798 {
799 undef %lengthList;
800 print "\t\tParing down to $keepGene best lengths for each of $speciesCount{$gene} species ...\n" if ($verbose);
801 # Get informative length for each sequence
802 foreach my $entry (@accList)
803 {
804 $infLength{$entry} = ($fastaSeq{$entry} =~ tr/ACGT//);
805 push @ { $lengthList{$speciesName{$entry}} }, $infLength{$entry};
806 }
807
808 # Determine critical length and correct sequence count for number of deleted species
809 foreach my $species (@speciesList)
810 {
811 print "\t\t\tSpecies: $species ($speciesPresent{$species} sequences)\n" if ($debug);
812 next if ($speciesPresent{$species} <= $keepGene); # Only process species for which gene has been sampled more than threshold
813 @ { $lengthList{$species} } = sort {$b <=> $a} @ { $lengthList{$species} }; # Sort in descending order and get critical length
814 $criticalLength{$species} = $lengthList{$species}[$keepGene-1];
815
816 # Correct sequence count
817 my $i = $keepGene;
818 $i++ until ($i >= $speciesPresent{$species} or $lengthList{$species}[$i] < $criticalLength{$species});
819 $sequenceCount{$gene} -= (scalar(@ { $lengthList{$species} }) - $i);
820
821 if ($debug)
822 {
823 print "\t\t\t\tCritical length: $criticalLength{$species}\n";
824 printf "\t\t\t\tNumber of sequences deleted: %s\n", scalar(@ { $lengthList{$species} }) - $i;
825 print "\t\t\t\tTotal number of sequences for gene remaining: $sequenceCount{$gene}\n";
826 }
827 }
828
829 print "\t\t\tFinal count: $sequenceCount{$gene} sequences\n" if ($verbose);
830
831 # Recheck if paring has dropped sequence below sequence threshold
832 if ($sequenceCount{$gene} < $seqThreshold)
833 {
834 print "\t\t\tNOTE: Gene ($sequenceCount{$gene} sequences) now falls below threshold of $seqThreshold sequences; no file created\n" if ($verbose);
835 $geneStatus{$gene} = "rejected";
836 push @rejectList, $gene;
837 $stripCount--;
838
839 # Remove associated file
840 my $rmString = $gene."_gbs.fasta.txt";
841 $rmString =~ s/\s/\\ /g;
842 $rmString =~ s/\(/\\(/g;
843 $rmString =~ s/\)/\\)/g;
844 $rmString =~ s/\'/\\'/g;
845 system ("rm $rmString");
846
847 next;
848 }
849 }
850
851 # Produce appropriate output files
852 print "\t\tSaving output ...\n" if ($verbose);
853 my %printCount;
854
855 # Open files as needed
856 my $sealFile = $gene."_gbs.seal";
857 open (SEAL, ">$sealFile") or die "Cannot open Se-Al formatted file $sealFile for writing\n" if ($sealPrint);
858 my $nexusFile = $gene."_gbs.nex";
859 open (NEX, ">$nexusFile") or die "Cannot open nexus-formatted file $nexusFile for writing\n" if ($nexusPrint);
860 # my $phytabFile = $gene."_gbs.phytab";
861 # open (PHYTAB, ">$phytabFile") or die "Cannot open nexus-formatted file $phytabFile for writing\n" if ($phytabPrint);
862 #Currently assumes that only 1 file will be written for ease of Galaxy implementation
863 my $phytabFile = "osiris_gbs.phytab";
864 open (PHYTAB, ">$phytabFile") or die "Cannot open nexus-formatted file $phytabFile for writing\n" if ($phytabPrint);
865 my $fastaFile = $gene."_gbs.fasta.txt";
866 $fastaFile = $gene."_gbs.new.fasta.txt" if ($debug);
867 open (FASTA, ">$fastaFile") or die "Cannot open fasta-formatted file $fastaFile for writing\n";
868
869 # Print headers
870 if ($sealPrint)
871 {
872 print SEAL "Database={\n";
873 print SEAL "\tID='MLst';\n";
874 print SEAL "\tOwner=null;\n";
875 print SEAL "\tName=null;\n";
876 print SEAL "\tDescription=null;\n";
877 print SEAL "\tFlags=0;\n";
878 print SEAL "\tCount=2;\n";
879 print SEAL "\t{\n\t\t{\n";
880
881 print SEAL "\t\t\tID='PAli';\n";
882 print SEAL "\t\t\tOwner=1;\n";
883 printf SEAL "\t\t\tName=\"%sgbs\";\n", $gene;
884 print SEAL "\t\t\tDescription=null;\n";
885 print SEAL "\t\t\tFlags=0;\n";
886 print SEAL "\t\t\tNumSites=$maxLength{$gene};\n";
887 print SEAL "\t\t\tType=\"Nucleotide\";\n";
888 print SEAL "\t\t\tFeatures=null;\n";
889 print SEAL "\t\t\tColourMode=1;\n";
890 print SEAL "\t\t\tLabelMode=0;\n";
891 print SEAL "\t\t\ttriplets=false;\n";
892 print SEAL "\t\t\tinverse=true;\n";
893 printf SEAL "\t\t\tCount=%s;\n", $sequenceCount{$gene};
894 print SEAL "\t\t\t{\n";
895 }
896
897 if ($nexusPrint)
898 {
899 print NEX "#NEXUS\n\n";
900 print NEX "Begin data;\n";
901 printf NEX "\tDimensions ntax=%s nchar=%s;\n", $sequenceCount{$gene}, $maxLength{$gene};
902 print NEX "\tFormat datatype=nucleotide gap=-;\n\n";
903 print NEX "\tmatrix\n\n";
904 }
905
906 # Print sequence data
907 my $i = 0;
908 foreach my $entry (@accList)
909 {
910 next if (defined $criticalLength{$speciesName{$entry}} and $infLength{$entry} < $criticalLength{$speciesName{$entry}});
911
912 $printCount{$speciesName{$entry}}++;
913 $speciesName{$entry} .= "_$printCount{$speciesName{$entry}}" if ($printCount{$speciesName{$entry}} > 1);
914
915 if ($sealPrint)
916 {
917 $i++;
918 print SEAL "\t\t\t\t{\n";
919 print SEAL "\t\t\t\t\tID='PSeq';\n";
920 print SEAL "\t\t\t\t\tOwner=2;\n";
921 print SEAL "\t\t\t\t\tName=\"$speciesName{$entry}\";\n";
922 print SEAL "\t\t\t\t\tDescription=null;\n";
923 print SEAL "\t\t\t\t\tFlags=0;\n";
924 print SEAL "\t\t\t\t\tAccession=\"$entry\";\n";
925 print SEAL "\t\t\t\t\tType=\"DNA\";\n";
926 printf SEAL "\t\t\t\t\tLength=%s;\n", length($fastaSeq{$entry});
927 print SEAL "\t\t\t\t\tSequence=\"$fastaSeq{$entry}\";\n";
928 print SEAL "\t\t\t\t\tGeneticCode=1;\n";
929 print SEAL "\t\t\t\t\tCodeTable=null;\n";
930 print SEAL "\t\t\t\t\tFrame=1;\n";
931 print SEAL "\t\t\t\t\tFeatures=null;\n";
932 print SEAL "\t\t\t\t\tParent=null;\n";
933 print SEAL "\t\t\t\t\tComplemented=false;\n";
934 print SEAL "\t\t\t\t\tReversed=false;\n";
935 print SEAL "\t\t\t\t}";
936 print SEAL "," unless ($i == $sequenceCount{$gene});
937 print SEAL "\n";
938 }
939
940 if ($nexusPrint)
941 {
942 my $gaps = "-" x ($maxLength{$gene} - length($fastaSeq{$entry}));
943 print NEX "$speciesName{$entry}\t$fastaSeq{$entry}$gaps\n";
944 }
945 if($phytabPrint)
946 {
947 print PHYTAB "$speciesName{$entry} \t$gene \t $entry \t $fastaSeq{$entry} \n"
948 }
949
950 my $breakPoint = 79;
951 until ($breakPoint > length($fastaSeq{$entry}))
952 {
953 my $replaceString = "\n" . substr($fastaSeq{$entry}, $breakPoint, 1);
954 substr($fastaSeq{$entry}, $breakPoint, 1) = $replaceString;
955 $breakPoint += 80;
956 }
957 if ($accessionFirst) #ADDED BY THO
958 {
959 print FASTA ">$entry", "\___", "$speciesName{$entry}\n";
960 print FASTA "$fastaSeq{$entry}\n";
961 }
962 else
963 {
964 print FASTA ">$speciesName{$entry} ($entry)\n";
965 print FASTA "$fastaSeq{$entry}\n";
966 }
967 }
968 # Print footers
969 if ($sealPrint)
970 {
971 print SEAL "\t\t\t};\n";
972 print SEAL "\t\t},\n";
973 print SEAL "\t\t{\n";
974 print SEAL "\t\t\tID='MCoL';\n";
975 print SEAL "\t\t\tOwner=1;\n";
976 print SEAL "\t\t\tName=\"Genetic Codes\";\n";
977 print SEAL "\t\t\tDescription=\"Custom Genetic Codes\";\n";
978 print SEAL "\t\t\tFlags=0;\n";
979 print SEAL "\t\t\tCount=0;\n";
980 print SEAL "\t\t}\n";
981 print SEAL "\t};\n";
982 print SEAL "};\n";
983 }
984 if ($nexusPrint)
985 {
986 print NEX "\t;\nend;\n";
987 }
988
989 # Close files as appropriate
990 close SEAL if ($sealPrint);
991 close NEX if ($nexusPrint);
992 close FASTA if ($keepGene);
993 }
994 }
995
996 my $postTime = time - $postZero;
997 print "\n" if ($verbose);
998 print "\tTime taken: $postTime seconds\n";
999
1000 # Print out final summary stats / files
1001 print "\nFinal summary statistics\n";
1002
1003 # Print file of stripped genes
1004 open (STRIP, ">$stripFile") or die "Cannot write summary of stripped genes to $stripFile.\n";
1005 print STRIP "The following $stripCount genes were stripped successfully from file $gbFile\n";
1006 print STRIP "\nGene\tNo. of sequences\tNo. of species\n\n";
1007
1008 foreach my $gene (@globalGeneList)
1009 {
1010 print STRIP "$gene\t$sequenceCount{$gene}\t$speciesCount{$gene}\n" unless ($geneStatus{$gene} eq "rejected");
1011 }
1012 close STRIP;
1013
1014 print "\tSummaries of $stripCount genes that were stripped from $gbFile have been written to $stripFile\n";
1015
1016 # Print file of rejected genes
1017 if (@rejectList)
1018 {
1019 @rejectList = sort @rejectList;
1020 open (REJECT, ">$rejectFile") or die "Cannot write summary of rejected genes to $rejectFile.\n";
1021 printf REJECT "The following %s genes do not meet user-defined threshold(s) of", scalar(@rejectList);
1022 print REJECT " $seqThreshold sequences" if ($seqThreshold);
1023 print REJECT " or" if ($seqThreshold and $speciesThreshold);
1024 print REJECT " $speciesThreshold species" if ($speciesThreshold);
1025 print REJECT "; an asterisk indicates that values given are rough maximums\n";
1026 print REJECT "\nGene\tNo. of sequences\tNo. of species\n\n";
1027
1028 foreach my $gene (@rejectList)
1029 {
1030 print REJECT "$gene\t";
1031 if (defined $sequenceCount{$gene}) # Need to do alternative versions depending on whether gene was properly counted or not
1032 {
1033 print REJECT "$sequenceCount{$gene}\t";
1034 }
1035 else
1036 {
1037 print REJECT "$quickSequenceCount{$gene} \*\t";
1038 }
1039 if (defined $speciesCount{$gene})
1040 {
1041 print REJECT "$speciesCount{$gene}\n";
1042 }
1043 else
1044 {
1045 print REJECT "$quickSpeciesCount{$gene} \*\n";
1046 }
1047 }
1048 close REJECT;
1049
1050 printf "\tSummaries of %s genes that did not meet threshold(s) have been written to $rejectFile\n", scalar(@rejectList);
1051 }
1052
1053 if ($debug)
1054 {
1055 print "\n\tSummary stats for each processed gene:\n";
1056 foreach my $gene (@globalGeneList)
1057 {
1058 print "\t\tGene: $gene ($geneStatus{$gene})\n";
1059 print "\t\t\tTotal number of sequences: $sequenceCount{$gene}\n";
1060 print "\t\t\tTotal number of unique species: $speciesCount{$gene}\n";
1061 }
1062 }
1063
1064 exit(0);
1065
1066 # Subroutines used in the script
1067
1068 sub setLineBreak # Check line breaks of input files and set input record separator accordingly
1069 {
1070 my $gbFile = shift;
1071 $/ ="\n";
1072 open (IN, "<$gbFile") or die "Cannot open $gbFile to check form of line breaks.\n";
1073 while (<IN>)
1074 {
1075 if ($_ =~ /\r\n/)
1076 {
1077 print "\tDOS line breaks detected ...\n" if ($debug);
1078 $/ ="\r\n";
1079 last;
1080 }
1081 elsif ($_ =~ /\r/)
1082 {
1083 print "\tMac line breaks detected ...\n" if ($debug);
1084 $/ ="\r";
1085 last;
1086 }
1087 else
1088 {
1089 print "\tUnix line breaks detected ...\n" if ($debug);
1090 $/ ="\n";
1091 last;
1092 }
1093 }
1094 close IN;
1095 }
1096
1097 sub complement # Outputs complementary sequence to one provided
1098 {
1099 my $tempSeq = shift;
1100
1101 my $compSeq;
1102 for (my $nt = 0; $nt < length($tempSeq); $nt++)
1103 {
1104 if (not defined $complement{substr($tempSeq, $nt, 1)})
1105 {
1106 $compSeq .= "?";
1107 }
1108 else
1109 {
1110 $compSeq .= $complement{substr($tempSeq, $nt, 1)};
1111 }
1112 }
1113 my $revCompSeq = reverse($compSeq);
1114 return $revCompSeq;
1115 }
1116
1117 sub geneSynonyms # Define gene synonyms; originally compiled by Robin Beck
1118 {
1119 #Opsin gene added by THO
1120 $synonym{$_} = "opsin" foreach ("opsin", "anceropsin", "blop", "blue-sensitive_opsin", "blue-sensitive_opsin_precursor", "blue-sensitive_rhodopsin", "blue-sensitive_visual_pigment", "blue_opsin", "bluerh", "boceropsin", "buvops", "compound_eye_opsin_bcrh1", "compound_eye_opsin_bcrh2", "lateral_eye_opsin", "locus_opsin_1", "locust_opsin_2", "long-wavelenght_opsin", "long-wavelength_like_opsin", "long-wavelength_opsin", "long-wavelength_rhodopsin", "long-wavelength_sensitive_opsin_1", "long-wavelength_sensitive_opsin_2", "long_wave_opsin", "long_wavelength-sensitive_opsin", "long_wavelength-sensitive_rhodopsin", "long_wavelength-sensitive_visual_pigment", "long_wavelength_opsin", "long_wavelength_sensitive_opsin_1", "long_wavelength_sensitive_opsin_2", "lop2", "lw_opsin",
1121 "ocellar_opsin", "opsin", "opsin_2", "opsin_bcrh1", "opsin_bcrh2",
1122 "opsin_rh1", "opsin_rh3", "opsin_rh4", "piceropsin", "pteropsin",
1123 "rh1_opsin", "rh2_opsin", "rh3_opsin", "rh4_opsin", "rh6_rhodopsin",
1124 "rhodopsin", "rhodopsin_1", "rhodopsin_2_cg16740-pa", "rhodopsin_3",
1125 "rhodopsin_3_cg10888-pa", "rhodopsin_4", "rhodopsin_4_cg9668-pa",
1126 "rhodopsin_5", "rhodopsin_5_cg5279-pa", "rhodopsin_6",
1127 "rhodopsin_6_cg5192-pb", "rhodopsin_7_cg5638-pa",
1128 "rhodopsin_long-wavelength", "short_wavelength-sensitive_opsin",
1129 "ultraviolet-sensitive_opsin", "ultraviolet-sensitive_rhodopsin",
1130 "ultraviolet-sensitive_visual_pigment", "ultraviolet_sensitive_opsin",
1131 "uv-sensitive_opsin", "uv-wavelength_like_opsin", "uv_opsin", "uvop",
1132 "uvrh", "uvrh1", "amblop", "amuvop",
1133 "lwrh", "lwrh1", "lwrh2", "lw", "lw-rh", "lw_rh", "ninae", "ninae-pa",
1134 "op", "ops", "ops1", "opsin_1", "opsin_3", "rh", "rh1", "rh2", "rh3",
1135 "rh2-pa", "rh3-pa", "rh4", "rh4-pa", "rh5", "rh6", "rh7", "rho",
1136 "visual_pigment", "long-wavelength_rodopsin",
1137 "long_wavelength_rhodopsin", "short-wavelength_rhodopsin");
1138
1139 #Chloroplast genes added by THO***
1140 $synonym{$_} = "rbcl" foreach ("rbcl", "large_subunit_of_riblose-15-bisphosphate_carboxylase-oxygenase","larger_subunit_of_rubisco",
1141 "rubisco_large_subunit", "ribulose_15-bisphosphate_carboxylase_large_subunit", "ribulosebiphosphate_carboxylase_large_subunit");
1142 $synonym{$_} = "matk" foreach ("matk", "maturase_k", "maturase");
1143
1144 # Mitochondrial genes
1145 $synonym{$_} = "mtatp6" foreach ("mtatp6", "atp synthase 6", "atp synthase a chain", "atp6", "mt-atp6",
1146 "atpase subunit 6", "atpase6", "atpase 6", "atpase6 protein", "atp6",
1147 "atp synthase f0 subunit 6", "atp synthase a chain protein 6",
1148 "atp synthase subunit 6", "atpase6", "atp6", "atp sythase subunit 6",
1149 "atpase subunit 6, atpase6", "f0-atpase subunit 6", "f1atpase subunit 6",
1150 "f1-atpase subunit 6", "atpase 6", "atpase 6 gene", "atpase subunit 6 (atp6)",
1151 "atpase subunit 6 (atpase6)");
1152 $synonym{$_} = "mtatp8" foreach ("mtatp8", "atp synthase 8", "atp synthase protein 8", "atpase subunit 8",
1153 "a6l", "atp8", "mt-atp8", "atpase subunit 8", "atpase8", "atpase 8",
1154 "atpase8 protein", "atp8", "atp synthase f0 subunit 8", "atp synthase protein 8",
1155 "atpase-8", "atp synthase subunit 8", "atpase8", "atp8", "atp sythase subunit 8",
1156 "atpase subunit8", "f0-atpase subunit 8", "f1 atpase subunit 8", "f1-atpase subunit 8",
1157 "protein a6l", "atpase 8", "atpase subunit 8 (atp8)", "atpase subunit 8 (atpase8)",
1158 "atpase 8 gene", "atpase subunit 8 (atp8)", "atpase subunit 8 (atpase8)");
1159 $synonym{$_} = "mtco1" foreach ("mtco1", "cytochrome c oxidase i", "coi", "mt-co1", "co i", "ccoi", "cox 1", "cox i",
1160 "coi", "cytochrome c oxidase subunit i", "cytochrome oxidase subunit i", "cox1",
1161 "cytochrome c oxidase subunit 1", "cytochrome oxidase subunit 1", "co1",
1162 "cytochrome c oxidase polypeptide i", "cytochrome oxidase i", "cox-1",
1163 "cytochrome oxidase c subunit 1", "cox1", "co i", "co1", "cox 1", "coxi",
1164 "cytochrome c oxidase polypeptide i", "cytochrome c-oxidase subunit 1",
1165 "cytochrome oxidase c subunit i", "cytochrome-c oxidase i", "cytochrome c1 mrna",
1166 "cytochrome c1", "cytochrome c oxidase subunit 1 (coxi)", "cytochrome c oxidase chain i",
1167 "cytochrome c1 (aa 1-241)", "cytochrome c oxidase polypeptide 1",
1168 "cytochrome c oxidase subunit 1 (coi)", "cytochrome c-1");
1169 $synonym{$_} = "mtco2" foreach ("mtco2", "cytochrome c oxidase ii", "cytochrome c oxidase polypeptide ii", "coii",
1170 "mt-co2", "coii", "cytochrome c oxidase subunit ii", "cytochrome oxidase subunit ii",
1171 "cytochrome oxidase subunit 2", "cytochrome c oxidase subunit 2",
1172 "cytochrome c oxidase ii", "cox2", "co2", "cytochrome c oxidase polypeptide ii",
1173 "cytochrome oxidase ii", "cox2", "cytochrome oxidase c subunit 2",
1174 "cytochrome-c oxidase", "co ii", "co2", "cox 2", "cytochrome c oxidase polypeptide ii",
1175 "cytochrome c-oxidase subunit 2", "cytochrome oxidase c subunit ii",
1176 "cytochrome oxidase subunit2", "cytochrome-c oxidase ii", "cytochrome c oxidase subunit 2 (coxii)",
1177 "cytochrome c oxidase subunit 2 (coii)", "cytochrome c oxidase chain ii",
1178 "cytochrome c oxidase polypeptide 2", "cytochrome c oxidase ii subunit",
1179 "cytochrome c oxidase ii mrna");
1180 $synonym{$_} = "mtco3" foreach ("mtco3", "cytochrome c oxidase iii", "coiii", "mt-co3", "co iii", "ccoiii", "cox3",
1181 "cox iii", "cytochrome oxidase subunit iii", "coiii", "cox iii",
1182 "cytochrome c oxidase subunit iii", "cytochrome oxidase subunit 3", "cox3",
1183 "cytochrome c oxidase subunit 3", "co3", "cytochrome c oxidase polypeptide iii",
1184 "cox3", "cytochrome oxidase c subunit 3", "cytochrome oxidase iii",
1185 "cytochrome c oxidase polypeptide iii", "co iii", "coiii", "coiii protein", "coxiii",
1186 "cytochrome c oxidase subunit iii, coiii", "cytochrome c-oxidase subunit 3",
1187 "cytochrome c-oxidase subunit three", "cytochrome oxidase c subunit iii",
1188 "cytochrome-c oxidase iii", "cytochrme c oxidase iii", "cytochrome c oxidase subunit 3 (coiii)",
1189 "cytochrome oxidase subunit iii type 2");
1190 $synonym{$_} = "mtnd1" foreach ("mtnd1", "nd1", "nadh dehydrogenase 1", "nadh dehydrogenase, subunit 1 (complex i)",
1191 "mt-nd1", "nadh-ubiquinone oxidoreductase chain 1", "nd1", "nadh1", "nd1",
1192 "nadh-ubiquinone oxidoreductase chain 1", "nadh-ubiquinone oxidoreductase subunit 1",
1193 "nadh dehydrogenase i", "nadh dehydrogenase 1", "nadh subunit 1",
1194 "nadh dehydrogenase subnuit 1", "nadh1", "nadh1 protein",
1195 "nadh-ubiquinone oxidoreductase subunit i", "nadh dehydrogenase subunit 1",
1196 "nadh dehydrogenase subunit 1 (nd1)", "nadh dehydrogenase subunit 1 type 2",
1197 "nadh dehydrogenase subunit 1 type 1", "nadh dehydrogenase subunit i", "nadh dehydrogenase i",
1198 "nadh dehydrogenase subnit 1", "nadh dehydrogenase ubiquinone 1 alpha",
1199 "nadh dehydrogenase subunit i");
1200 $synonym{$_} = "mtnd2" foreach ("ndh-u1", "mtnd2", "nadh dehydrogenase 2", "nadh dehydrogenase, subunit 2 (complex i)",
1201 "nadh-ubiquinone oxidoreductase chain 2", "nd2", "mt-nd2", "urf2",
1202 "nadh dehydrogenase subunit 2", "nd2", "nadh2",
1203 "nadh-ubiquinone oxidoreductase chain 2", "nadh dehydrogenase 2", "nadh subunit 2",
1204 "nadh-ubiquinone oxidoreductase subunit 2", "nadh dehydrogenase subnuit 2",
1205 "nadh deydrogenase subunit 2", "nadh2", "nadh-ubiquinone oxidoreductase subunit ii",
1206 "nd2", "nadh2 protein", "nadh dehydrogenase subunit 2 (nd2)", "nadh dehydrogenase subunit ii",
1207 "nadh dehydrogenase ii", "nadh dehydrogense 2", "nadh dehydrogenase subunit ii");
1208 $synonym{$_} = "mtnd3" foreach ("mtnd3", "ndh-u3", "nadh dehydrogenase 3", "nd3", "nadh3",
1209 "nadh-ubiquinone oxidoreductase chain 3",
1210 "nadh-ubiquinone/plastoquinone oxidoreductase chain 3", "mt-md3", "urf3",
1211 "nadh dehydrogenase subunit 3", "nd3", "nadh3",
1212 "nadh-ubiquinone oxidoreductase chain 3", "nadh subunit 3", "nadh3", "nadh3 protein",
1213 "nadh-ubiquinone oxidoreductase subunit 3", "nadh dehydrogenase subnuit 3",
1214 "nadh-ubiquinone oxidoreductase subunit iii", "nadh3 protein",
1215 "nadh dehydrogenase subunit 3 (nd3)", "nadh dehydrogenase subunit iii", "nadh dehydrogenase iii",
1216 "nadh dehydrogenase subunit iii");
1217 $synonym{$_} = "mtnd4" foreach ("mtnd4", "ndh-u4", "nadh dehydrogenase 4", "nadh-ubiquinone oxidoreductase chain 4", "mt-nd4",
1218 "nd4", "urf4", "nadh:ubiquinone oxidoreductase subunit 4 (chain m)",
1219 "nadh dehydrogenase subunit 4", "nd4", "nadh4",
1220 "nadh-ubiquinone oxidoreductase chain 4", "nadh subunit 4", "nadh4",
1221 "nadh-ubiquinone oxidoreductase subunit 4", "nadh dehydrogenase subunit4",
1222 "nadh-ubiquinone oxidoreductase subunit iv", "nadh4 protein",
1223 "nadh dehydrogenase subunit 4 (nd4)", "nadh dehydrogenase subunit iv", "nadh dehydrogenase iv",
1224 "nadh dehydrogenase subunit iv");
1225 $synonym{$_} = "mtnd4l" foreach ("mtnd4l", "ndh-u4l", "nadh dehydrogenase 4l", "nadh dehydrogenase, subunit 4l (complex i)",
1226 "nadh-ubiquinone oxidoreductase chain 4l", "nd4l", "mt-md4l", "urf4l",
1227 "nadh dehydrogenase subunit 4l", "nd4l", "nadh4l", "nadh-ubiquinone oxidoreductase chain 4l",
1228 "nadh subunit 4l", "nadh4l", "nadh-ubiquinone oxidoreductase subunit 4l",
1229 "nadh dehydrogenase subunit 4 l", "nadh4l protein", "nadh dehydrogenase subunit 4l (nd4l)",
1230 "nadh dehydrogenase subunit ivl", "nadh dehydrogenase ivl", "nadh dehydrogenase subunit ivl");
1231 $synonym{$_} = "mtnd5" foreach ("mtnd5", "ndh-u5", "nadh dehydrogenase 5", "nd5", "nadh-ubiquinone oxidoreductase chain 5",
1232 "mt-nd5", "urf5", "nadh dehydrogenase subunit 5", "nadh5", "nd5",
1233 "nadh dehydrogenase-5", "nadh-ubiquinone oxidoreductase chain 5",
1234 "nadh dehydrogenase 5", "nadh 5", "nadh subunit 5",
1235 "nadh-ubiquinone oxidoreductase subunit 5", "nadh5", "nadh-dehydrogenase subunit 5",
1236 "nadh-dehydrogenase subunit 5, nd5", "nadh-ubiquinone oxidoreductase subunit v",
1237 "nadh5 protein", "nadh dehydrogenase subunit 5 (nd5)", "nadh dehydrogenase subunit v",
1238 "nadh dehydrogenase v", "nadh dehydrogenase subunit v");
1239 $synonym{$_} = "mtnd6" foreach ("mtnd6", "ndh-u6", "nadh dehydrogenase 6", "nd6", "nadh6",
1240 "nadh-ubiquinone oxidoreductase chain 6",
1241 "nadh-ubiquinone/plastoquinone oxidoreductase chain 6", "mt-md6", "urf6",
1242 "nadh dehydrogenase subunit 6", "nd6", "nadh6",
1243 "nadh-ubiquinone oxidoreductase chain 6", "nadh subunit 6",
1244 "nadh-ubiquinone oxidoreductase subunit 6", "nadh dehydrogenase 6", "nadh6",
1245 "nadh-ubiquinone oxidoreductase subunit vi", "nadh6 protein",
1246 "nadh dehydrogenase subunit 6 (nd6)", "nadh dehydrogenase subunit vi", "nadh dehydrogenase vi",
1247 "nadh dehydrogenase subunit vi");
1248 $synonym{$_} = "mtrnr1" foreach ("mtrnr1", "mt-rnr1", "12s rna", "12s rrna", "12s ribosomal rna", "12s rrna",
1249 "small subunit ribosomal rna", "12 rrna", "12 s ribosomal rna", "12s rrna, mtssu rrna",
1250
1251 "12s rrna gene", "12s small subunit ribosomal rna",
1252 "mitochondrial ssu ribosomal rna", "rrns",
1253 "ssu ribosomal rna", "12s","12s_ribosmal_rna", "s-rrna", "srrna", "ssu");
1254 $synonym{$_} = "mtrnr2" foreach ("16s", "16s_ribosomal_rna","l-rrna","lrrna","lsu", "mtrnr2", "mt-rnr2", "16s rrna", "16s rna", "16srrna", "16s ribosomal rna", "16s rrna", "rrna large subunit",
1255 "large subunit ribosomal rna", "16 s ribosomal rna", "16s mitochondrial ribosomal rna",
1256
1257 "mitochondrial 16s ribosomal rna", "16s large subunit ribosomal
1258 rna", "16s_ribosmal_rna","rrnl");
1259
1260 # Nuclear genes
1261 $synonym{$_} = "adora3" foreach ("adora3", "adenosine a3 receptor", "a3ar", "ara3", "gpcr2", "adora3", "adenosine a3 receptor",
1262 "a3 adenosine receptor", "adenosine-3 receptor");
1263 $synonym{$_} = "adra2b" foreach ("adra2b", "adrenergic, alpha-2b-, receptor", "adra2l1", "adrarl1", "adra2rl1",
1264 "alpha-2b adrenergic receptor", "alpha-2b adrenoceptor", "subtype c2", "[a]2b", "adra-2b",
1265 "alpha2-c2", "alpha2b", "subtype alpha2-c2", "alpha-2-adrenergic receptor-like 1",
1266 "alpha adrenergic receptor 2b", "adra2b", "alpha 2b adrenergic receptor", "adra2b",
1267 "alpha adrenergic receptor subtype 2b", "alpha-2b-adrenergic receptor",
1268 "alpha adrenergic receptor, subtype 2b", "alpha-2b adrenergic receptor", "alpha-2b adrenoceptor");
1269 $synonym{$_} = "adrb2" foreach ("adrb2", "adrenergic, beta-2-, receptor, surface", "adrb2r", "adrbr", "beta-2 adrenergic receptor",
1270 "b2ar", "bar", "beta-2 adrenoceptor", "catecholamine receptor", "adrb-2", "badm", "beta 2-ar",
1271 "gpcr7", "adrb2", "beta-2 adrenergic receptor", "beta-2-adrenergic receptor",
1272 "adrenergic receptor beta 2", "beta 2 adrenergic receptor", "beta2-adrenergic receptor",
1273 "beta2 adrenergic receptor");
1274 $synonym{$_} = "apob" foreach ("apob", "apolipoprotein b", "ag(x) antigen", "apolipoprotein b-100 precursor", "apo b-100",
1275 "apolipoproteinb-48", "apo b-48", "fldb", "apob-100", "apolipoprotein b", "apolipoprotein b 100",
1276 "apob", "apolipoprotein b-100", "apob");
1277 $synonym{$_} = "app" foreach ("app", "amyloid beta (a4) precursor protein", "protease nexin-ii, alzheimer disease", "ad1",
1278 "human mrna for amyloid a4 precursor of alzheimer's disease", "appi", "beta-amyloid protein",
1279 "beta-app", "a-beta", "a4", "cvap", "aaa", "abeta", "amyloid beta-peptide",
1280 "amyloid beta (a4) precursor protein", "adap", "appican", "betaapp", "app",
1281 "amyloid beta precursor protein", "amyloid precursor protein", "amyloid beta precursor b1",
1282 "beta amyloid protein precursor", "beta-a4");
1283 $synonym{$_} = "atp7a" foreach ("atp7a", "atpase, cu++ transporting, alpha polypeptide (menkes syndrome)", "mnk",
1284 "copper-transporting atpase 1", "copper pump 1", "menkes disease-associated protein", "mc1",
1285 "copper binding p-type atpase 7a", "blo", "blotchy", "br", "brindled", "menkes protein", "mo",
1286 "mottled", "mk", "ohs", "atp7a", "copper transporting atpase",
1287 "atpase, cu++ transporting, alpha polypeptide", "menkes syndrome protein");
1288 $synonym{$_} = "bdnf" foreach ("bdnf", "brain-derived neurotrophic factor", "brain-derived neurotrophic factor precursor", "bdnf",
1289 "brain-derived neurotrophic factor", "brain derived neurotrophic factor",
1290 "brain-derived neurotrophic factor mature peptide", "bdnf");
1291 $synonym{$_} = "bmi1" foreach ("bmi1", "b lymphoma mo-mlv insertion region (mouse)",
1292 "murine leukemia viral (bmi-1) oncogene homolog", "polycomb complex protein bmi-1", "rnf51",
1293 "mgc12685", "oncogene bmi-1", "bmi1", "bmi-1", "bmi-1 protein", "oncoprotein bmi-1");
1294 $synonym{$_} = "brca1" foreach ("brca1", "breast cancer 1, early onset", "pscp", "papillary serous carcinoma of the peritoneum",
1295 "breast cancer type 1 susceptibility protein", "breast and ovarian cancer susceptibility gene",
1296 "rnf53", "breast-ovarian cancer, included", "brca1", "brca1",
1297 "breast and ovarian cancer susceptibility protein", "brca1 protein",
1298 "breast and ovarian cancer susceptibility");
1299 $synonym{$_} = "chrna1" foreach ("chrna1", "cholinergic receptor, nicotinic, alpha polypeptide 1 (muscle)", "chrna",
1300 "acetylcholine receptor protein, alpha chain precursor", "achra", "achr-1", "acra", "chrna1",
1301 "nicotinic cholinergic receptor alpha polypeptide", "chrna1", "achr", "achr prepeptide", "chrna",
1302 "neuronal nicotinic acetylcholine receptor alpha", "nicotinic acetylcholine recepter alpha-subunit");
1303 $synonym{$_} = "cftr" foreach ("cftr", "cystic fibrosis transmembrane conductance regulator, atp-binding cassette (sub-family c,
1304 member 7)", "mrp7", "cf", "abcc7", "abc35", "cftr", "cystic fibrosis transmembrane conductance",
1305 "cftr chloride channel");
1306 $synonym{$_} = "cnr1" foreach ("cnr1", "cannabinoid receptor 1 (brain)", "cnr", "cb1", "cb-r", "cann6", "cb>1<", "cb1k5", "cb1a",
1307 "central cannabinoid receptor", "cnr1", "cannabinoid receptor 1", "cb1", "cb1 cannabinoid receptor",
1308 "cb1 cannabinoid receptor", "cbr");
1309 $synonym{$_} = "crem" foreach ("crem", "camp responsive modulator", "crea", "camp-responsive element modulator, alpha isoform",
1310 "crem", "camp responsive element moderator", "camp responsive element modulator",
1311 "camp-responsive element moderator");
1312 $synonym{$_} = "edg1" foreach ("edg1", "endothelial differentiation, sphingolipid g-protein-coupled receptor, 1", "d1s3362", "edg-1",
1313 "1pb1", "s1p1", "ecgf1", "chedg1", "sphingosine 1-phosphate receptor edg1",
1314 "g protein-coupled sphingolipid receptor", "edg1");
1315 $synonym{$_} = "ghr" foreach ("ghr", "growth hormone receptor", "growth hormone receptor precursor", "gh receptor",
1316 "serum binding protein", "growth hormone receptor", "ghr", "growth hormone receptor precursor",
1317 "ghr", "bovine growth hormone receptor", "mature growth hormone receptor");
1318 $synonym{$_} = "pgk1" foreach ("pgk1", "phosphoglycerate kinase 1", "pgk-1", "primer recognition protein 2", "prp 2", "pgka",
1319 "pgk-1", "pgk-1", "phosphoglycerate kinase 1");
1320 $synonym{$_} = "plcb4" foreach ("plcb4", "phospholipase c, beta 4",
1321 "1-phosphatidylinositol-4,5-bisphosphate phosphodiesterase beta 4", "plc-beta-4",
1322 "phospholipase c-beta-4", "plcb4", "phospholipase c beta 4");
1323 $synonym{$_} = "pnoc" foreach ("pnoc", "prepronociceptin", "propronociceptin", "nociceptin precursor", "orphanin fq", "ppnoc", "ofq",
1324 "n/ofq", "n23k", "npnc1", "ofq/n", "proorphanin", "pnoc", "prepronociceptin",
1325 "nociceptin/orphanin fq precursor");
1326 $synonym{$_} = "prkci" foreach ("prkci", "protein kinase c, iota", "pkci", "dxs1179e", "npkc-iota", "protein kinase c iota",
1327 "prkci");
1328 $synonym{$_} = "prnp" foreach ("prnp", "(prion protein (p27-30) (creutzfeld-jakob disease, gerstmann-strausler-scheinker syndrome, fatal familial insomnia)",
1329 "cjd", "major prion protein precursor", "prp", "prp27-30", "prp33-35c", "ascr", "prip", "gss",
1330 "prn-i", "prn-p", "prpc", "prpsc", "sinc", "mgc26679", "cd230 antigen", "prion-related protein",
1331 "prion protein", "prp", "prnp", "prion protein precursor", "prion protein", "prnp", "prp",
1332 "prion protein prp", "prion protein precursor prp", "prp", "prp", "prp gene", "greater kudu prp",
1333 "major prion protein", "prion protein variant 110p", "prion protein variant 143r",
1334 "prion protein variant 240s", "prion protein variant 37v", "prion protein variant 37v/240s",
1335 "prion protein, prp", "prnp", "prmp");
1336 $synonym{$_} = "rag1" foreach ("rag1", "recombination activating gene 1", "v(d)j recombination activating protein 1", "rag-1",
1337 "recombination activating protein 1", "rag1", "rag-1", "recombination activating gene-1",
1338 "recombination activating gene 1", "recombination-activating gene 1", "rag1", "rag 1", "rag1");
1339 $synonym{$_} = "rag2" foreach ("rag2", "recombination activator protein 2", "recombination activating protein 2", "rag-2", "rag 2",
1340 "recombination activating protein", "recombination activating gene-2", "rag2",
1341 "recombination activating protein 2", "rag2", "rag2", "rag2", "rag-2", "rag-2 protein",
1342 "recombinase activating gene 2", "recombination activating protein 2");
1343 $synonym{$_} = "rbp3" foreach ("rbp3", "retinol-binding protein 3, interstitial", "interphotoreceptor retinoid-binding protein precursor",
1344 "irbp", "interstitial retinol-binding protein", "rbp-3", "interphotoreceptor retinoid binding protein",
1345 "irbp", "irbp", "interphotoreceptor retinoid-binding protein", "interphotorecepter retinoid binding protein",
1346 "interphotoreceptor retinoid binding protein (irbp)", "irbp mrna");
1347 $synonym{$_} = "tnf" foreach ("tnf", "tumor necrosis factor (tnf superfamily, member 2)", "tnfa", "dif", "tnfsf2",
1348 "tumor necrosis factor (cachectin)", "tumor necrosis factor, alpha (cachectin)",
1349 "tumor necrosis factor", "tnf-alpha", "cachectin", "tnfsf1a", "apc1 protein",
1350 "tnf, monocyte-derived", "tnf, macrophage-derived", "tnf superfamily, member 2",
1351 "tumor necrosis factor-alpha", "tumor necrosis factor alpha", "tnfa", "tnf-alpha",
1352 "tumor necrosis factor alpha precursor", "tnfa", "tnfa", "tnfalpha", "tumour necrosis factor alpha",
1353 "tnf alpha", "tnf-a", "tumor necrosis factor alpha, tnf-alpha", "bovine tumor necrosis factor alpha",
1354 "tumor necrosis factor alpha (cachetin)", "tumor necrosis factor-alpha precursor");
1355 $synonym{$_} = "tp53" foreach ("tp53", "tumor protein p53 (li-fraumeni syndrome)", "p53", "cellular tumor antigen p53",
1356 "phosphoprotein p53", "trp53", "transformation related protein 53", "p53", "p53 protein", "p53",
1357 "tp53", "tumor suppressor p53", "53 kda phosphoprotein", "insulin recptor substrate p53 short form",
1358 "p53 gene product", "p53 tumor suppressor gene", "p53 tumor suppressor protein",
1359 "tumor suppressor p53 phosphoprotein");
1360 $synonym{$_} = "ttr" foreach ("ttr", "transthyretin (prealbumin, amyloidosis type i)", "palb", "tthy", "tbpa", "attr",
1361 "transthyretin", "transthyretin precursor", "transthyretin subunit", "ttr", "ttr");
1362 $synonym{$_} = "tyr" foreach ("tyr", "tyrosinase (oculocutaneous albinism ia)", "ocaia", "monophenol monooxygenase",
1363 "tumor rejection antigen ab", "sk29-ab", "lb24-ab", "albino", "c", "skc35", "oca1a", "tyrosinase",
1364 "tyr", "tyrosinase precursor", "truncated tyrosinase", "tyr", "tyr");
1365 $synonym{$_} = "vwf" foreach ("vwf", "von willebrand factor", "f8vwf", "coagulation factor viii", "von willebrand factor", "vwf",
1366 "von willebrand factor", "vwf", "vwf", "vwf", "von willebrand factor, vwf", "von willebrand factor precursor",
1367 "von willebrand factor precursor", "wf");
1368 $synonym{$_} = "zfx" foreach ("zfx", "zinc finger protein, x-linked", "zinc finger x-chromosomal protein", "zfx", "zinc finger protein zfx",
1369 "zfx", "zinc finger protein zfx", "x-linked zinc finger protein zfx", "zfx", "x-linked zinc finger protein",
1370 "zinc finger x-linked protein 1", "zinc finger x-linked protein 2", "zinc finger protein x linked",
1371 "zfx1", "zfx-1", "zfx2", "zfx-2","zfx protein mrna", "zinc finger protein zfx isoform 4",
1372 "zfx product, isoform 1", "zfx product, isoform 2", "zfx product, isoform 3",
1373 "zfx product, isoform 4", "zfx protein");
1374 $synonym{$_} = "zfy" foreach ("zfy", "zinc finger protein, y-linked", "zinc finger y-chromosomal protein", "zfy", "zfy",
1375 "zinc finger protein zfy", "zinc finger protein zfy", "y-linked zinc finger protein",
1376 "y-linked zinc finger protein zfy", "zinc finger protein y linked",
1377 "y-linked zinc finger protein 2", "y-linked zinc finger protein 1", "zfy1", "zfy-1", "zfy2",
1378 "zfy-2");
1379
1380 # Additional genes
1381 $synonym{$_} = "18s_rrna" foreach ("18s rrna", "18s_rrna", "18s ribosomal rna",
1382 "18 rrna", "18 s ribosomal rna", "18 s ribosomal rna", "18s small subunit ribosomal rna",
1383 "18s_ribsomal_rna", "18s_ribosomal_rna_a-type", "18s_rna_gene", "18s_rrna", "nuclear_18s_ribosomal_rna");
1384
1385 $synonym{$_} = "28s_rrna" foreach ("28s_large_subunit_ribosomal_rna", "28s rrna", "28s_rrna", "28s ribosomal rna", "28 rrna", "put. 28S ribosomal RNA", "28 s ribosomal rna", "28 s ribosomal rna",
1386 "rrna-28s_rrna-related", "28s ribosomal rna v region");
1387 $synonym{$_} = "5.8s" foreach ("5.8s_ribosomal_rna", "5.8s_rrna", "5s_ribosomal_rna","5s_rrna","5.8s_rna_gene");
1388 $synonym{$_} = "mtcyb" foreach ("cob", "mtcyb", "cyt b", "cytb", "cytochrome b", "cytochrome b", "cytb", "cyt b", "cytb", "cytb",
1389 "'cytochrome b'", "cytochrome-b", "skeletal muscle cytochrome b", "cyt-b", "cyt.b", "cyto b",
1390 "cytob", "cytb1", "cytb2", "cytb3", "cytb4", "cytochrome b light strand", "cyt-b", "cytob",
1391 "cyt.b", "cytb", "cytochrome b", "mitochondrial cytochrome b", "cytochrome b protein",
1392 "duodenal cytochrome b");
1393 $synonym{$_} = "alpha_lactalbumin" foreach ("alpha lactalbumin", "alpha_lactalbumin", "alpha-lactalbumin", "alpla lactalbumin",
1394 "alpah lactalbumin", "a-lacta", "a-lacta", "lactalbumin, alpha-", "lactalbumin, alpha",
1395 "mature alpha-lactalbumin");
1396 $synonym{$_} = "alpha-s1-casein" foreach ("alpha-s1-casein", "as1-casein", "alpha s1 casein", "alpha s1-casein b",
1397 "alpha s1 casein", "alpha s1-casein", "alpha(s1) casein", "alpha-s1 casein", "alpha-s1 casein",
1398 "alpha-s1-casein", "alpha-s1-casein mrna", "alphas1-casein", "as1-casein", "alfas1-casein",
1399 "alpha-sl casein", "casein alpha s1");
1400 $synonym{$_} = "alpha-s2-casein" foreach ("alpha-s2-casein", "as2-casein", "alpha s2 casein", "alpha s2-casein b",
1401 "alpha s2 casein", "alpha s2-casein", "alpha(s2) casein", "alpha-s2 casein", "alpha-s2 casein",
1402 "alpha-s2-casein", "alpha-s2-casein mrna", "alphas2-casein", "as2-casein",
1403 "alpha s2a casein (aa 1 to 165)", "alpha s2a casein (aa 1 to 167)", "alph as2-casein",
1404 "alpha(s2)-casein");
1405 $synonym{$_} = "b-casein" foreach ("b-casein", "beta casein", "beta-casein", "beta-casein (aa 1 - 213)", "beta-casein a3",
1406 "beta-casein precursor", "beta casein precursor", "beta-casein variant a2",
1407 "beta-casein variant i", "mat. beta-casein", "casein beta", "casein b", "casein b (aa 1-183) pgpk48",
1408 "mature beta-casein (aa 1 to 226)");
1409 $synonym{$_} = "k-casein" foreach ("k-casein", "kappa casein", "kappa-casein", "kappa-casein precursor", "kappa casein precursor",
1410 "casein kappa", "kappa-cas", "kappa-casein long form", "kappa-casein mature peptide",
1411 "kappa-casein short form");
1412 $synonym{$_} = "sry" foreach ("sry", "sex determining factor sry", "sex determining region y protein", "sex-determining factor sry",
1413 "sex-determining region y", "sry gene", "sry protein", "sex region of y chromosome");
1414 $synonym{$_} = "c-mos" foreach ("c-mos", "oocyte maturation factor mos", "mos", "mos protein");
1415 $synonym{$_} = "cryaa" foreach ("cryaa", "crya1", "alpha a-crystallin","alpha a-crystallin", "alphaa-crystallin",
1416 "alpha-a crystallin chain", "alpha-a-crystallin", "crystallin, alpha a", "alphaa-crystallin (crya1)",
1417 "alpha a crystallin");
1418 $synonym{$_} = "cryab" foreach ("cryab", "crya2", "alpha b-crystallin","alpha b-crystallin", "alphab-crystallin",
1419 "alpha-b crystallin chain", "alpha-b-crystallin", "crystallin, alpha b", "alphab-crystallin (crya2)",
1420 "alpha b crystallin");
1421 $synonym{$_} = "t-cell_receptor_beta" foreach ("t-cell receptor beta", "t cell receptor beta", "t-cell receptor beta chain",
1422 "t cell receptor beta chain", "t-cell receptor beta chain variable region",
1423 "t cell receptor beta chain variable region", "t-cell receptor beta chain variable",
1424 "t cell receptor beta chain variable", "t-cell receptor variable beta chain",
1425 "t cell receptor variable beta chain", "t-cell receptor beta-chain",
1426 "t cell receptor beta-chain", "t-cell receptor beta chain vj region",
1427 "t cell receptor beta chain vj region", "t-cell receptor beta chain v-d-j region",
1428 "t cell receptor beta chain v-d-j region", "t-cell receptor beta chain variable segment",
1429 "t cell receptor beta chain variable segment", "t-cell receptor beta chain constant region",
1430 "t cell receptor beta chain constant region", "t-cell receptor v beta gene",
1431 "t cell receptor v beta gene", "t-cell receptor-beta chain", "t cell receptor-beta chain",
1432 "t cell receptor bata chain", "t-cell receptor beta-chain", "t cell receptor beta-chain",
1433 "t-cell receptor beta chain (v-d-j-c)", "t cell receptor beta chain (v-d-j-c)",
1434 "t-cell receptor beta-chain v region", "t cell receptor beta-chain v region",
1435 "t-cell receptor v region beta-chain", "t cell receptor v region beta-chain",
1436 "t-cell receptor beta chain vdj region", "t cell receptor beta chain vdj region",
1437 "t-cell receptor beta chain v-region", "t cell receptor beta chain v-region",
1438 "t-cell receptor v-beta", "t cell receptor v-beta", "t-cell_receptor_beta");
1439 $synonym{$_} = "t-cell_receptor_alpha" foreach ("t-cell receptor alpha", "t cell receptor alpha", "t-cell receptor alpha chain",
1440 "t cell receptor alpha chain", "t-cell receptor alpha chain variable region",
1441 "t cell receptor alpha chain variable region", "t-cell receptor alpha chain variable",
1442 "t cell receptor alpha chain variable", "t-cell receptor variable alpha chain",
1443 "t cell receptor variable alpha chain", "t-cell receptor alpha-chain",
1444 "t cell receptor alpha-chain", "t-cell receptor alpha chain vj region",
1445 "t cell receptor alpha chain vj region", "t-cell receptor alpha chain v-d-j region",
1446 "t cell receptor alpha chain v-d-j region", "t-cell receptor alpha chain variable segment",
1447 "t cell receptor alpha chain variable segment", "t-cell receptor alpha chain constant region",
1448 "t cell receptor alpha chain constant region", "t-cell receptor v alpha gene",
1449 "t cell receptor v alpha gene", "t-cell receptor-alpha chain", "t cell receptor-alpha chain",
1450 "t cell receptor bata chain", "t-cell receptor alpha-chain", "t cell receptor alpha-chain",
1451 "t-cell receptor alpha chain (v-d-j-c)", "t cell receptor alpha chain (v-d-j-c)",
1452 "t-cell receptor alpha-chain v region", "t cell receptor alpha-chain v region",
1453 "t-cell receptor v region alpha-chain", "t cell receptor v region alpha-chain",
1454 "t-cell receptor alpha chain vdj region", "t cell receptor alpha chain vdj region",
1455 "t-cell receptor alpha chain v-region", "t cell receptor alpha chain v-region",
1456 "t-cell receptor v-alpha", "t cell receptor v-alpha", "t-cell_receptor_alpha");
1457
1458 # tRNAs
1459 my @aminoAcids = qw(gly ala val leu ile met phe trp pro ser thr cys tyr asn gln asp glu lys arg his);
1460 my %AAcodes = ('glycine' => 'gly', 'alanine' => 'ala', 'valine' => 'val', 'leucine' => 'leu', 'isoleucine' => 'ile',
1461 'methionine' => 'met', 'phenylalanine' => 'phe', 'tryptophan' => 'trp', 'proline' => 'pro',
1462 'serine' => 'ser', 'threonine' => 'thr', 'cysteine' => 'cys', 'tyrosine' => 'tyr', 'asparagine' => 'asn',
1463 'glutamine' => 'gln', 'aspartic acid' => 'asp', 'glutamic acid' => 'glu', 'lysine' => 'lys',
1464 'arginine' => 'arg', 'histidine' => 'his');
1465 my %fullName = ('gly' => 'glycine', 'ala' => 'alanine', 'val' => 'valine', 'leu' => 'leucine', 'ile' => 'isoleucine',
1466 'met' => 'methionine', 'phe' => 'phenylalanine', 'trp' => 'tryptophan', 'pro' => 'proline',
1467 'ser' => 'serine', 'thr' => 'threonine', 'cys' => 'cysteine', 'tyr' => 'tyrosine', 'asn' => 'asparagine',
1468 'gln' => 'glutamine', 'asp' => 'aspartic acid', 'glu' => 'glutamic acid', 'lys' => 'lysine',
1469 'arg' => 'arginine', 'his' => 'histidine');
1470
1471 foreach my $aa (@aminoAcids)
1472 {
1473 # Abbreviations
1474 $synonym{"trna $aa"} = "trna-$aa";
1475 $synonym{"transfer rna $aa"} = "trna-$aa";
1476 $synonym{"transfer rna-$aa"} = "trna-$aa";
1477 $synonym{"trna($aa)"} = "trna-$aa";
1478 $synonym{"trna $aa gene"} = "trna-$aa";
1479 $synonym{"trna-$aa gene"} = "trna-$aa";
1480 $synonym{"$aa trna"} = "trna-$aa";
1481
1482 my $dashedName = $aa."-trna";
1483 $synonym{"$dashedName"} = "trna-$aa";
1484
1485 # And again for full names
1486 $synonym{"trna $fullName{$aa}"} = "trna-$aa";
1487 $synonym{"transfer rna $fullName{$aa}"} = "trna-$aa";
1488 $synonym{"transfer rna-$fullName{$aa}"} = "trna-$aa";
1489 $synonym{"trna-$fullName{$aa}"} = "trna-$aa";
1490 $synonym{"trna($fullName{$aa})"} = "trna-$aa";
1491 $synonym{"trna $fullName{$aa} gene"} = "trna-$aa";
1492 $synonym{"trna-$fullName{$aa} gene"} = "trna-$aa";
1493 $synonym{"$fullName{$aa} trna"} = "trna-$aa";
1494
1495 $dashedName = $fullName{$aa}."-trna";
1496 $synonym{"$dashedName"} = "trna-$aa";
1497 }
1498 }
1499
1500 sub geneClean
1501 {
1502 my $dirtyGene = shift;
1503
1504 # Initial synonymizing
1505 $dirtyGene = lc($dirtyGene); # Only work with lower-case for synonymizing and file names
1506 $dirtyGene = $synonym{$dirtyGene} if (defined $synonym{$dirtyGene});
1507
1508 # Clean end of gene name
1509 $dirtyGene =~ s/\"$//g;
1510 $dirtyGene =~ s/\s+$//g;
1511 $dirtyGene =~ s/^\s+//g;
1512
1513 # Remove punctuation that will confound file saving
1514 $dirtyGene =~ s/^'//g;
1515 $dirtyGene =~ s/'$//;
1516 $dirtyGene =~ s/:/ /g;
1517 $dirtyGene =~ s/, / /g;
1518 $dirtyGene =~ s/,//g;
1519 $dirtyGene =~ s/\*/-/g;
1520 $dirtyGene =~ s/\$/-/g;
1521 $dirtyGene =~ s/\#/-/g;
1522 $dirtyGene =~ s/\&/and/g;
1523 $dirtyGene =~ s/\//-/g;
1524 $dirtyGene =~ s/\\//g;
1525 $dirtyGene =~ s/\|/-/g;
1526 $dirtyGene =~ s/;//g;
1527 $dirtyGene =~ s/\<//g;
1528 $dirtyGene =~ s/\>//g;
1529 $dirtyGene =~ s/-+/-/g;
1530 $dirtyGene =~ s/\s+-/-/g;
1531 $dirtyGene =~ s/-\s+/-/g;
1532 $dirtyGene =~ s/^-+//;
1533 $dirtyGene =~ s/`/'/;
1534
1535 # Collapse multiple whitespace
1536 $dirtyGene =~ s/\s+/_/g;
1537
1538 # Clean up some tRNA variants (easier than specifying explicit synonyms for each tRNA)
1539 if ($dirtyGene =~ /tRNA/i)
1540 {
1541 $dirtyGene =~ s/_\d+$//;
1542 $dirtyGene =~ s/ \d+$//;
1543 }
1544
1545 # Final synonymizing
1546 $dirtyGene = $synonym{$dirtyGene} if (defined $synonym{$dirtyGene}); # Recheck for synonym
1547 $dirtyGene = lc($dirtyGene); # Ensure that gene names are in lower case for further synonymizing and file saving (final safety)
1548
1549 return $dirtyGene;
1550 }
1551
1552 sub geneCount
1553 {
1554 # Set local parameters
1555 my $gbFile = shift;
1556
1557 my (%iGene, @geneList, $speciesNum, %quickSpeciesGenePresent, %speciesCounter);
1558 my $wordBlock = 1;
1559 my $modelBlock = 0;
1560 my %modelSpecies;
1561 my $stripCount = 0;
1562
1563 print "\nCounting gene occurrences in file $gbFile to establish genes that do not meet user-defined threshold(s)\n";
1564
1565 # Search input file
1566 setLineBreak($gbFile);
1567 open (GENE, "<$gbFile") or die "Cannot open GenBank output file, $gbFile.\n";
1568 my $accCount = 0;
1569 my $modelFlag = 0;
1570 my $speciesFlag = 0;
1571 my $nameFlag = 0;
1572 my $countZero = time;
1573 my ($organism, $gene);
1574 while (<GENE>)
1575 {
1576 next unless ($_ =~ /LOCUS/ or $_ =~ /^\s*(\/gene)|(\/product)=\"/ or $_ =~ /^\s*\/organism=\"/ or $nameFlag == 1);
1577 chomp;
1578 my $gbLine = $_;
1579
1580 if ($gbLine =~ /LOCUS/)
1581 {
1582 undef %iGene;
1583 undef $organism;
1584 $modelFlag = $speciesFlag = $nameFlag = 0;
1585 $accCount++;
1586 print "\t$accCount sequences read in\n" if ($accCount == int($accCount/10000)*10000 and $verbose);
1587 next;
1588 }
1589
1590 # Get organism name
1591 if ($gbLine =~ /^\s*\/organism=\"(.+)\"/)
1592 {
1593 $organism = $1;
1594 $organism =~ s/\s/_/g;
1595 $modelFlag = 1 if (defined $modelSpecies{$organism});
1596 $speciesFlag = 1 if ($organism =~ /sp\./ or $organism =~ /cf\./);
1597 unless ($speciesFlag or ($modelFlag and $modelBlock))
1598 {
1599 $speciesCounter{$organism}++;
1600 $speciesNum++ if ($speciesCounter{$organism} == 1);
1601 }
1602 }
1603 next if ($modelFlag and $modelBlock); # Entry pertains to model organism; skip parsing rest of entry
1604
1605 # Get gene name
1606 if ($gbLine =~ /^\s*(\/gene)|(\/product)=\"(.+)/) # Get start of gene name
1607 {
1608 $gene = $1 if ($gbLine =~ /=\"(.+)/);
1609
1610 # Check whether name wraps onto a new line
1611 if (substr($gbLine, -1) ne "\"")
1612 {
1613 $nameFlag = 1;
1614 next;
1615 }
1616 else
1617 {
1618 $nameFlag = 2;
1619 $gene =~ s/\"$// if ($gene =~ /\"$/);
1620 }
1621 }
1622
1623 if ($nameFlag == 1) # Get continuation / end of gene name
1624 {
1625 $gene .= $gbLine;
1626 $nameFlag = 2 if ($gbLine =~ /\"$/) # Gene name is complete
1627 }
1628
1629 if ($gene and $nameFlag == 2) # Process complete gene
1630 {
1631 next if (not defined $organism);
1632 unless ($wordBlock and ($gene =~ /hypothetical/i or $gene =~ /open reading frame/i or $gene =~ /similar/i or $gene =~ /homolog/i or $gene =~ /putative/i or $gene =~ /unknown/i or $gene =~ /unnamed/i or $gene =~ /\d+rik/i or $gene =~ /possible/i or $gene =~ /pseudo/i))
1633 {
1634 $gene = geneClean($gene); # Clean up and synonymize gene name
1635
1636 # Increment counters
1637 $iGene{$gene}++; # Sequence counter
1638 $quickSequenceCount{$gene}++ if ($iGene{$gene} == 1);
1639
1640 $quickSpeciesGenePresent{$gene}{$organism}++; # Species-gene counter
1641 $quickSpeciesCount{$gene}++ if ($quickSpeciesGenePresent{$gene}{$organism} == 1);
1642 }
1643 $nameFlag = 0;
1644 undef $gene;
1645 }
1646 }
1647 close GENE;
1648 print "\n" if ($verbose);
1649 my $countTime = time - $countZero;
1650
1651 # Print results and block genes as appropriate
1652 open (OUT, ">$countFile") or die "Cannot open results file for quick gene count, $countFile.\n";
1653 if ($speciesThreshold)
1654 {
1655 @geneList = sort {$quickSpeciesCount{$b} <=> $quickSpeciesCount{$a} } keys %quickSpeciesCount;
1656 }
1657 else
1658 {
1659 @geneList = sort {$quickSequenceCount{$b} <=> $quickSequenceCount{$a} } keys %quickSequenceCount;
1660 }
1661
1662 printf OUT "Searching $accCount sequences took $countTime seconds with %s distinct genes found\n", scalar(@geneList);
1663 print OUT "\tModel organisms were excluded from search\n" if ($modelBlock);
1664 print OUT "\tCommon words were excluded from search\n" if ($wordBlock);
1665 print OUT "\nGene\tNo. of sequences\tNo. of species\n\n";
1666
1667 print "\tGene counts:\n" if ($debug);
1668 foreach my $entry (@geneList)
1669 {
1670 print OUT "$entry\t$quickSequenceCount{$entry}\t$quickSpeciesCount{$entry}\n";
1671 print "\t$entry\t$quickSequenceCount{$entry}\t$quickSpeciesCount{$entry}\n" if ($debug);
1672 if ($quickSequenceCount{$entry} < $seqThreshold or $quickSpeciesCount{$entry} < $speciesThreshold)
1673 {
1674 $geneStatus{$entry} = "rejected";
1675 push @rejectList, $entry;
1676 }
1677 else
1678 {
1679 $stripCount++;
1680 }
1681 }
1682 close OUT;
1683
1684 printf "\tSearching $accCount sequences took $countTime seconds with %s distinct genes found; results written to $countFile\n", scalar(@geneList);
1685 if (not $stripCount)
1686 {
1687 print "No genes were present more than the threshold value(s)\n";
1688 exit(0);
1689 }
1690 }
1691
1692 # Version history
1693 #
1694 # v2.0 (July 25, 2005)
1695 # - improved and more complete parsing of GenBank files (esp. of gene names or CDs covering multiple lines,
1696 # join statements and other multi-segment entries, detection of pseudogenes)
1697 # - added ability to mine input file for all gene content
1698 # - added ability to strip only those genes present for 1) a minimum number of sequences and/or species and
1699 # 2) species with valid names only
1700 # - added (incomplete) gene synonymy list (initially compiled by Robin Beck)
1701 # - minor bug fixes
1702 #
1703 # v1.0.1a (April 29, 2004)
1704 # - added automatic detection of line breaks
1705 # - minor bug fixes
1706 #
1707 # v1.0 (April 30, 2002)
1708 # - initial release