view getdata/genbankstrip.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
line wrap: on
line source

#!/usr/bin/perl -w
#
# Modified for Osiris by THO.
# 
#
# GenBankStrip.pl v2.0
# Last modified July 25, 2005 14:29
# (c) Olaf R.P. Bininda-Emonds
#
# Input:
#   1) A GenBank output file
#   2) A file containing gene names, each on a separate line
#
# Generates:
#   1) A Se-Al formatted (and optionally a nexus-formatted) datafile containing stripped sequences
#	2) A summary file giving status of each entry in GenBank file
#
# Usage: GenBankStrip.pl -f<filename> [-g<filename>] [-k<number>] [-l<number>] [-o<n|s>] [-s] [-t] [-h] [-v]
#   options: -f<filename> = file containing sequences in GenBank format
#            -g<filename> = file containing specific genes to be stripped
#            -i<genename> = strip a single gene
#            -k<number> = number of (longest) sequences to retain per species for a given gene (default = all)
#            -l<number> = minimum length required for all non-tRNA genes (default = none)
#            -o<n|s> = provide output in nexus (n) phytab (p) and/or Se-Al (s) format in addition to fasta format
#            -s = only process sequences with valid species names (i.e., no species with sp. or cf. in names)
#            -t<g<number>|s<number>> = minimum number of sequences (g; default = 0) or species (s; default = 0) a gene must have to be retained; both can be specified simultaneously
#            -h = print this message and quit
#            -v = verbose output
#	     -a = Accession number appears first in stripped Fasta File -- Added by THO
use strict;

# Set default values
	# Genes and sequences
		my %synonym;
		my %complement = ('A' => 'T', 'C' => 'G', 'G' => 'C', 'T' => 'A',
						 'M' => 'K', 'R' => 'Y', 'W' => 'S', 'S' => 'W', 'Y' => 'R', 'K' => 'M',
						 'B' => 'V', 'D' => 'H', 'H' => 'D', 'V' => 'B', 'N' => 'N', '-' => '-', '?' => '?');
		my %maxLength;
	
	# I/O
		my $gbFile;
			my ($countFile, $rejectFile, $stripFile);
		my $geneFile;
			my %userGene;
		my $singleGene;
		my $keepGene = 0;
		my $minLength;
			my $minLengthTRNA;
		my $speciesBlock = 0;
		my $seqThreshold = 0;
		my $speciesThreshold = 0;
		my $sealPrint = 0;
		my $nexusPrint = 0;
		my $phytabPrint = 0;
		
	# Global stats
		my (@globalGeneList, %globalGenePresent, %geneStatus, @rejectList, $stripCount);
		my (@speciesList, %speciesPresent, %speciesGenePresent, %quickSpeciesCount, %speciesCount, %quickSequenceCount, %sequenceCount);
				$sequenceCount{"all"} = 0;

	# Miscellaneous
		my $debug = 0;
		my $verbose = 0;
		my $accessionFirst = 0; #Added by THO as option to have accession # first in FASTA output

for (my $i = 0; $i <= $#ARGV; $i++)
	{
#Original stripped out directory structure, required by Galaxy
#	if ($ARGV[$i] =~ /^-f([\w+\.?\w+?]*)/)
	if ($ARGV[$i] =~ /^-f(.+)/)
		{
		$gbFile = $1;
		
#		(my $baseFile = $gbFile) =~ s/.\w+$//;
#			$countFile .= $baseFile . "_geneCount.txt";
#			$rejectFile .= $baseFile . "_gbs.rejectlist.txt";
#			$stripFile .= $baseFile . "_gbs.striplist.txt";
#		}
#for Galaxy, easier to have same file name each time it'r run
		(my $baseFile = $gbFile) =~ s/.\w+$//;
			$countFile .= "geneCount.txt";
			$rejectFile .= "rejectlist.txt";
			$stripFile .= "striplist.txt";
		}
	elsif ($ARGV[$i] =~ /^-g([\w+\.?\w+?]*)/)
		{
		$geneFile = $1;

#This is a hack to write the name of a single gene into a new genefile based on the Galaxy call

open (GENE,">$geneFile") or die "Cannot open file $geneFile containing gene names.\n";
print GENE "$geneFile\n";
close GENE;
		}
	elsif ($ARGV[$i] =~ /^-i([\w+\.?\w+?]*)/)
		{
		$singleGene = $1;
		}
	elsif ($ARGV[$i] =~ /^-k(\d+)/)
		{
		$keepGene = $1;
		}
	elsif ($ARGV[$i] =~ /^-l(\d+)/)
		{
		$minLength = $1;
		$minLengthTRNA = 50;
		}
	elsif ($ARGV[$i] =~ /^-on/)
		{
		$nexusPrint = 1;
		}
	elsif ($ARGV[$i] =~ /^-op/)
		{
		$phytabPrint = 1;
		}
	elsif ($ARGV[$i] =~ /^-os/)
		{
		$sealPrint = 1;
		}
	elsif ($ARGV[$i] =~ /^-s/)
		{
		$speciesBlock = 1;
		}
	elsif ($ARGV[$i] =~ /^-tg(\d+)/)
		{
		$seqThreshold = $1;
		}
	elsif ($ARGV[$i] =~ /^-ts(\d+)/)
		{
		$speciesThreshold = $1;
		}
	elsif ($ARGV[$i] =~ /^-v/)
		{
		$verbose = 1;
		}
	elsif ($ARGV[$i] =~ /^-a/)
		{
		$accessionFirst = 1;
		}
	elsif ($ARGV[$i] =~ /^-x/)
		{
		$debug = 1;
		$verbose = 1;
		}
	elsif ($ARGV[$i] =~ /^-h/)
		{
		print "Usage: GenBankStrip.pl -f<filename> [-g<filename>] [-k<number>] [-l<number>] [-o<n|s>] [-s] [-t] [-h] [-v]\n";
		print "Options: -f<filename> = file containing sequences in GenBank format\n";
		print "         -g<filename> = file containing specific genes to be stripped\n";
		print "         -k<number> = number of (longest) sequences to retain per species for a given gene (default = all)\n";
		print "         -l<number> = minimum length required for all non-tRNA genes (default = none)\n";
		print "         -o<n|s> = provide output in nexus (n) phytab (p) and/or Se-Al (s) format in addition to fasta format\n";
		print "         -s = only process sequences for valid species names (i.e., no species with sp. or cf. in names)\n";
		print "         -t<g<number>|s<number>> = minimum number of sequences (g; default = 0) or species (s; default = 0)\n";
		print "                                   a gene must have to be retained; both can be specified simultaneously\n";
		print "         -h = print this message and quit\n";
		print "         -v = verbose output\n";
		exit(0);
		}
	else
		{
		print "Don't understand argument: $ARGV[$i]\n";
		print "Usage: GenBankStrip.pl -f<filename> [-g<filename>] [-k<number>] [-l<number>] [-o<n|s>] [-s] [-t] [-h] [-v]\n";
		exit(1); 
		}
	}

die "ERROR: Must supply name of GenBank output file using flag -f.\n" if (not $gbFile);

# Load in hardwired gene synonyms
	geneSynonyms();

# Get list of target genes (if desired)
	if ($geneFile)
		{
		my $userGeneCount = 0;
		my %userGenePresent;
		setLineBreak($geneFile);
		open (GENE,"<$geneFile") or die "Cannot open file $geneFile containing gene names.\n";
		print "Gene(s) to be stripped:\n";
		while (<GENE>)
			{
			chomp;
			next unless ($_);
			my $gene = $_;
			$gene = $synonym{$gene} if (defined $synonym{$gene});
			$userGene{$gene} = 1;
			unless ($userGenePresent{$gene})
				{
				$userGeneCount++;
				$userGenePresent{$gene}++;
				print "\t$gene\n";
				}
			}
		close GENE;

#		die "ERROR: No genes read in from file $geneFile\n";
		}
#THO added -h command to easilly pass a single gene for Galaxy
	if($singleGene)
		{
		my $userGeneCount = 1;
		my %userGenePresent;
		print "Gene(s) to be stripped:\n";
		my $gene = $singleGene;
		$gene = $synonym{$gene} if (defined $synonym{$gene});
		print "\t$gene\n";
		$geneFile = $singleGene;
		}

# Print parameter summary
	print "The following parameters have been set by the user:\n";
	print "\tFile containing GenBank sequence data: $gbFile\n";
	print "\tFile containing target genes to be stripped: $geneFile\n" if ($geneFile);
	print "\tUser-defined constraints(s):\n";
		print "\t\tNumber of sequences: $seqThreshold\n" if ($seqThreshold);
		print "\t\tNumber of species: $speciesThreshold\n" if ($speciesThreshold);
		print "\t\tMinimum sequence length: global - $minLength bp; tRNAs - $minLengthTRNA bp\n" if (defined $minLength);
		print "\t\tOnly using species with valid names\n" if ($speciesBlock);
		print "\t\tNone\n" if (not $seqThreshold and not $speciesThreshold and not defined $minLength and not $speciesBlock);
	print "\tNumber of sequences to keep per species for each gene: ";
		if ($keepGene)
			{
			print "$keepGene\n";
			}
		else
			{
			print "all\n";
			}
	print "\tOutput format(s): fasta";
		print ", Se-Al" if ($sealPrint);
		print ", nexus" if ($nexusPrint);
		print ", phytab" if ($phytabPrint);
		print "\n";

# Do quick gene count if thresholds indicated; takes longer but 1) saves many disk operations and 2) less memory intensive
	geneCount($gbFile) if (not defined $geneFile and ($seqThreshold or $speciesThreshold));	# Don't bother if user gene list given; will usually be small enough so that benefits won't come into play
	
# Read in GenBank file and strip genes
	my $stripZero = time;
	print "\nProcessing GenBank file $gbFile ...\n";
	setLineBreak($gbFile);
	open (DATA, "<$gbFile") or die "Cannot open GenBank output file $gbFile\n";
		my @allAccNum;
		my ($accNum, %accRead, $duplEntry, $organism, %species, $geneName);
			my $speciesFlag = 0;
		my (%genePresent, @geneList);
		my (@startList, @stopList, $complementFlag, $typeStatus, %pseudoStatus, %seqType, $joinLine, @joinSegments, %geneStart, %geneStop, %compStatus, $fullSeq);
			my $nameFlag = 0;
			my $joinFlag = 0;
			my $pseudoFlag = 0;
			my $readSeqs = 0;

		while (<DATA>)
			{
			chomp;
			my $readLine = $_;
			next if ($readLine =~ /^\s*LOCUS/ or $readLine =~ /^\s*DEFINITION/ or $readLine =~ /^\s*VERSION/ or $readLine =~ /^\s*KEYWORDS/ or $readLine =~ /^\s*SOURCE/ or $readLine =~ /^\s*ORGANISM/ or $readLine =~ /^\s*REFERENCE/ or $readLine =~ /^\s*AUTHORS/ or $readLine =~ /^\s*TITLE/ or $readLine =~ /^\s*JOURNAL/ or $readLine =~ /^\s*MEDLINE/ or $readLine =~ /^\s*PUBMED/ or $readLine =~ /^\s*FEATURES/ or $readLine =~ /^\s*COMMENT/ or $readLine =~ /^\s*BASE COUNT/ or $readLine =~ /^\s*source/ or $readLine =~ /^\s*\/codon/ or $readLine =~ /^\s*\/transl/ or $readLine =~ /^\s*\/db_/ or $readLine =~ /^\s*CONTIG/);
		
			# Get accession number
				if ($readLine =~ /^\s*ACCESSION\s+(.+)/)
					{
					$accNum = $1;
					print "$readLine\n" if ($debug);
					# Clear variables
						undef @geneList;
						undef %genePresent;
						undef $fullSeq;
						undef @startList;
						undef @stopList;
						undef %geneStart;
						undef %geneStop;
						undef %pseudoStatus;
						undef $geneName;
						$speciesFlag = $nameFlag = $joinFlag = $pseudoFlag = $readSeqs = $duplEntry = 0;

					if (not $accRead{$accNum})	# Check for duplicate entries
						{
						$accRead{$accNum} = 1;
						push @allAccNum, $accNum;
						if (scalar(@allAccNum) == int(scalar(@allAccNum)/10000)*10000 and $verbose)
							{
							print "\tSequences read in: ".scalar(@allAccNum)."\n";
							}
						print "\tAccession number: $accNum\n" if ($debug);
						}
					else
						{
print "*****NOTE --  DUPLICATE ENTRY Accession $accNum\n";
						$duplEntry = 1;
						}
					}

			# Get organism name
				if ($readLine =~ /^\s*\/organism=\"(.+)\"/)
					{
					$organism = $1;
						$organism =~ s/\s/_/g;
					print "$readLine\n" if ($debug);
					$species{$accNum} = $organism;
					$speciesFlag = 1 if ($organism =~ /sp\./ or $organism =~ /cf\./ or $organism =~ /_X_/i);
					if ($debug)
						{
						print "\t\tOrganism: $organism";
						print " (blocked)" if ($speciesFlag and $speciesBlock);
						print "\n";
						}
					}
				next if ($speciesFlag and $speciesBlock);	# Entry pertains to invalid species name; skip parsing rest of entry

			
			# Get gene boundaries; process previous set of CDs
				if ($readLine =~ /\<?(\d+)\<?\.\.\>?(\d+)\>?/ or $joinFlag == 1 or $readLine =~ /^\s*ORIGIN/)	# ORIGIN will process last set of CDs
					{
					next if ($readLine =~ /^\s+\/\w+/);	# Prevents spurious matches with lines beginning with "/feature"
					$readSeqs = 1 if ($readLine =~ /^\s*ORIGIN/);	# Indicates that remaining lines will contain sequence information
					
					# Process previous gene; need to do here to account for a posteriori declarations of pseudogene status
						if ($geneName and $nameFlag == 2 and @startList and @stopList)	# Process complete gene
							{
							print "\t\t\t\tParsed name: $geneName (type: $typeStatus)\n" if ($debug);
							
							# Clean up gene name and misleading punctuation
								$geneName = geneClean($geneName);
								$pseudoStatus{$geneName} = 1 if ($pseudoFlag);
								
							if (defined @ { $geneStart{$geneName} } and ((defined $pseudoStatus{$geneName} and $pseudoStatus{$geneName} == 1) or $typeStatus =~ /intron/i or $typeStatus =~ /UTR/i))	# Gene has previously stored CDs that might not have been recognized as a pseudogene or non-coding region
								{
								print "\t\t\t\t\tSubsequent occurrence of now recognized pseudogene or non-coding region; comparing new and stored CDs\n" if ($debug);
								for (my $i = 0; $i < scalar(@startList); $i++)	# Check each occurrence in new CDs for matches in stored ones
									{
									my $newStart = $startList[$i];
									my $newStop = $stopList[$i];
									print "\t\t\t\t\t\tChecking new CDs $newStart to $newStop\n" if ($debug);
									for (my $j = 0; $j < scalar(@ { $geneStart{$geneName} }); $j++)
										{
										if ($newStart == $geneStart{$geneName}[$j] and $newStop == $geneStop{$geneName}[$j])
											{
											print "\t\t\t\t\t\t\tMatch with stored CDs (no. $j); deleted\n" if ($debug);
											splice(@ { $geneStart{$geneName} }, $j, 1);
											splice(@ { $geneStop{$geneName} }, $j, 1);
											}
										}
									}
								if ($debug)
									{
									print "\n\t\t\t\t\tCurrent gene boundaries after pseudogene / non-coding check (type = $seqType{$geneName}):\n";
									if (scalar(@ { $geneStart{$geneName} }) < 1)
										{
										print "\t\t\t\t\t\tnone\n";
										}
									else
										{
										for (my $i = 0; $i < scalar(@ { $geneStart{$geneName} }); $i++)
											{
											print "\t\t\t\t\t\t$geneStart{$geneName}[$i]\t$geneStop{$geneName}[$i]\n";
											}
										}
									}
								}
														
							# Only process coding regions of user-desired genes (if applicable), genes with sensible CDs, non-blocked genes, and genes that are not obvious singletons or pseudogenes
								unless (($geneFile and not defined $userGene{$geneName}) or
									(defined $geneStatus{$geneName} and $geneStatus{$geneName} eq "rejected") or
									(defined $pseudoStatus{$geneName} and $pseudoStatus{$geneName} == 1) or
									$geneName =~ /hypothetical/i or $geneName =~ /open reading frame/i or $geneName =~ /similar/i or $geneName =~ /homolog/i or $geneName =~ /putative/i or $geneName =~ /unknown/i or $geneName =~ /unnamed/i or $geneName =~ /\d+rik/i or $geneName =~ /possible/i or $geneName =~ /pseudo/i
									or $typeStatus =~ /UTR/i or $typeStatus =~ /intron/i or $typeStatus =~ /misc_feature/i)
									{
									if (not defined $genePresent{$geneName})	# Process first occurrence of gene in entry
										{
										if ($debug)
											{
											print "\t\t\t\t\tFirst occurrence of $geneName\n" if ($debug);
											for (my $i = 0; $i < scalar(@startList); $i++)
												{
												print "\t\t\t\t\t\t$startList[$i]\t$stopList[$i]\n";
												}
											}
										$genePresent{$geneName} = 1;
										push @geneList, $geneName;
											push @ { $geneStart{$geneName} }, $_ foreach (@startList);
											push @ { $geneStop{$geneName} }, $_ foreach (@stopList);
			
										$seqType{$geneName} = $typeStatus;
										$compStatus{$geneName} = 0;	# Note whether gene is complemented or not
											$compStatus{$geneName} = 1 if ($complementFlag);
										}
									else	# Attempt to add secondary occurrences
										{
										my $storedSegments = scalar(@ { $geneStop{$geneName} }) - 1;
										my $lastStop = $geneStop{$geneName}[$storedSegments];
											$lastStop = 0 if (not defined $lastStop);
										my $newStart = $startList[0];
			
										if ($debug)
											{
											print "\t\t\t\t\tSecondary occurrence of $geneName with boundaries\n" if ($debug);
											for (my $i = 0; $i < scalar(@startList); $i++)
												{
												print "\t\t\t\t\t\t$startList[$i]\t$stopList[$i]\n";
												}
											}
										if ($seqType{$geneName} eq "gene" and $typeStatus ne "gene")	# New information probably more precise and better accounts for structure
											{
											print "\n\t\t\t\t\t\tNew segment more precisely defined by type; replaced\n" if ($debug);
											undef @ { $geneStart{$geneName} };
											undef @ { $geneStop{$geneName} };
											push @ { $geneStart{$geneName} }, $_ foreach (@startList);
											push @ { $geneStop{$geneName} }, $_ foreach (@stopList);
											$seqType{$geneName} = $typeStatus;
											}
										
										elsif ($newStart > $lastStop)	# New segment occurs distal to last stored segment (could also be contiguous); append to boundaries
											{
											print "\n\t\t\t\t\t\tContiguous with or occurs after last stored segment; appended\n" if ($debug);
											push @ { $geneStart{$geneName} }, $_ foreach (@startList);
											push @ { $geneStop{$geneName} }, $_ foreach (@stopList);
											$seqType{$geneName} = "composite";
											}
										elsif (scalar(@ { $geneStart{$geneName} }) == 1 and scalar(@startList) > 1)	# Replace single stored segment with new segments derived from join statement; probably better accounts for intron/exon structure
											{
											print "\n\t\t\t\t\t\tNew segments subdivide single stored segment; replaced\n" if ($debug);
											undef @ { $geneStart{$geneName} };
											undef @ { $geneStop{$geneName} };
											push @ { $geneStart{$geneName} }, $_ foreach (@startList);
											push @ { $geneStop{$geneName} }, $_ foreach (@stopList);
											$seqType{$geneName} = "composite";
											}
										else
											{
											print "\n\t\t\t\t\t\tNew segment overlaps stored segments; rejected\n" if ($debug);
											}
										}
									if ($debug)
										{
										print "\n\t\t\t\t\tCurrent gene boundaries after CDS processing (type = $seqType{$geneName}):\n";
										if (scalar(@ { $geneStart{$geneName} }) < 1)
											{
											print "\t\t\t\t\t\tnone\n";
											}
										else
											{
											for (my $i = 0; $i < scalar(@ { $geneStart{$geneName} }); $i++)
												{
												print "\t\t\t\t\t\t$geneStart{$geneName}[$i]\t$geneStop{$geneName}[$i]\n";
												}
											}
										}
									}

							# Clear entries for gene
								$nameFlag = $complementFlag = 0;
								undef @startList;	# Prevents double listing of genes if CDS use both /gene and /product tags
								undef @stopList;
								undef $typeStatus;
								$pseudoStatus{$geneName} = 0;	# Clear pseudogene status for previous set of CDs
							}
						next if ($readLine =~ /^\s*ORIGIN/);	# No more CDs to worry about
					
					print "$readLine\n" if ($debug);

					# Reset gene name and information for current set of CDs
						undef $geneName;
						$nameFlag = $pseudoFlag = $complementFlag = 0;
							$complementFlag = 1 if ($readLine =~ /complement/); #DISABLED BY THO BECAUSE ONLY COMPLEMENTS AND DOESN'T REVERSE

					# Get new CD information
						if ($readLine =~ /^\s+\S+\s+join\(/i or $readLine =~ /^\s+\S+\s+order\(/)
							{
							$joinFlag = 1;
							undef @startList;
							undef @stopList;
							undef $joinLine;
							}
						if ($readLine =~ /^\s+(\S+)\s+/i)
							{
							$typeStatus = $1;
							}
						
						if ($joinFlag)
							{
							$joinLine .= $readLine;
							if ($readLine =~ /\)$/)	# Have accounted for multiline join statements; process
								{
								$joinFlag = 0;
								$complementFlag = 1 if ($joinLine =~ /complement/);
								
								# Clean up join statement
									$joinLine =~ s/complement\(//;
									$joinLine =~ s/>//g;
									$joinLine =~ s/<//g;
									$joinLine =~ s/^\s+\S+\s+join\(//;
									$joinLine =~ s/^\s+\S+\s+order\(//;
									$joinLine =~ s/\)+$//;
									$joinLine =~ s/\s+//g;
									print "\t\tJoin boundaries: $joinLine\n" if ($debug);
									if ($joinLine =~ /[A-Z]+\d+/i)	# Avoid join commands referring to separate accessions
										{
										undef $joinLine;
										print "\t\t\tERROR: Joining accessions; skipping\n" if ($debug);
										next;
										}
								
								# Derive pieces of join statement
									@joinSegments = split(/,/, $joinLine);
									foreach my $segment (@joinSegments)
										{
										my ($start, $stop) = split(/\.\./, $segment);
										# Check safeties
											next unless ($start and $stop);
											next if ($start =~ /\D/ or $stop =~ /\D/);
											next if ($start > $stop);
										push @startList, $start;
										push @stopList, $stop;									
										}
								
								undef $joinLine;
								$typeStatus = "join";
								}
							}
						else
							{
							my ($start, $stop);
							if ($readLine =~ /\<?(\d+)\<?\.\.\>?(\d+)\>?/)
								{
								$start = $1;
								$stop = $2;
								}
							next unless ($start and $stop);
							next if ($start > $stop);
							undef @startList;
								push @startList, $start;
							undef @stopList;
								push @stopList, $stop;
							print "\t\tParsed boundaries: $1\t$2\n" if ($debug);
							}
					}


			# Check for pseudogene status
				if ($readLine =~ /\s+\/pseudo/ or $readLine =~ /pseudogene/ or $readLine =~ /putative/)
					{
					print "$readLine\n" if ($debug);
					$pseudoStatus{$geneName} = 1 if (defined $geneName);
					$pseudoFlag = 1;
					print "\t\t\t\tWARNING: suspected pseudogene or of uncertain (\"putative\") status\n" if ($debug);
					}

				if ($readLine =~ /codon recognized/ and defined $geneName and $geneName =~ /trna/i)
					{
					print "$readLine\n" if ($debug);
					$pseudoStatus{$geneName} = 1;
					$pseudoFlag = 1;
					print "\t\t\t\tWARNING: tRNA for an alternative codon; ignored\n" if ($debug);
					}

			# Get gene name
				if ($readLine =~ /^\s*(\/gene)|(\/product)=\"(.+)/ and $nameFlag == 0)	# Get start of gene name
					{
					$geneName = $1 if ($readLine =~ /=\"(.+)/);
					print "$readLine\n" if ($debug);
					
					# Check whether name wraps onto a new line
						if (substr($readLine, -1) ne "\"")
							{
							$nameFlag = 1;
							next;
							}
						else
							{
							$nameFlag = 2;
							$geneName =~ s/\"$// if ($geneName =~ /\"$/);
							}
					}

				if ($nameFlag == 1)	# Get continuation / end of gene name
					{
					print "$readLine\n" if ($debug);
					$geneName .= $readLine;
					$nameFlag = 2 if ($readLine =~ /\"$/)	# Gene name is complete
					}
			
			# Read in sequence information and append
				if ($readLine =~ /^\s*\d*1\s\w+/ and @geneList and $readSeqs)
					{
					my $seqFrag = $readLine;
						$seqFrag =~ s/\s+//g;
						$seqFrag =~ s/\d+//g;
						$fullSeq .= uc($seqFrag);
					}

			# End of entry; process
				if ($readLine =~ /\/\//)
					{
					next if ($duplEntry);
					next unless (@geneList and defined $fullSeq);
					foreach my $gene (@geneList)
						{
						# Safeties; shouldn't come into play
							next if ($geneFile and not defined $userGene{$gene});
							next if (defined $geneStatus{$gene} and $geneStatus{$gene} eq "rejected");
							next if (defined $pseudoStatus{$gene} and $pseudoStatus{$gene} == 1);
						
						print "\n\t\tProcessing gene $gene\n" if ($debug);

						# Strip gene out of full sequence and format as fasta
							next unless (@ { $geneStart{$gene} } and @ { $geneStop{$gene} } );
							
							my $geneSeq;
								for (my $i = 0; $i < scalar(@ { $geneStart{$gene} }); $i++)
									{
									print "\t\t\tGetting sequence between positions $geneStart{$gene}[$i] and $geneStop{$gene}[$i]\n" if ($debug);
									next if ($geneStart{$gene}[$i] > length($fullSeq) or $geneStop{$gene}[$i] > length($fullSeq));
									my $geneSegment = substr($fullSeq,$geneStart{$gene}[$i]-1,$geneStop{$gene}[$i]-$geneStart{$gene}[$i]+1);
										print "\t\t\t\t$geneSegment\n" if ($debug);
										$geneSeq .= $geneSegment;
									}

							next unless ($geneSeq);
							$geneSeq = complement($geneSeq) if ($compStatus{$gene});
							$maxLength{$gene} = length($geneSeq) if (not defined $maxLength{$gene} or length($geneSeq) > $maxLength{$gene});

							# Check if sequence matches length threshold (if appropriate)
								if (defined $minLength)
									{
									if ($gene =~ /^trna-\w\w\w/) 
										{
										if (length($geneSeq) < $minLengthTRNA)
											{
											printf "\t\t\tRejected: length (%s bp) does not meet tRNA length threshold ($minLengthTRNA bp)\n", length($geneSeq) if ($debug);
											next;
											}
										}
									else
										{
										if (length($geneSeq) < $minLength)
											{
											printf "\t\t\tRejected: length (%s bp) does not meet global threshold ($minLength bp)\n", length($geneSeq) if ($debug);
											next;
											}
										}
									}
								if ($gene =~ /trna-\w\w\w/ and length($geneSeq) > 100)
									{
									printf "\t\t\tRejected: length of tRNA too long (%s bp); indicates parsing failure\n", length($geneSeq) if ($debug);
									next;
									}

							my $breakPoint = 79;
							until ($breakPoint > length($geneSeq))
								{
								my $replaceString = "\n" . substr($geneSeq, $breakPoint, 1);
								substr($geneSeq, $breakPoint, 1) = $replaceString;
								$breakPoint += 80;
								}

						# Append sequence to file
							$geneStatus{$gene} = "stripped";
							$sequenceCount{$gene}++;
							$sequenceCount{$species{$accNum}}++;
							$sequenceCount{"all"}++;
							my $fastaFile = $gene."_gbs.fasta.txt";
							
							my $IOicon = ">";
								$IOicon .= ">" if ($sequenceCount{$gene} > 1);

							unless ($debug)


								{
								open (GENE, $IOicon, $fastaFile) or die "Cannot open file $fastaFile to write sequence to.";
									print GENE ">$species{$accNum} $accNum\n";
									print GENE "$geneSeq\n";
								close GENE;
								}

							# Update various statistical counters
								unless ($globalGenePresent{$gene})	# Gene counter
									{
									push @globalGeneList, $gene;
									$globalGenePresent{$gene} = 1;
									}

								unless ($speciesPresent{$species{$accNum}})	# Species counter
									{
									push @speciesList, $species{$accNum};
									$speciesPresent{$species{$accNum}} = 1;
									}

								$speciesGenePresent{$gene}{$species{$accNum}}++;	# Species-gene counter
									$speciesCount{$gene}++ if ($speciesGenePresent{$gene}{$species{$accNum}} == 1);
						
						# Print out summary stats for gene	
							if ($debug)
								{
								print "\t\t\tGene: $gene\n";
								printf "\t\t\t\tGene boundaries (%s bp)", length($geneSeq) - ($geneSeq =~ tr/\n//);
								print " (complemented)" if ($compStatus{$gene});
								print ":\n";
									for (my $i = 0; $i < scalar(@ { $geneStart{$gene} }); $i++)
										{
										print "\t\t\t\t\t$geneStart{$gene}[$i]\t$geneStop{$gene}[$i]\n";
										}
								print "\t\t\t\t\t$geneSeq\n";
								}
						}
					}
			}
close DATA;

# Print out summary stats
	print "\n\tSummary stats:\n";
	printf "\t\tTotal number of accessions processed: %s\n", scalar(@allAccNum);
	printf "\t\tTotal number of unique species: %s\n", scalar(@speciesList);
	printf "\t\tTotal number of unique sequences: %s\n", $sequenceCount{"all"};
	printf "\t\tTotal number of unique genes: %s\n", scalar(@globalGeneList);
	
	my $stripTime = time - $stripZero;
		print "\n\t\tTime taken to process file: $stripTime seconds\n";
		
	if (not @globalGeneList)
		{
		print  "\nNOTE: No genes stripped from file $gbFile; exiting\n";
		exit(0);
		}

# Reprocess results to 1) recheck whether stripped genes actually fall below user-set thresholds (geneCount procedure is a rough maximum), 2) pare down to required number of sequences per species, and 3) produce appropriate output files
	print "\nPostprocessing results ...\n";
		my $postZero = time;
		
	@globalGeneList = sort(@globalGeneList);
		$stripCount = scalar(@globalGeneList);
		
	foreach my $gene (@globalGeneList)
		{
		next if ($debug);
		print "\tProcessing gene $gene ($sequenceCount{$gene} sequences for $speciesCount{$gene} species)\n" if ($verbose);
		if ($sequenceCount{$gene} < $seqThreshold or $speciesCount{$gene} < $speciesThreshold)	# Gene does not meet threshold requirements; reject
			{
			$geneStatus{$gene} = "rejected";
				push @rejectList, $gene;
				$stripCount--;
			print "\t\tRejected; does not meet threshold requirements\n" if ($verbose);
			
			# Remove associated file
				my $rmString = $gene."_gbs.fasta.txt";
					$rmString =~ s/\s/\\ /g;
					$rmString =~ s/\(/\\(/g;
					$rmString =~ s/\)/\\)/g;
					$rmString =~ s/\'/\\'/g;
				system ("rm $rmString");
			}
		else
			{
			my (%infLength, %lengthList, %criticalLength);
			undef @speciesList;
			$geneStatus{$gene} = "stripped";
			if ($verbose)
				{
				print "\t\tStripped";
				print "; meets threshold requirements" if ($seqThreshold or $speciesThreshold);
				print "\n";
				}
			
			# Reload sequences from disk
				my $inputFile = $gene."_gbs.fasta.txt";
				setLineBreak($inputFile);
				open (FASTA, "<$inputFile") or die "Cannot open file $inputFile to read data for gene $gene\n";
					my (@accList, %speciesName, %fastaSeq, $species, $fastaAcc);

					undef %speciesPresent;
					while (<FASTA>)
						{
						chomp;
						if ($_ =~ "^>")
							{
 							($species, $fastaAcc) = split;
								$species =~ s/^>//;
								$fastaAcc =~ s/^\(//;
								$fastaAcc =~ s/\)$//;


							push @accList, $fastaAcc;
								$speciesName{$fastaAcc} = $species;
								$speciesPresent{$species}++;
								push @speciesList, $species if ($speciesPresent{$species} == 1);
							}
						else
							{
							$fastaSeq{$fastaAcc} .= $_;
							}
						}
				close FASTA;
				@accList = sort { $speciesName{$a} cmp $speciesName{$b} } keys %speciesName;
			
			# Pare down to desired number of sequences as needed
				if ($keepGene)
					{
					undef %lengthList;
					print "\t\tParing down to $keepGene best lengths for each of $speciesCount{$gene} species ...\n" if ($verbose);
					# Get informative length for each sequence
						foreach my $entry (@accList)
							{
							$infLength{$entry} = ($fastaSeq{$entry} =~ tr/ACGT//);
							push @ { $lengthList{$speciesName{$entry}} }, $infLength{$entry};
							}

					# Determine critical length and correct sequence count for number of deleted species
						foreach my $species (@speciesList)
							{
							print "\t\t\tSpecies: $species ($speciesPresent{$species} sequences)\n" if ($debug);
							next if ($speciesPresent{$species} <= $keepGene);	# Only process species for which gene has been sampled more than threshold
							@ { $lengthList{$species} } = sort {$b <=> $a} @ { $lengthList{$species} };	# Sort in descending order and get critical length
								$criticalLength{$species} = $lengthList{$species}[$keepGene-1];
	
							# Correct sequence count
								my $i = $keepGene;
									$i++ until ($i >= $speciesPresent{$species} or $lengthList{$species}[$i] < $criticalLength{$species});
									$sequenceCount{$gene} -= (scalar(@ { $lengthList{$species} }) - $i);
									
							if ($debug)
								{
								print "\t\t\t\tCritical length: $criticalLength{$species}\n";
								printf "\t\t\t\tNumber of sequences deleted: %s\n", scalar(@ { $lengthList{$species} }) - $i;
								print "\t\t\t\tTotal number of sequences for gene remaining: $sequenceCount{$gene}\n";
								}
							}
							
					print "\t\t\tFinal count: $sequenceCount{$gene} sequences\n" if ($verbose);
					
					# Recheck if paring has dropped sequence below sequence threshold
						if ($sequenceCount{$gene} < $seqThreshold)
							{
							print "\t\t\tNOTE: Gene ($sequenceCount{$gene} sequences) now falls below threshold of $seqThreshold sequences; no file created\n" if ($verbose);
							$geneStatus{$gene} = "rejected";
								push @rejectList, $gene;
								$stripCount--;

							# Remove associated file
								my $rmString = $gene."_gbs.fasta.txt";
									$rmString =~ s/\s/\\ /g;
									$rmString =~ s/\(/\\(/g;
									$rmString =~ s/\)/\\)/g;
									$rmString =~ s/\'/\\'/g;
								system ("rm $rmString");
							
							next;
							}
					}
			
			# Produce appropriate output files
				print "\t\tSaving output ...\n" if ($verbose);
				my %printCount;
				
				# Open files as needed
					my $sealFile = $gene."_gbs.seal";
						open (SEAL, ">$sealFile") or die "Cannot open Se-Al formatted file $sealFile for writing\n" if ($sealPrint);
					my $nexusFile = $gene."_gbs.nex";
						open (NEX, ">$nexusFile") or die "Cannot open nexus-formatted file $nexusFile for writing\n" if ($nexusPrint);
#					my $phytabFile = $gene."_gbs.phytab";
#						open (PHYTAB, ">$phytabFile") or die "Cannot open nexus-formatted file $phytabFile for writing\n" if ($phytabPrint);
#Currently assumes that only 1 file will be written for ease of Galaxy implementation
					my $phytabFile = "osiris_gbs.phytab";
						open (PHYTAB, ">$phytabFile") or die "Cannot open nexus-formatted file $phytabFile for writing\n" if ($phytabPrint);
					my $fastaFile = $gene."_gbs.fasta.txt";
						$fastaFile = $gene."_gbs.new.fasta.txt" if ($debug);
						open (FASTA, ">$fastaFile") or die "Cannot open fasta-formatted file $fastaFile for writing\n";
					
				# Print headers
					if ($sealPrint)
						{
						print SEAL "Database={\n";
						print SEAL "\tID='MLst';\n";
						print SEAL "\tOwner=null;\n";
						print SEAL "\tName=null;\n";
						print SEAL "\tDescription=null;\n";
						print SEAL "\tFlags=0;\n";
						print SEAL "\tCount=2;\n";
						print SEAL "\t{\n\t\t{\n";
						
						print SEAL "\t\t\tID='PAli';\n";
						print SEAL "\t\t\tOwner=1;\n";
						printf SEAL "\t\t\tName=\"%sgbs\";\n", $gene;
						print SEAL "\t\t\tDescription=null;\n";
						print SEAL "\t\t\tFlags=0;\n";
						print SEAL "\t\t\tNumSites=$maxLength{$gene};\n";
						print SEAL "\t\t\tType=\"Nucleotide\";\n";
						print SEAL "\t\t\tFeatures=null;\n";
						print SEAL "\t\t\tColourMode=1;\n";
						print SEAL "\t\t\tLabelMode=0;\n";
						print SEAL "\t\t\ttriplets=false;\n";
						print SEAL "\t\t\tinverse=true;\n";
						printf SEAL "\t\t\tCount=%s;\n", $sequenceCount{$gene};
						print SEAL "\t\t\t{\n";
						}

					if ($nexusPrint)
						{
						print NEX "#NEXUS\n\n";
						print NEX "Begin data;\n";
						printf NEX "\tDimensions ntax=%s nchar=%s;\n", $sequenceCount{$gene}, $maxLength{$gene};
						print NEX "\tFormat datatype=nucleotide gap=-;\n\n";
						print NEX "\tmatrix\n\n";
						}
						
				# Print sequence data
					my $i = 0;
					foreach my $entry (@accList)
						{
						next if (defined $criticalLength{$speciesName{$entry}} and $infLength{$entry} < $criticalLength{$speciesName{$entry}});
						
						$printCount{$speciesName{$entry}}++;
							$speciesName{$entry} .= "_$printCount{$speciesName{$entry}}" if ($printCount{$speciesName{$entry}} > 1);

						if ($sealPrint)
							{
							$i++;
							print SEAL "\t\t\t\t{\n";
							print SEAL "\t\t\t\t\tID='PSeq';\n";
							print SEAL "\t\t\t\t\tOwner=2;\n";
							print SEAL "\t\t\t\t\tName=\"$speciesName{$entry}\";\n";
							print SEAL "\t\t\t\t\tDescription=null;\n";
							print SEAL "\t\t\t\t\tFlags=0;\n";
							print SEAL "\t\t\t\t\tAccession=\"$entry\";\n";
							print SEAL "\t\t\t\t\tType=\"DNA\";\n";
							printf SEAL "\t\t\t\t\tLength=%s;\n", length($fastaSeq{$entry});
							print SEAL "\t\t\t\t\tSequence=\"$fastaSeq{$entry}\";\n";
							print SEAL "\t\t\t\t\tGeneticCode=1;\n";
							print SEAL "\t\t\t\t\tCodeTable=null;\n";
							print SEAL "\t\t\t\t\tFrame=1;\n";
							print SEAL "\t\t\t\t\tFeatures=null;\n";
							print SEAL "\t\t\t\t\tParent=null;\n";
							print SEAL "\t\t\t\t\tComplemented=false;\n";
							print SEAL "\t\t\t\t\tReversed=false;\n";
							print SEAL "\t\t\t\t}";
							print SEAL "," unless ($i == $sequenceCount{$gene});
							print SEAL "\n";
							}

						if ($nexusPrint)
							{
							my $gaps = "-" x ($maxLength{$gene} - length($fastaSeq{$entry}));
							print NEX "$speciesName{$entry}\t$fastaSeq{$entry}$gaps\n";
							}
						if($phytabPrint)
							{
							print PHYTAB "$speciesName{$entry} \t$gene \t $entry \t $fastaSeq{$entry} \n"
							}

						my $breakPoint = 79;
						until ($breakPoint > length($fastaSeq{$entry}))
							{
							my $replaceString = "\n" . substr($fastaSeq{$entry}, $breakPoint, 1);
							substr($fastaSeq{$entry}, $breakPoint, 1) = $replaceString;
							$breakPoint += 80;
							}
						if ($accessionFirst) #ADDED BY THO
							{
							print FASTA ">$entry", "\___", "$speciesName{$entry}\n";
							print FASTA "$fastaSeq{$entry}\n";
							}
						else
							{
							print FASTA ">$speciesName{$entry} ($entry)\n";
							print FASTA "$fastaSeq{$entry}\n";
							}
						}
				# Print footers
					if ($sealPrint)
						{
						print SEAL "\t\t\t};\n";
						print SEAL "\t\t},\n";
						print SEAL "\t\t{\n";
						print SEAL "\t\t\tID='MCoL';\n";
						print SEAL "\t\t\tOwner=1;\n";
						print SEAL "\t\t\tName=\"Genetic Codes\";\n";
						print SEAL "\t\t\tDescription=\"Custom Genetic Codes\";\n";
						print SEAL "\t\t\tFlags=0;\n";
						print SEAL "\t\t\tCount=0;\n";
						print SEAL "\t\t}\n";
						print SEAL "\t};\n";
						print SEAL "};\n";
						}
					if ($nexusPrint)
						{
						print NEX "\t;\nend;\n";
						}
				
				# Close files as appropriate
					close SEAL if ($sealPrint);
					close NEX if ($nexusPrint);
					close FASTA if ($keepGene);
			}
		}

	my $postTime = time - $postZero;
	print "\n" if ($verbose);
	print "\tTime taken: $postTime seconds\n";

# Print out final summary stats / files
	print "\nFinal summary statistics\n";
	
	# Print file of stripped genes
		open (STRIP, ">$stripFile") or die "Cannot write summary of stripped genes to $stripFile.\n";
			print STRIP "The following $stripCount genes were stripped successfully from file $gbFile\n";
			print STRIP "\nGene\tNo. of sequences\tNo. of species\n\n";
	
			foreach my $gene (@globalGeneList)
				{
				print STRIP "$gene\t$sequenceCount{$gene}\t$speciesCount{$gene}\n" unless ($geneStatus{$gene} eq "rejected");
				}
		close STRIP;
		
		print "\tSummaries of $stripCount genes that were stripped from $gbFile have been written to $stripFile\n";

	# Print file of rejected genes
		if (@rejectList)
			{
			@rejectList = sort @rejectList;
			open (REJECT, ">$rejectFile") or die "Cannot write summary of rejected genes to $rejectFile.\n";
				printf REJECT "The following %s genes do not meet user-defined threshold(s) of", scalar(@rejectList);
					print REJECT " $seqThreshold sequences" if ($seqThreshold);
					print REJECT " or" if ($seqThreshold and $speciesThreshold);
					print REJECT " $speciesThreshold species" if ($speciesThreshold);
					print REJECT "; an asterisk indicates that values given are rough maximums\n";
				print REJECT "\nGene\tNo. of sequences\tNo. of species\n\n";

				foreach my $gene (@rejectList)
					{
					print REJECT "$gene\t";
					if (defined $sequenceCount{$gene})	# Need to do alternative versions depending on whether gene was properly counted or not
						{
						print REJECT "$sequenceCount{$gene}\t";
						}
					else
						{
						print REJECT "$quickSequenceCount{$gene} \*\t";
						}
					if (defined $speciesCount{$gene})
						{
						print REJECT "$speciesCount{$gene}\n";
						}
					else
						{
						print REJECT "$quickSpeciesCount{$gene} \*\n";
						}
					}
			close REJECT;

			printf "\tSummaries of %s genes that did not meet threshold(s) have been written to $rejectFile\n", scalar(@rejectList);
			}

	if ($debug)
		{
		print "\n\tSummary stats for each processed gene:\n";
		foreach my $gene (@globalGeneList)
			{
			print "\t\tGene: $gene ($geneStatus{$gene})\n";
			print "\t\t\tTotal number of sequences: $sequenceCount{$gene}\n";
			print "\t\t\tTotal number of unique species: $speciesCount{$gene}\n";
			}
		}

exit(0);

# Subroutines used in the script

sub setLineBreak	# Check line breaks of input files and set input record separator accordingly
	{
	my $gbFile = shift;
	$/ ="\n";
	open (IN, "<$gbFile") or die "Cannot open $gbFile to check form of line breaks.\n";
		while (<IN>)
			{
			if ($_ =~ /\r\n/)
				{
				print "\tDOS line breaks detected ...\n" if ($debug);
				$/ ="\r\n";
				last;
				}
			elsif ($_ =~ /\r/)
				{
				print "\tMac line breaks detected ...\n" if ($debug);
				$/ ="\r";
				last;
				}
			else
				{
				print "\tUnix line breaks detected ...\n" if ($debug);
				$/ ="\n";
				last;
				}
			}
	close IN;
	}
	
sub complement	# Outputs complementary sequence to one provided
	{
	my $tempSeq = shift;

	my $compSeq;
		for (my $nt = 0; $nt < length($tempSeq); $nt++)
			{
			if (not defined $complement{substr($tempSeq, $nt, 1)})
				{
				$compSeq .= "?";
				}
			else
				{
				$compSeq .= $complement{substr($tempSeq, $nt, 1)};
				}
			}
        my $revCompSeq = reverse($compSeq);
	return $revCompSeq;
	}

sub geneSynonyms	# Define gene synonyms; originally compiled by Robin Beck
	{
	#Opsin gene added by THO
	$synonym{$_} = "opsin" foreach ("opsin", "anceropsin", "blop", "blue-sensitive_opsin", "blue-sensitive_opsin_precursor", "blue-sensitive_rhodopsin", "blue-sensitive_visual_pigment", "blue_opsin", "bluerh", "boceropsin", "buvops", "compound_eye_opsin_bcrh1", "compound_eye_opsin_bcrh2", "lateral_eye_opsin", "locus_opsin_1", "locust_opsin_2", "long-wavelenght_opsin", "long-wavelength_like_opsin", "long-wavelength_opsin", "long-wavelength_rhodopsin", "long-wavelength_sensitive_opsin_1", "long-wavelength_sensitive_opsin_2", "long_wave_opsin", "long_wavelength-sensitive_opsin", "long_wavelength-sensitive_rhodopsin", "long_wavelength-sensitive_visual_pigment", "long_wavelength_opsin", "long_wavelength_sensitive_opsin_1", "long_wavelength_sensitive_opsin_2", "lop2", "lw_opsin", 
"ocellar_opsin", "opsin", "opsin_2", "opsin_bcrh1", "opsin_bcrh2", 
"opsin_rh1", "opsin_rh3", "opsin_rh4", "piceropsin", "pteropsin", 
"rh1_opsin", "rh2_opsin", "rh3_opsin", "rh4_opsin", "rh6_rhodopsin", 
"rhodopsin", "rhodopsin_1", "rhodopsin_2_cg16740-pa", "rhodopsin_3", 
"rhodopsin_3_cg10888-pa", "rhodopsin_4", "rhodopsin_4_cg9668-pa", 
"rhodopsin_5", "rhodopsin_5_cg5279-pa", "rhodopsin_6", 
"rhodopsin_6_cg5192-pb", "rhodopsin_7_cg5638-pa", 
"rhodopsin_long-wavelength", "short_wavelength-sensitive_opsin", 
"ultraviolet-sensitive_opsin", "ultraviolet-sensitive_rhodopsin", 
"ultraviolet-sensitive_visual_pigment", "ultraviolet_sensitive_opsin", 
"uv-sensitive_opsin", "uv-wavelength_like_opsin", "uv_opsin", "uvop", 
"uvrh", "uvrh1", "amblop", "amuvop", 
"lwrh", "lwrh1", "lwrh2", "lw", "lw-rh", "lw_rh", "ninae", "ninae-pa", 
"op", "ops", "ops1", "opsin_1", "opsin_3", "rh", "rh1", "rh2", "rh3", 
"rh2-pa", "rh3-pa", "rh4", "rh4-pa", "rh5", "rh6", "rh7", "rho", 
"visual_pigment", "long-wavelength_rodopsin", 
"long_wavelength_rhodopsin", "short-wavelength_rhodopsin");

	#Chloroplast genes added by THO***
		$synonym{$_} = "rbcl" foreach ("rbcl", "large_subunit_of_riblose-15-bisphosphate_carboxylase-oxygenase","larger_subunit_of_rubisco", 
						"rubisco_large_subunit", "ribulose_15-bisphosphate_carboxylase_large_subunit", "ribulosebiphosphate_carboxylase_large_subunit"); 
		$synonym{$_} = "matk" foreach ("matk", "maturase_k", "maturase");

	# Mitochondrial genes
		$synonym{$_} = "mtatp6" foreach ("mtatp6", "atp synthase 6", "atp synthase a chain", "atp6", "mt-atp6",
										"atpase subunit 6", "atpase6", "atpase 6", "atpase6 protein", "atp6",
										"atp synthase f0 subunit 6", "atp synthase a chain protein 6",
										"atp synthase subunit 6", "atpase6", "atp6", "atp sythase subunit 6",
										"atpase subunit 6, atpase6", "f0-atpase subunit 6", "f1atpase subunit 6",
										"f1-atpase subunit 6", "atpase 6", "atpase 6 gene", "atpase subunit 6 (atp6)",
										"atpase subunit 6 (atpase6)");
		$synonym{$_} = "mtatp8" foreach ("mtatp8", "atp synthase 8", "atp synthase protein 8", "atpase subunit 8",
										"a6l", "atp8", "mt-atp8", "atpase subunit 8", "atpase8", "atpase 8",
										"atpase8 protein", "atp8", "atp synthase f0 subunit 8", "atp synthase protein 8",
										"atpase-8", "atp synthase subunit 8", "atpase8", "atp8", "atp sythase subunit 8",
										"atpase subunit8", "f0-atpase subunit 8", "f1 atpase subunit 8", "f1-atpase subunit 8",
										"protein a6l", "atpase 8", "atpase subunit 8 (atp8)", "atpase subunit 8 (atpase8)",
										"atpase 8 gene", "atpase subunit 8 (atp8)", "atpase subunit 8 (atpase8)");
		$synonym{$_} = "mtco1" foreach ("mtco1", "cytochrome c oxidase i", "coi", "mt-co1", "co i", "ccoi", "cox 1", "cox i",
										"coi", "cytochrome c oxidase subunit i", "cytochrome oxidase subunit i", "cox1",
										"cytochrome c oxidase subunit 1", "cytochrome oxidase subunit 1", "co1",
										"cytochrome c oxidase polypeptide i", "cytochrome oxidase i", "cox-1",
										"cytochrome oxidase c subunit 1", "cox1", "co i", "co1", "cox 1", "coxi",
										"cytochrome c oxidase polypeptide i", "cytochrome c-oxidase subunit 1",
										"cytochrome oxidase c subunit i", "cytochrome-c oxidase i", "cytochrome c1 mrna",
										"cytochrome c1", "cytochrome c oxidase subunit 1 (coxi)", "cytochrome c oxidase chain i",
										"cytochrome c1 (aa 1-241)", "cytochrome c oxidase polypeptide 1",
										"cytochrome c oxidase subunit 1 (coi)", "cytochrome c-1");
		$synonym{$_} = "mtco2" foreach ("mtco2", "cytochrome c oxidase ii", "cytochrome c oxidase polypeptide ii", "coii",
										"mt-co2", "coii", "cytochrome c oxidase subunit ii", "cytochrome oxidase subunit ii",
										"cytochrome oxidase subunit 2", "cytochrome c oxidase subunit 2",
										"cytochrome c oxidase ii", "cox2", "co2", "cytochrome c oxidase polypeptide ii",
										"cytochrome oxidase ii", "cox2", "cytochrome oxidase c subunit 2",
										"cytochrome-c oxidase", "co ii", "co2", "cox 2", "cytochrome c oxidase polypeptide ii",
										"cytochrome c-oxidase subunit 2", "cytochrome oxidase c subunit ii",
										"cytochrome oxidase subunit2", "cytochrome-c oxidase ii", "cytochrome c oxidase subunit 2 (coxii)",
										"cytochrome c oxidase subunit 2 (coii)", "cytochrome c oxidase chain ii",
										"cytochrome c oxidase polypeptide 2", "cytochrome c oxidase ii subunit",
										"cytochrome c oxidase ii mrna");
		$synonym{$_} = "mtco3" foreach ("mtco3", "cytochrome c oxidase iii", "coiii", "mt-co3", "co iii", "ccoiii", "cox3",
										"cox iii", "cytochrome oxidase subunit iii", "coiii", "cox iii",
										"cytochrome c oxidase subunit iii", "cytochrome oxidase subunit 3", "cox3",
										"cytochrome c oxidase subunit 3", "co3", "cytochrome c oxidase polypeptide iii",
										"cox3", "cytochrome oxidase c subunit 3", "cytochrome oxidase iii",
										"cytochrome c oxidase polypeptide iii", "co iii", "coiii", "coiii protein", "coxiii",
										"cytochrome c oxidase subunit iii, coiii", "cytochrome c-oxidase subunit 3",
										"cytochrome c-oxidase subunit three", "cytochrome oxidase c subunit iii",
										"cytochrome-c oxidase iii", "cytochrme c oxidase iii", "cytochrome c oxidase subunit 3 (coiii)",
										"cytochrome oxidase subunit iii type 2");
		$synonym{$_} = "mtnd1" foreach ("mtnd1", "nd1", "nadh dehydrogenase 1", "nadh dehydrogenase, subunit 1 (complex i)",
										"mt-nd1", "nadh-ubiquinone oxidoreductase chain 1", "nd1", "nadh1", "nd1",
										"nadh-ubiquinone oxidoreductase chain 1", "nadh-ubiquinone oxidoreductase subunit 1",
										"nadh dehydrogenase i", "nadh dehydrogenase 1", "nadh subunit 1",
										"nadh dehydrogenase subnuit 1", "nadh1", "nadh1 protein",
										"nadh-ubiquinone oxidoreductase subunit i", "nadh dehydrogenase subunit 1",
										"nadh dehydrogenase subunit 1 (nd1)", "nadh dehydrogenase subunit 1 type 2",
										"nadh dehydrogenase subunit 1 type 1", "nadh dehydrogenase subunit i", "nadh dehydrogenase i",
										"nadh dehydrogenase subnit 1", "nadh dehydrogenase ubiquinone 1 alpha",
										"nadh dehydrogenase subunit i");
		$synonym{$_} = "mtnd2" foreach ("ndh-u1", "mtnd2", "nadh dehydrogenase 2", "nadh dehydrogenase, subunit 2 (complex i)",
										"nadh-ubiquinone oxidoreductase chain 2", "nd2", "mt-nd2", "urf2",
										"nadh dehydrogenase subunit 2", "nd2", "nadh2",
										"nadh-ubiquinone oxidoreductase chain 2", "nadh dehydrogenase 2", "nadh subunit 2",
										"nadh-ubiquinone oxidoreductase subunit 2", "nadh dehydrogenase subnuit 2",
										"nadh deydrogenase subunit 2", "nadh2", "nadh-ubiquinone oxidoreductase subunit ii",
										"nd2", "nadh2 protein", "nadh dehydrogenase subunit 2 (nd2)", "nadh dehydrogenase subunit ii",
										"nadh dehydrogenase ii", "nadh dehydrogense 2", "nadh dehydrogenase subunit ii");
		$synonym{$_} = "mtnd3" foreach ("mtnd3", "ndh-u3", "nadh dehydrogenase 3", "nd3", "nadh3",
										"nadh-ubiquinone oxidoreductase chain 3",
										"nadh-ubiquinone/plastoquinone oxidoreductase chain 3", "mt-md3", "urf3",
										"nadh dehydrogenase subunit 3", "nd3", "nadh3",
										"nadh-ubiquinone oxidoreductase chain 3", "nadh subunit 3", "nadh3", "nadh3 protein",
										"nadh-ubiquinone oxidoreductase subunit 3", "nadh dehydrogenase subnuit 3",
										"nadh-ubiquinone oxidoreductase subunit iii", "nadh3 protein",
										"nadh dehydrogenase subunit 3 (nd3)", "nadh dehydrogenase subunit iii", "nadh dehydrogenase iii",
										"nadh dehydrogenase subunit iii");
		$synonym{$_} = "mtnd4" foreach ("mtnd4", "ndh-u4", "nadh dehydrogenase 4", "nadh-ubiquinone oxidoreductase chain 4", "mt-nd4",
										"nd4", "urf4", "nadh:ubiquinone oxidoreductase subunit 4 (chain m)",
										"nadh dehydrogenase subunit 4", "nd4", "nadh4",
										"nadh-ubiquinone oxidoreductase chain 4", "nadh subunit 4", "nadh4",
										"nadh-ubiquinone oxidoreductase subunit 4", "nadh dehydrogenase subunit4",
										"nadh-ubiquinone oxidoreductase subunit iv", "nadh4 protein",
										"nadh dehydrogenase subunit 4 (nd4)", "nadh dehydrogenase subunit iv", "nadh dehydrogenase iv",
										"nadh dehydrogenase subunit iv");
		$synonym{$_} = "mtnd4l" foreach ("mtnd4l", "ndh-u4l", "nadh dehydrogenase 4l", "nadh dehydrogenase, subunit 4l (complex i)",
										"nadh-ubiquinone oxidoreductase chain 4l", "nd4l", "mt-md4l", "urf4l",
										"nadh dehydrogenase subunit 4l", "nd4l", "nadh4l", "nadh-ubiquinone oxidoreductase chain 4l",
										"nadh subunit 4l", "nadh4l", "nadh-ubiquinone oxidoreductase subunit 4l",
										"nadh dehydrogenase subunit 4 l", "nadh4l protein", "nadh dehydrogenase subunit 4l (nd4l)",
										"nadh dehydrogenase subunit ivl", "nadh dehydrogenase ivl", "nadh dehydrogenase subunit ivl");
		$synonym{$_} = "mtnd5" foreach ("mtnd5", "ndh-u5", "nadh dehydrogenase 5", "nd5", "nadh-ubiquinone oxidoreductase chain 5",
										"mt-nd5", "urf5", "nadh dehydrogenase subunit 5", "nadh5", "nd5",
										"nadh dehydrogenase-5", "nadh-ubiquinone oxidoreductase chain 5",
										"nadh dehydrogenase 5", "nadh 5", "nadh subunit 5",
										"nadh-ubiquinone oxidoreductase subunit 5", "nadh5", "nadh-dehydrogenase subunit 5",
										"nadh-dehydrogenase subunit 5, nd5", "nadh-ubiquinone oxidoreductase subunit v",
										"nadh5 protein", "nadh dehydrogenase subunit 5 (nd5)", "nadh dehydrogenase subunit v",
										"nadh dehydrogenase v", "nadh dehydrogenase subunit v");
		$synonym{$_} = "mtnd6" foreach ("mtnd6", "ndh-u6", "nadh dehydrogenase 6", "nd6", "nadh6",
										"nadh-ubiquinone oxidoreductase chain 6",
										"nadh-ubiquinone/plastoquinone oxidoreductase chain 6", "mt-md6", "urf6",
										"nadh dehydrogenase subunit 6", "nd6", "nadh6",
										"nadh-ubiquinone oxidoreductase chain 6", "nadh subunit 6",
										"nadh-ubiquinone oxidoreductase subunit 6", "nadh dehydrogenase 6", "nadh6",
										"nadh-ubiquinone oxidoreductase subunit vi", "nadh6 protein",
										"nadh dehydrogenase subunit 6 (nd6)", "nadh dehydrogenase subunit vi", "nadh dehydrogenase vi",
										"nadh dehydrogenase subunit vi");
		$synonym{$_} = "mtrnr1" foreach ("mtrnr1", "mt-rnr1", "12s rna", "12s rrna", "12s ribosomal rna", "12s rrna",
										"small subunit ribosomal rna", "12 rrna", "12 s ribosomal rna", "12s rrna, mtssu rrna",
										
"12s rrna gene", "12s small subunit ribosomal rna", 
"mitochondrial ssu ribosomal rna", "rrns",
										"ssu ribosomal rna", "12s","12s_ribosmal_rna", "s-rrna", "srrna", "ssu");
		$synonym{$_} = "mtrnr2" foreach ("16s", "16s_ribosomal_rna","l-rrna","lrrna","lsu", "mtrnr2", "mt-rnr2", "16s rrna", "16s rna", "16srrna", "16s ribosomal rna", "16s rrna", "rrna large subunit",
										"large subunit ribosomal rna", "16 s ribosomal rna", "16s mitochondrial ribosomal rna",
										
"mitochondrial 16s ribosomal rna", "16s large subunit ribosomal 
rna", "16s_ribosmal_rna","rrnl");

	# Nuclear genes
		$synonym{$_} = "adora3" foreach ("adora3", "adenosine a3 receptor", "a3ar", "ara3", "gpcr2", "adora3", "adenosine a3 receptor",
										"a3 adenosine receptor", "adenosine-3 receptor");
		$synonym{$_} = "adra2b" foreach ("adra2b", "adrenergic, alpha-2b-, receptor", "adra2l1", "adrarl1", "adra2rl1",
										"alpha-2b adrenergic receptor", "alpha-2b adrenoceptor", "subtype c2", "[a]2b", "adra-2b",
										"alpha2-c2", "alpha2b", "subtype alpha2-c2", "alpha-2-adrenergic receptor-like 1",
										"alpha adrenergic receptor 2b", "adra2b", "alpha 2b adrenergic receptor", "adra2b",
										"alpha adrenergic receptor subtype 2b", "alpha-2b-adrenergic receptor",
										"alpha adrenergic receptor, subtype 2b", "alpha-2b adrenergic receptor", "alpha-2b adrenoceptor");
		$synonym{$_} = "adrb2" foreach ("adrb2", "adrenergic, beta-2-, receptor, surface", "adrb2r", "adrbr", "beta-2 adrenergic receptor",
										"b2ar", "bar", "beta-2 adrenoceptor", "catecholamine receptor", "adrb-2", "badm", "beta 2-ar",
										"gpcr7", "adrb2", "beta-2 adrenergic receptor", "beta-2-adrenergic receptor",
										"adrenergic receptor beta 2", "beta 2 adrenergic receptor", "beta2-adrenergic receptor",
										"beta2 adrenergic receptor");
		$synonym{$_} = "apob" foreach ("apob", "apolipoprotein b", "ag(x) antigen", "apolipoprotein b-100 precursor", "apo b-100",
										"apolipoproteinb-48", "apo b-48", "fldb", "apob-100", "apolipoprotein b", "apolipoprotein b 100",
										"apob", "apolipoprotein b-100", "apob");
		$synonym{$_} = "app" foreach ("app", "amyloid beta (a4) precursor protein", "protease nexin-ii, alzheimer disease", "ad1",
										"human mrna for amyloid a4 precursor of alzheimer's disease", "appi", "beta-amyloid protein",
										"beta-app", "a-beta", "a4", "cvap", "aaa", "abeta", "amyloid beta-peptide",
										"amyloid beta (a4) precursor protein", "adap", "appican", "betaapp", "app",
										"amyloid beta precursor protein", "amyloid precursor protein", "amyloid beta precursor b1",
										"beta amyloid protein precursor", "beta-a4");
		$synonym{$_} = "atp7a" foreach ("atp7a", "atpase, cu++ transporting, alpha polypeptide (menkes syndrome)", "mnk",
										"copper-transporting atpase 1", "copper pump 1", "menkes disease-associated protein", "mc1",
										"copper binding p-type atpase 7a", "blo", "blotchy", "br", "brindled", "menkes protein", "mo",
										"mottled", "mk", "ohs", "atp7a", "copper transporting atpase",
										"atpase, cu++ transporting, alpha polypeptide", "menkes syndrome protein");
		$synonym{$_} = "bdnf" foreach ("bdnf", "brain-derived neurotrophic factor", "brain-derived neurotrophic factor precursor", "bdnf",
										"brain-derived neurotrophic factor", "brain derived neurotrophic factor",
										"brain-derived neurotrophic factor mature peptide", "bdnf");
		$synonym{$_} = "bmi1" foreach ("bmi1", "b lymphoma mo-mlv insertion region (mouse)",
										"murine leukemia viral (bmi-1) oncogene homolog", "polycomb complex protein bmi-1", "rnf51",
										"mgc12685", "oncogene bmi-1", "bmi1", "bmi-1", "bmi-1 protein", "oncoprotein bmi-1");
		$synonym{$_} = "brca1" foreach ("brca1", "breast cancer 1, early onset", "pscp", "papillary serous carcinoma of the peritoneum",
										"breast cancer type 1 susceptibility protein", "breast and ovarian cancer susceptibility gene",
										"rnf53", "breast-ovarian cancer, included", "brca1", "brca1",
										"breast and ovarian cancer susceptibility protein", "brca1 protein",
										"breast and ovarian cancer susceptibility");
		$synonym{$_} = "chrna1" foreach ("chrna1", "cholinergic receptor, nicotinic, alpha polypeptide 1 (muscle)", "chrna",
										"acetylcholine receptor protein, alpha chain precursor", "achra", "achr-1", "acra", "chrna1",
										"nicotinic cholinergic receptor alpha polypeptide", "chrna1", "achr", "achr prepeptide", "chrna",
										"neuronal nicotinic acetylcholine receptor alpha", "nicotinic acetylcholine recepter alpha-subunit");
		$synonym{$_} = "cftr" foreach ("cftr", "cystic fibrosis transmembrane conductance regulator, atp-binding cassette (sub-family c,
										member 7)", "mrp7", "cf", "abcc7", "abc35", "cftr", "cystic fibrosis transmembrane conductance",
										"cftr chloride channel");
		$synonym{$_} = "cnr1" foreach ("cnr1", "cannabinoid receptor 1 (brain)", "cnr", "cb1", "cb-r", "cann6", "cb>1<", "cb1k5", "cb1a",
										"central cannabinoid receptor", "cnr1", "cannabinoid receptor 1", "cb1", "cb1 cannabinoid receptor",
										"cb1 cannabinoid receptor", "cbr");
		$synonym{$_} = "crem" foreach ("crem", "camp responsive modulator", "crea", "camp-responsive element modulator, alpha isoform",
										"crem", "camp responsive element moderator", "camp responsive element modulator",
										"camp-responsive element moderator");
		$synonym{$_} = "edg1" foreach ("edg1", "endothelial differentiation, sphingolipid g-protein-coupled receptor, 1", "d1s3362", "edg-1",
										"1pb1", "s1p1", "ecgf1", "chedg1", "sphingosine 1-phosphate receptor edg1",
										"g protein-coupled sphingolipid receptor", "edg1");
		$synonym{$_} = "ghr" foreach ("ghr", "growth hormone receptor", "growth hormone receptor precursor", "gh receptor",
										"serum binding protein", "growth hormone receptor", "ghr", "growth hormone receptor precursor",
										"ghr", "bovine growth hormone receptor", "mature growth hormone receptor");
		$synonym{$_} = "pgk1" foreach ("pgk1", "phosphoglycerate kinase 1", "pgk-1", "primer recognition protein 2", "prp 2", "pgka",
										"pgk-1", "pgk-1", "phosphoglycerate kinase 1");
		$synonym{$_} = "plcb4" foreach ("plcb4", "phospholipase c, beta 4",
										"1-phosphatidylinositol-4,5-bisphosphate phosphodiesterase beta 4", "plc-beta-4",
										"phospholipase c-beta-4", "plcb4", "phospholipase c beta 4");
		$synonym{$_} = "pnoc" foreach ("pnoc", "prepronociceptin", "propronociceptin", "nociceptin precursor", "orphanin fq", "ppnoc", "ofq",
										"n/ofq", "n23k", "npnc1", "ofq/n", "proorphanin", "pnoc", "prepronociceptin",
										"nociceptin/orphanin fq precursor");
		$synonym{$_} = "prkci" foreach ("prkci", "protein kinase c, iota", "pkci", "dxs1179e", "npkc-iota", "protein kinase c iota",
										"prkci");
		$synonym{$_} = "prnp" foreach ("prnp", "(prion protein (p27-30) (creutzfeld-jakob disease, gerstmann-strausler-scheinker syndrome, fatal familial insomnia)",
										"cjd", "major prion protein precursor", "prp", "prp27-30", "prp33-35c", "ascr", "prip", "gss",
										"prn-i", "prn-p", "prpc", "prpsc", "sinc", "mgc26679", "cd230 antigen", "prion-related protein",
										"prion protein", "prp", "prnp", "prion protein precursor", "prion protein", "prnp", "prp",
										"prion protein prp", "prion protein precursor prp", "prp", "prp", "prp gene", "greater kudu prp",
										"major prion protein", "prion protein variant 110p", "prion protein variant 143r",
										"prion protein variant 240s", "prion protein variant 37v", "prion protein variant 37v/240s",
										"prion protein, prp", "prnp", "prmp");
		$synonym{$_} = "rag1" foreach ("rag1", "recombination activating gene 1", "v(d)j recombination activating protein 1", "rag-1",
										"recombination activating protein 1", "rag1", "rag-1", "recombination activating gene-1",
										"recombination activating gene 1", "recombination-activating gene 1", "rag1", "rag 1", "rag1");
		$synonym{$_} = "rag2" foreach ("rag2", "recombination activator protein 2", "recombination activating protein 2", "rag-2", "rag 2",
										"recombination activating protein", "recombination activating gene-2", "rag2",
										"recombination activating protein 2", "rag2", "rag2", "rag2", "rag-2", "rag-2 protein",
										"recombinase activating gene 2", "recombination activating protein 2");
		$synonym{$_} = "rbp3" foreach ("rbp3", "retinol-binding protein 3, interstitial", "interphotoreceptor retinoid-binding protein precursor",
										"irbp", "interstitial retinol-binding protein", "rbp-3", "interphotoreceptor retinoid binding protein",
										"irbp", "irbp", "interphotoreceptor retinoid-binding protein", "interphotorecepter retinoid binding protein",
										"interphotoreceptor retinoid binding protein (irbp)", "irbp mrna");
		$synonym{$_} = "tnf" foreach ("tnf", "tumor necrosis factor (tnf superfamily, member 2)", "tnfa", "dif", "tnfsf2",
										"tumor necrosis factor (cachectin)", "tumor necrosis factor, alpha (cachectin)",
										"tumor necrosis factor", "tnf-alpha", "cachectin", "tnfsf1a", "apc1 protein",
										"tnf, monocyte-derived", "tnf, macrophage-derived", "tnf superfamily, member 2",
										"tumor necrosis factor-alpha", "tumor necrosis factor alpha", "tnfa", "tnf-alpha",
										"tumor necrosis factor alpha precursor", "tnfa", "tnfa", "tnfalpha", "tumour necrosis factor alpha",
										"tnf alpha", "tnf-a", "tumor necrosis factor alpha, tnf-alpha", "bovine tumor necrosis factor alpha",
										"tumor necrosis factor alpha (cachetin)", "tumor necrosis factor-alpha precursor");
		$synonym{$_} = "tp53" foreach ("tp53", "tumor protein p53 (li-fraumeni syndrome)", "p53", "cellular tumor antigen p53",
										"phosphoprotein p53", "trp53", "transformation related protein 53", "p53", "p53 protein", "p53",
										"tp53", "tumor suppressor p53", "53 kda phosphoprotein", "insulin recptor substrate p53 short form",
										"p53 gene product", "p53 tumor suppressor gene", "p53 tumor suppressor protein",
										"tumor suppressor p53 phosphoprotein");
		$synonym{$_} = "ttr" foreach ("ttr", "transthyretin (prealbumin, amyloidosis type i)", "palb", "tthy", "tbpa", "attr",
										"transthyretin", "transthyretin precursor", "transthyretin subunit", "ttr", "ttr");
		$synonym{$_} = "tyr" foreach ("tyr", "tyrosinase (oculocutaneous albinism ia)", "ocaia", "monophenol monooxygenase",
										"tumor rejection antigen ab", "sk29-ab", "lb24-ab", "albino", "c", "skc35", "oca1a", "tyrosinase",
										"tyr", "tyrosinase precursor", "truncated tyrosinase", "tyr", "tyr");
		$synonym{$_} = "vwf" foreach ("vwf", "von willebrand factor", "f8vwf", "coagulation factor viii", "von willebrand factor", "vwf",
										"von willebrand factor", "vwf", "vwf", "vwf", "von willebrand factor, vwf", "von willebrand factor precursor",
										"von willebrand factor precursor", "wf");
		$synonym{$_} = "zfx" foreach ("zfx", "zinc finger protein, x-linked", "zinc finger x-chromosomal protein", "zfx", "zinc finger protein zfx",
										"zfx", "zinc finger protein zfx", "x-linked zinc finger protein zfx", "zfx", "x-linked zinc finger protein",
										"zinc finger x-linked protein 1", "zinc finger x-linked protein 2", "zinc finger protein x linked",
										"zfx1", "zfx-1", "zfx2", "zfx-2","zfx protein mrna", "zinc finger protein zfx isoform 4",
										"zfx product, isoform 1", "zfx product, isoform 2", "zfx product, isoform 3",
										"zfx product, isoform 4", "zfx protein");
		$synonym{$_} = "zfy" foreach ("zfy", "zinc finger protein, y-linked", "zinc finger y-chromosomal protein", "zfy", "zfy",
										"zinc finger protein zfy", "zinc finger protein zfy", "y-linked zinc finger protein",
										"y-linked zinc finger protein zfy", "zinc finger protein y linked",
										"y-linked zinc finger protein 2", "y-linked zinc finger protein 1", "zfy1", "zfy-1", "zfy2",
										"zfy-2");

	# Additional genes
		$synonym{$_} = "18s_rrna" foreach ("18s rrna", "18s_rrna", "18s ribosomal rna", 
										"18 rrna", "18 s ribosomal rna", "18 s ribosomal rna", "18s small subunit ribosomal rna", 
										"18s_ribsomal_rna", "18s_ribosomal_rna_a-type", "18s_rna_gene", "18s_rrna", "nuclear_18s_ribosomal_rna");

		$synonym{$_} = "28s_rrna" foreach ("28s_large_subunit_ribosomal_rna", "28s rrna", "28s_rrna", "28s ribosomal rna", "28 rrna", "put. 28S ribosomal RNA", "28 s ribosomal rna", "28 s ribosomal rna",
										"rrna-28s_rrna-related", "28s ribosomal rna v region");
		$synonym{$_} = "5.8s" foreach ("5.8s_ribosomal_rna", "5.8s_rrna", "5s_ribosomal_rna","5s_rrna","5.8s_rna_gene");    
		$synonym{$_} = "mtcyb" foreach ("cob", "mtcyb", "cyt b", "cytb", "cytochrome b", "cytochrome b", "cytb", "cyt b", "cytb", "cytb",
										"'cytochrome b'", "cytochrome-b", "skeletal muscle cytochrome b", "cyt-b", "cyt.b", "cyto b",
										"cytob", "cytb1", "cytb2", "cytb3", "cytb4", "cytochrome b light strand", "cyt-b", "cytob",
										"cyt.b", "cytb", "cytochrome b", "mitochondrial cytochrome b", "cytochrome b protein",
										"duodenal cytochrome b");
		$synonym{$_} = "alpha_lactalbumin" foreach ("alpha lactalbumin", "alpha_lactalbumin", "alpha-lactalbumin", "alpla lactalbumin",
										"alpah lactalbumin", "a-lacta", "a-lacta", "lactalbumin, alpha-", "lactalbumin, alpha",
										"mature alpha-lactalbumin");
		$synonym{$_} = "alpha-s1-casein" foreach ("alpha-s1-casein", "as1-casein", "alpha s1 casein", "alpha s1-casein b",
										"alpha s1 casein", "alpha s1-casein", "alpha(s1) casein", "alpha-s1 casein", "alpha-s1 casein",
										"alpha-s1-casein", "alpha-s1-casein mrna", "alphas1-casein", "as1-casein", "alfas1-casein",
										"alpha-sl casein", "casein alpha s1");
		$synonym{$_} = "alpha-s2-casein" foreach ("alpha-s2-casein", "as2-casein", "alpha s2 casein", "alpha s2-casein b",
										"alpha s2 casein", "alpha s2-casein", "alpha(s2) casein", "alpha-s2 casein", "alpha-s2 casein",
										"alpha-s2-casein", "alpha-s2-casein mrna", "alphas2-casein", "as2-casein",
										"alpha s2a casein (aa 1 to 165)", "alpha s2a casein (aa 1 to 167)", "alph as2-casein",
										"alpha(s2)-casein");
		$synonym{$_} = "b-casein" foreach ("b-casein", "beta casein", "beta-casein", "beta-casein (aa 1 - 213)", "beta-casein a3",
										"beta-casein precursor", "beta casein precursor", "beta-casein variant a2",
										"beta-casein variant i", "mat. beta-casein", "casein beta", "casein b", "casein b (aa 1-183) pgpk48",
										"mature beta-casein (aa 1 to 226)");
		$synonym{$_} = "k-casein" foreach ("k-casein", "kappa casein", "kappa-casein", "kappa-casein precursor", "kappa casein precursor",
										"casein kappa", "kappa-cas", "kappa-casein long form", "kappa-casein mature peptide",
										"kappa-casein short form");
		$synonym{$_} = "sry" foreach ("sry", "sex determining factor sry", "sex determining region y protein", "sex-determining factor sry",
										"sex-determining region y", "sry gene", "sry protein", "sex region of y chromosome");
		$synonym{$_} = "c-mos" foreach ("c-mos", "oocyte maturation factor mos", "mos", "mos protein");
		$synonym{$_} = "cryaa" foreach ("cryaa", "crya1", "alpha a-crystallin","alpha a-crystallin", "alphaa-crystallin",
										"alpha-a crystallin chain", "alpha-a-crystallin", "crystallin, alpha a", "alphaa-crystallin (crya1)",
										"alpha a crystallin");
		$synonym{$_} = "cryab" foreach ("cryab", "crya2", "alpha b-crystallin","alpha b-crystallin", "alphab-crystallin",
										"alpha-b crystallin chain", "alpha-b-crystallin", "crystallin, alpha b", "alphab-crystallin (crya2)",
										"alpha b crystallin");
		$synonym{$_} = "t-cell_receptor_beta" foreach ("t-cell receptor beta", "t cell receptor beta", "t-cell receptor beta chain",
										"t cell receptor beta chain", "t-cell receptor beta chain variable region",
										"t cell receptor beta chain variable region", "t-cell receptor beta chain variable",
										"t cell receptor beta chain variable", "t-cell receptor variable beta chain",
										"t cell receptor variable beta chain", "t-cell receptor beta-chain",
										"t cell receptor beta-chain", "t-cell receptor beta chain vj region",
										"t cell receptor beta chain vj region", "t-cell receptor beta chain v-d-j region",
										"t cell receptor beta chain v-d-j region", "t-cell receptor beta chain variable segment",
										"t cell receptor beta chain variable segment", "t-cell receptor beta chain constant region",
										"t cell receptor beta chain constant region", "t-cell receptor v beta gene",
										"t cell receptor v beta gene", "t-cell receptor-beta chain", "t cell receptor-beta chain",
										"t cell receptor bata chain", "t-cell receptor beta-chain", "t cell receptor beta-chain",
										"t-cell receptor beta chain (v-d-j-c)", "t cell receptor beta chain (v-d-j-c)",
										"t-cell receptor beta-chain v region", "t cell receptor beta-chain v region",
										"t-cell receptor v region beta-chain", "t cell receptor v region beta-chain",
										"t-cell receptor beta chain vdj region", "t cell receptor beta chain vdj region",
										"t-cell receptor beta chain v-region", "t cell receptor beta chain v-region",
										"t-cell receptor v-beta", "t cell receptor v-beta", "t-cell_receptor_beta");
		$synonym{$_} = "t-cell_receptor_alpha" foreach ("t-cell receptor alpha", "t cell receptor alpha", "t-cell receptor alpha chain",
										"t cell receptor alpha chain", "t-cell receptor alpha chain variable region",
										"t cell receptor alpha chain variable region", "t-cell receptor alpha chain variable",
										"t cell receptor alpha chain variable", "t-cell receptor variable alpha chain",
										"t cell receptor variable alpha chain", "t-cell receptor alpha-chain",
										"t cell receptor alpha-chain", "t-cell receptor alpha chain vj region",
										"t cell receptor alpha chain vj region", "t-cell receptor alpha chain v-d-j region",
										"t cell receptor alpha chain v-d-j region", "t-cell receptor alpha chain variable segment",
										"t cell receptor alpha chain variable segment", "t-cell receptor alpha chain constant region",
										"t cell receptor alpha chain constant region", "t-cell receptor v alpha gene",
										"t cell receptor v alpha gene", "t-cell receptor-alpha chain", "t cell receptor-alpha chain",
										"t cell receptor bata chain", "t-cell receptor alpha-chain", "t cell receptor alpha-chain",
										"t-cell receptor alpha chain (v-d-j-c)", "t cell receptor alpha chain (v-d-j-c)",
										"t-cell receptor alpha-chain v region", "t cell receptor alpha-chain v region",
										"t-cell receptor v region alpha-chain", "t cell receptor v region alpha-chain",
										"t-cell receptor alpha chain vdj region", "t cell receptor alpha chain vdj region",
										"t-cell receptor alpha chain v-region", "t cell receptor alpha chain v-region",
										"t-cell receptor v-alpha", "t cell receptor v-alpha", "t-cell_receptor_alpha");

	# tRNAs
		my @aminoAcids = qw(gly ala val leu ile met phe trp pro ser thr cys tyr asn gln asp glu lys arg his);
		my %AAcodes = ('glycine' => 'gly', 'alanine' => 'ala', 'valine' => 'val', 'leucine' => 'leu', 'isoleucine' => 'ile',
							'methionine' => 'met', 'phenylalanine' => 'phe', 'tryptophan' => 'trp', 'proline' => 'pro',
							'serine' => 'ser', 'threonine' => 'thr', 'cysteine' => 'cys', 'tyrosine' => 'tyr', 'asparagine' => 'asn',
							'glutamine' => 'gln', 'aspartic acid' => 'asp', 'glutamic acid' => 'glu', 'lysine' => 'lys',
							'arginine' => 'arg', 'histidine' => 'his');
		my %fullName = ('gly' => 'glycine', 'ala' => 'alanine', 'val' => 'valine', 'leu' => 'leucine', 'ile' => 'isoleucine',
							'met' => 'methionine', 'phe' => 'phenylalanine', 'trp' => 'tryptophan', 'pro' => 'proline',
							'ser' => 'serine', 'thr' => 'threonine', 'cys' => 'cysteine', 'tyr' => 'tyrosine', 'asn' => 'asparagine',
							'gln' => 'glutamine', 'asp' => 'aspartic acid', 'glu' => 'glutamic acid', 'lys' => 'lysine',
							'arg' => 'arginine', 'his' => 'histidine');

		foreach my $aa (@aminoAcids)
			{
			# Abbreviations
				$synonym{"trna $aa"} = "trna-$aa";
				$synonym{"transfer rna $aa"} = "trna-$aa";
				$synonym{"transfer rna-$aa"} = "trna-$aa";
				$synonym{"trna($aa)"} = "trna-$aa";
				$synonym{"trna $aa gene"} = "trna-$aa";
				$synonym{"trna-$aa gene"} = "trna-$aa";
				$synonym{"$aa trna"} = "trna-$aa";
				
				my $dashedName = $aa."-trna";
					$synonym{"$dashedName"} = "trna-$aa";
			
			# And again for full names
				$synonym{"trna $fullName{$aa}"} = "trna-$aa";
				$synonym{"transfer rna $fullName{$aa}"} = "trna-$aa";
				$synonym{"transfer rna-$fullName{$aa}"} = "trna-$aa";
				$synonym{"trna-$fullName{$aa}"} = "trna-$aa";
				$synonym{"trna($fullName{$aa})"} = "trna-$aa";
				$synonym{"trna $fullName{$aa} gene"} = "trna-$aa";
				$synonym{"trna-$fullName{$aa} gene"} = "trna-$aa";
				$synonym{"$fullName{$aa} trna"} = "trna-$aa";
				
				$dashedName = $fullName{$aa}."-trna";
				$synonym{"$dashedName"} = "trna-$aa";
			}
	}

sub geneClean
	{
	my $dirtyGene = shift;

	# Initial synonymizing
		$dirtyGene = lc($dirtyGene);	# Only work with lower-case for synonymizing and file names
		$dirtyGene = $synonym{$dirtyGene} if (defined $synonym{$dirtyGene});

	# Clean end of gene name
		$dirtyGene =~ s/\"$//g;
		$dirtyGene =~ s/\s+$//g;
		$dirtyGene =~ s/^\s+//g;
		
	# Remove punctuation that will confound file saving
		$dirtyGene =~ s/^'//g;
		$dirtyGene =~ s/'$//;
		$dirtyGene =~ s/:/ /g;
		$dirtyGene =~ s/, / /g;
		$dirtyGene =~ s/,//g;
		$dirtyGene =~ s/\*/-/g;
		$dirtyGene =~ s/\$/-/g;
		$dirtyGene =~ s/\#/-/g;
		$dirtyGene =~ s/\&/and/g;
		$dirtyGene =~ s/\//-/g;
		$dirtyGene =~ s/\\//g;
		$dirtyGene =~ s/\|/-/g;
		$dirtyGene =~ s/;//g;
		$dirtyGene =~ s/\<//g;
		$dirtyGene =~ s/\>//g;
		$dirtyGene =~ s/-+/-/g;
		$dirtyGene =~ s/\s+-/-/g;
		$dirtyGene =~ s/-\s+/-/g;
		$dirtyGene =~ s/^-+//;
		$dirtyGene =~ s/`/'/;
		
	# Collapse multiple whitespace
		$dirtyGene =~ s/\s+/_/g;
	
	# Clean up some tRNA variants (easier than specifying explicit synonyms for each tRNA)
		if ($dirtyGene =~ /tRNA/i)
			{
			$dirtyGene =~ s/_\d+$//;
			$dirtyGene =~ s/ \d+$//;
			}
	
	# Final synonymizing
		$dirtyGene = $synonym{$dirtyGene} if (defined $synonym{$dirtyGene});	# Recheck for synonym
		$dirtyGene = lc($dirtyGene);	# Ensure that gene names are in lower case for further synonymizing and file saving (final safety)
	
	return $dirtyGene;
	}

sub geneCount
	{
	# Set local parameters
		my $gbFile = shift;

		my (%iGene, @geneList, $speciesNum, %quickSpeciesGenePresent, %speciesCounter);
		my $wordBlock = 1;
		my $modelBlock = 0;
			my %modelSpecies;
		my $stripCount = 0;

	print "\nCounting gene occurrences in file $gbFile to establish genes that do not meet user-defined threshold(s)\n";

	# Search input file
		setLineBreak($gbFile);
		open (GENE, "<$gbFile") or die "Cannot open GenBank output file, $gbFile.\n";
			my $accCount = 0;
			my $modelFlag = 0;
			my $speciesFlag = 0;
			my $nameFlag = 0;
			my $countZero = time;
			my ($organism, $gene);
			while (<GENE>)
				{
				next unless ($_ =~ /LOCUS/ or $_ =~ /^\s*(\/gene)|(\/product)=\"/ or $_ =~ /^\s*\/organism=\"/ or $nameFlag == 1);
				chomp;
				my $gbLine = $_;
		
				if ($gbLine =~ /LOCUS/)
					{
					undef %iGene;
					undef $organism;
					$modelFlag = $speciesFlag = $nameFlag = 0;
					$accCount++;
					print "\t$accCount sequences read in\n" if ($accCount == int($accCount/10000)*10000 and $verbose);
					next;
					}
	
				# Get organism name
					if ($gbLine =~ /^\s*\/organism=\"(.+)\"/)
						{
						$organism = $1;
							$organism =~ s/\s/_/g;
						$modelFlag = 1 if (defined $modelSpecies{$organism});
						$speciesFlag = 1 if ($organism =~ /sp\./ or $organism =~ /cf\./);
						unless ($speciesFlag or ($modelFlag and $modelBlock))
							{
							$speciesCounter{$organism}++;
							$speciesNum++ if ($speciesCounter{$organism} == 1);
							}
						}
					next if ($modelFlag and $modelBlock);	# Entry pertains to model organism; skip parsing rest of entry
						
				# Get gene name
					if ($gbLine =~ /^\s*(\/gene)|(\/product)=\"(.+)/)	# Get start of gene name
						{
						$gene = $1 if ($gbLine =~ /=\"(.+)/);
						
						# Check whether name wraps onto a new line
							if (substr($gbLine, -1) ne "\"")
								{
								$nameFlag = 1;
								next;
								}
							else
								{
								$nameFlag = 2;
								$gene =~ s/\"$// if ($gene =~ /\"$/);
								}
						}
	
					if ($nameFlag == 1)	# Get continuation / end of gene name
						{
						$gene .= $gbLine;
						$nameFlag = 2 if ($gbLine =~ /\"$/)	# Gene name is complete
						}
	
					if ($gene and $nameFlag == 2)	# Process complete gene
						{
						next if (not defined $organism);
						unless ($wordBlock and ($gene =~ /hypothetical/i or $gene =~ /open reading frame/i or $gene =~ /similar/i or $gene =~ /homolog/i or $gene =~ /putative/i or $gene =~ /unknown/i or $gene =~ /unnamed/i or $gene =~ /\d+rik/i or $gene =~ /possible/i or $gene =~ /pseudo/i))
							{
							$gene = geneClean($gene);	# Clean up and synonymize gene name
		
							# Increment counters
								$iGene{$gene}++;	# Sequence counter
									$quickSequenceCount{$gene}++ if ($iGene{$gene} == 1);
	
								$quickSpeciesGenePresent{$gene}{$organism}++;	# Species-gene counter
									$quickSpeciesCount{$gene}++ if ($quickSpeciesGenePresent{$gene}{$organism} == 1);
							}
						$nameFlag = 0;
						undef $gene;
						}
				}
		close GENE;
		print "\n" if ($verbose);
		my $countTime = time - $countZero;
	
	# Print results and block genes as appropriate
		open (OUT, ">$countFile") or die "Cannot open results file for quick gene count, $countFile.\n";
			if ($speciesThreshold)
				{
				@geneList = sort {$quickSpeciesCount{$b} <=> $quickSpeciesCount{$a} } keys %quickSpeciesCount;
				}
			else
				{
				@geneList = sort {$quickSequenceCount{$b} <=> $quickSequenceCount{$a} } keys %quickSequenceCount;
				}

			printf OUT "Searching $accCount sequences took $countTime seconds with %s distinct genes found\n", scalar(@geneList);
				print OUT "\tModel organisms were excluded from search\n" if ($modelBlock);
				print OUT "\tCommon words were excluded from search\n" if ($wordBlock);
				print OUT "\nGene\tNo. of sequences\tNo. of species\n\n";
			
			print "\tGene counts:\n" if ($debug);
			foreach my $entry (@geneList)
				{
				print OUT "$entry\t$quickSequenceCount{$entry}\t$quickSpeciesCount{$entry}\n";
				print "\t$entry\t$quickSequenceCount{$entry}\t$quickSpeciesCount{$entry}\n" if ($debug);
				if ($quickSequenceCount{$entry} < $seqThreshold or $quickSpeciesCount{$entry} < $speciesThreshold)
					{
					$geneStatus{$entry} = "rejected";
					push @rejectList, $entry;
					}
				else
					{
					$stripCount++;
					}
				}
		close OUT;
		
		printf "\tSearching $accCount sequences took $countTime seconds with %s distinct genes found; results written to $countFile\n", scalar(@geneList);
		if (not $stripCount)
			{
			print "No genes were present more than the threshold value(s)\n";
			exit(0);
			}
	}

# Version history
#
#	v2.0 (July 25, 2005)
#		- improved and more complete parsing of GenBank files (esp. of gene names or CDs covering multiple lines,
#		  join statements and other multi-segment entries, detection of pseudogenes)
#		- added ability to mine input file for all gene content
#		- added ability to strip only those genes present for 1) a minimum number of sequences and/or species and
#		  2) species with valid names only
#		- added (incomplete) gene synonymy list (initially compiled by Robin Beck)
#		- minor bug fixes
#
#	v1.0.1a (April 29, 2004)
#		- added automatic detection of line breaks
#		- minor bug fixes
#
#	v1.0 (April 30, 2002)
#		- initial release