Mercurial > repos > dereeper > pangenome_explorer
comparison PanExplorer_workflow/Perl/bp_genbank2gff3.pl @ 1:032f6b3806a3 draft
Uploaded
| author | dereeper |
|---|---|
| date | Thu, 30 May 2024 11:16:08 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:3cbb01081cde | 1:032f6b3806a3 |
|---|---|
| 1 #!/opt/anaconda1anaconda2anaconda3/bin/perl | |
| 2 | |
| 3 eval 'exec /opt/anaconda1anaconda2anaconda3/bin/perl -S $0 ${1+"$@"}' | |
| 4 if 0; # not running under some shell | |
| 5 | |
| 6 | |
| 7 | |
| 8 =pod | |
| 9 | |
| 10 =head1 NAME | |
| 11 | |
| 12 bp_genbank2gff3.pl -- Genbank-E<gt>gbrowse-friendly GFF3 | |
| 13 | |
| 14 =head1 SYNOPSIS | |
| 15 | |
| 16 bp_genbank2gff3.pl [options] filename(s) | |
| 17 | |
| 18 # process a directory containing GenBank flatfiles | |
| 19 perl bp_genbank2gff3.pl --dir path_to_files --zip | |
| 20 | |
| 21 # process a single file, ignore explicit exons and introns | |
| 22 perl bp_genbank2gff3.pl --filter exon --filter intron file.gbk.gz | |
| 23 | |
| 24 # process a list of files | |
| 25 perl bp_genbank2gff3.pl *gbk.gz | |
| 26 | |
| 27 # process data from URL, with Chado GFF model (-noCDS), and pipe to database loader | |
| 28 curl ftp://ftp.ncbi.nih.gov/genomes/Saccharomyces_cerevisiae/CHR_X/NC_001142.gbk \ | |
| 29 | perl bp_genbank2gff3.pl -noCDS -in stdin -out stdout \ | |
| 30 | perl gmod_bulk_load_gff3.pl -dbname mychado -organism fromdata | |
| 31 | |
| 32 Options: | |
| 33 --noinfer -r don't infer exon/mRNA subfeatures | |
| 34 --conf -i path to the curation configuration file that contains user preferences | |
| 35 for Genbank entries (must be YAML format) | |
| 36 (if --manual is passed without --ini, user will be prompted to | |
| 37 create the file if any manual input is saved) | |
| 38 --sofile -l path to to the so.obo file to use for feature type mapping | |
| 39 (--sofile live will download the latest online revision) | |
| 40 --manual -m when trying to guess the proper SO term, if more than | |
| 41 one option matches the primary tag, the converter will | |
| 42 wait for user input to choose the correct one | |
| 43 (only works with --sofile) | |
| 44 --dir -d path to a list of genbank flatfiles | |
| 45 --outdir -o location to write GFF files (can be 'stdout' or '-' for pipe) | |
| 46 --zip -z compress GFF3 output files with gzip | |
| 47 --summary -s print a summary of the features in each contig | |
| 48 --filter -x genbank feature type(s) to ignore | |
| 49 --split -y split output to separate GFF and fasta files for | |
| 50 each genbank record | |
| 51 --nolump -n separate file for each reference sequence | |
| 52 (default is to lump all records together into one | |
| 53 output file for each input file) | |
| 54 --ethresh -e error threshold for unflattener | |
| 55 set this high (>2) to ignore all unflattener errors | |
| 56 --[no]CDS -c Keep CDS-exons, or convert to alternate gene-RNA-protein-exon | |
| 57 model. --CDS is default. Use --CDS to keep default GFF gene model, | |
| 58 use --noCDS to convert to g-r-p-e. | |
| 59 --format -f Input format (SeqIO types): GenBank, Swiss or Uniprot, EMBL work | |
| 60 (GenBank is default) | |
| 61 --GFF_VERSION 3 is default, 2 and 2.5 and other Bio::Tools::GFF versions available | |
| 62 --quiet don't talk about what is being processed | |
| 63 --typesource SO sequence type for source (e.g. chromosome; region; contig) | |
| 64 --help -h display this message | |
| 65 | |
| 66 | |
| 67 =head1 DESCRIPTION | |
| 68 | |
| 69 This script uses Bio::SeqFeature::Tools::Unflattener and | |
| 70 Bio::Tools::GFF to convert GenBank flatfiles to GFF3 with gene | |
| 71 containment hierarchies mapped for optimal display in gbrowse. | |
| 72 | |
| 73 The input files are assumed to be gzipped GenBank flatfiles for refseq | |
| 74 contigs. The files may contain multiple GenBank records. Either a | |
| 75 single file or an entire directory can be processed. By default, the | |
| 76 DNA sequence is embedded in the GFF but it can be saved into separate | |
| 77 fasta file with the --split(-y) option. | |
| 78 | |
| 79 If an input file contains multiple records, the default behaviour is | |
| 80 to dump all GFF and sequence to a file of the same name (with .gff | |
| 81 appended). Using the 'nolump' option will create a separate file for | |
| 82 each genbank record. Using the 'split' option will create separate | |
| 83 GFF and Fasta files for each genbank record. | |
| 84 | |
| 85 | |
| 86 =head2 Notes | |
| 87 | |
| 88 =head3 'split' and 'nolump' produce many files | |
| 89 | |
| 90 In cases where the input files contain many GenBank records (for | |
| 91 example, the chromosome files for the mouse genome build), a very | |
| 92 large number of output files will be produced if the 'split' or | |
| 93 'nolump' options are selected. If you do have lists of files E<gt> 6000, | |
| 94 use the --long_list option in bp_bulk_load_gff.pl or | |
| 95 bp_fast_load_gff.pl to load the gff and/ or fasta files. | |
| 96 | |
| 97 =head3 Designed for RefSeq | |
| 98 | |
| 99 This script is designed for RefSeq genomic sequence entries. It may | |
| 100 work for third party annotations but this has not been tested. | |
| 101 But see below, Uniprot/Swissprot works, EMBL and possibly EMBL/Ensembl | |
| 102 if you don't mind some gene model unflattener errors (dgg). | |
| 103 | |
| 104 =head3 G-R-P-E Gene Model | |
| 105 | |
| 106 Don Gilbert worked this over with needs to produce GFF3 suited to | |
| 107 loading to GMOD Chado databases. Most of the changes I believe are | |
| 108 suited for general use. One main chado-specific addition is the | |
| 109 --[no]cds2protein flag | |
| 110 | |
| 111 My favorite GFF is to set the above as ON by default (disable with --nocds2prot) | |
| 112 For general use it probably should be OFF, enabled with --cds2prot. | |
| 113 | |
| 114 This writes GFF with an alternate, but useful Gene model, | |
| 115 instead of the consensus model for GFF3 | |
| 116 | |
| 117 [ gene > mRNA> (exon,CDS,UTR) ] | |
| 118 | |
| 119 This alternate is | |
| 120 | |
| 121 gene > mRNA > polypeptide > exon | |
| 122 | |
| 123 means the only feature with dna bases is the exon. The others | |
| 124 specify only location ranges on a genome. Exon of course is a child | |
| 125 of mRNA and protein/peptide. | |
| 126 | |
| 127 The protein/polypeptide feature is an important one, having all the | |
| 128 annotations of the GenBank CDS feature, protein ID, translation, GO | |
| 129 terms, Dbxrefs to other proteins. | |
| 130 | |
| 131 UTRs, introns, CDS-exons are all inferred from the primary exon bases | |
| 132 inside/outside appropriate higher feature ranges. Other special gene | |
| 133 model features remain the same. | |
| 134 | |
| 135 Several other improvements and bugfixes, minor but useful are included | |
| 136 | |
| 137 * IO pipes now work: | |
| 138 curl ftp://ncbigenomes/... | bp_genbank2gff3 --in stdin --out stdout | gff2chado ... | |
| 139 | |
| 140 * GenBank main record fields are added to source feature, e.g. organism, date, | |
| 141 and the sourcetype, commonly chromosome for genomes, is used. | |
| 142 | |
| 143 * Gene Model handling for ncRNA, pseudogenes are added. | |
| 144 | |
| 145 * GFF header is cleaner, more informative. | |
| 146 --GFF_VERSION flag allows choice of v2 as well as default v3 | |
| 147 | |
| 148 * GFF ##FASTA inclusion is improved, and | |
| 149 CDS translation sequence is moved to FASTA records. | |
| 150 | |
| 151 * FT -> GFF attribute mapping is improved. | |
| 152 | |
| 153 * --format choice of SeqIO input formats (GenBank default). | |
| 154 Uniprot/Swissprot and EMBL work and produce useful GFF. | |
| 155 | |
| 156 * SeqFeature::Tools::TypeMapper has a few FT -> SOFA additions | |
| 157 and more flexible usage. | |
| 158 | |
| 159 =head1 TODO | |
| 160 | |
| 161 =head2 Are these additions desired? | |
| 162 | |
| 163 * filter input records by taxon (e.g. keep only organism=xxx or taxa level = classYYY | |
| 164 * handle Entrezgene, other non-sequence SeqIO structures (really should change | |
| 165 those parsers to produce consistent annotation tags). | |
| 166 | |
| 167 =head2 Related bugfixes/tests | |
| 168 | |
| 169 These items from Bioperl mail were tested (sample data generating | |
| 170 errors), and found corrected: | |
| 171 | |
| 172 From: Ed Green <green <at> eva.mpg.de> | |
| 173 Subject: genbank2gff3.pl on new human RefSeq | |
| 174 Date: 2006-03-13 21:22:26 GMT | |
| 175 -- unspecified errors (sample data works now). | |
| 176 | |
| 177 From: Eric Just <e-just <at> northwestern.edu> | |
| 178 Subject: genbank2gff3.pl | |
| 179 Date: 2007-01-26 17:08:49 GMT | |
| 180 -- bug fixed in genbank2gff3 for multi-record handling | |
| 181 | |
| 182 This error is for a /trans_splice gene that is hard to handle, and | |
| 183 unflattner/genbank2 doesn't | |
| 184 | |
| 185 From: Chad Matsalla <chad <at> dieselwurks.com> | |
| 186 Subject: genbank2gff3.PLS and the unflatenner - Inconsistent order? | |
| 187 Date: 2005-07-15 19:51:48 GMT | |
| 188 | |
| 189 =head1 AUTHOR | |
| 190 | |
| 191 Sheldon McKay (mckays@cshl.edu) | |
| 192 | |
| 193 Copyright (c) 2004 Cold Spring Harbor Laboratory. | |
| 194 | |
| 195 =head2 AUTHOR of hacks for GFF2Chado loading | |
| 196 | |
| 197 Don Gilbert (gilbertd@indiana.edu) | |
| 198 | |
| 199 | |
| 200 =cut | |
| 201 | |
| 202 use strict; | |
| 203 use warnings; | |
| 204 | |
| 205 use lib "$ENV{HOME}/bioperl-live"; | |
| 206 # chad put this here to enable situations when this script is tested | |
| 207 # against bioperl compiled into blib along with other programs using blib | |
| 208 BEGIN { | |
| 209 unshift(@INC,'blib/lib'); | |
| 210 }; | |
| 211 use Pod::Usage; | |
| 212 use Bio::Root::RootI; | |
| 213 use Bio::SeqIO; | |
| 214 use File::Spec; | |
| 215 use Bio::SeqFeature::Tools::Unflattener; | |
| 216 use Bio::SeqFeature::Tools::TypeMapper; | |
| 217 use Bio::SeqFeature::Tools::IDHandler; | |
| 218 use Bio::Location::SplitLocationI; | |
| 219 use Bio::Location::Simple; | |
| 220 use Bio::Tools::GFF; | |
| 221 use Getopt::Long; | |
| 222 use List::Util qw(first); | |
| 223 use Bio::OntologyIO; | |
| 224 use YAML qw(Dump LoadFile DumpFile); | |
| 225 use File::Basename; | |
| 226 | |
| 227 use vars qw/$split @filter $zip $outdir $help $ethresh | |
| 228 $ONTOLOGY %FEATURES %DESCENDANTS @RETURN $MANUAL @GFF_LINE_FEAT | |
| 229 $CONF $YAML $TYPE_MAP $SYN_MAP $noinfer $SO_FILE | |
| 230 $file @files $dir $summary $nolump | |
| 231 $source_type %proteinfa %exonpar $didheader $verbose $DEBUG $GFF_VERSION | |
| 232 $gene_id $rna_id $tnum $ncrna_id $rnum %method %id %seen/; | |
| 233 | |
| 234 use constant SO_URL => | |
| 235 'http://song.cvs.sourceforge.net/viewvc/*checkout*/song/ontology/so.obo'; | |
| 236 use constant ALPHABET => [qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)]; | |
| 237 use constant ALPHABET_TO_NUMBER => { | |
| 238 a => 0, b => 1, c => 2, d => 3, e => 4, f => 5, g => 6, h => 7, i => 8, | |
| 239 j => 9, k => 10, l => 11, m => 12, n => 13, o => 14, p => 15, q => 16, | |
| 240 r => 17, s => 18, t => 19, u => 20, v => 21, w => 22, x => 23, y => 24, | |
| 241 z => 25, | |
| 242 }; | |
| 243 use constant ALPHABET_DIVISOR => 26; | |
| 244 use constant GM_NEW_TOPLEVEL => 2; | |
| 245 use constant GM_NEW_PART => 1; | |
| 246 use constant GM_DUP_PART => 0; | |
| 247 use constant GM_NOT_PART => -1; | |
| 248 | |
| 249 # Options cycle in multiples of 2 because of left side/right side pairing. | |
| 250 # You can make this number odd, but displayed matches will still round up | |
| 251 use constant OPTION_CYCLE => 6; | |
| 252 | |
| 253 $GFF_VERSION = 3; # allow v2 ... | |
| 254 $verbose = 1; # right default? -nov to turn off | |
| 255 | |
| 256 # dgg: change the gene model to Gene/mRNA/Polypeptide/exons... | |
| 257 my $CDSkeep= 1; # default should be ON (prior behavior), see gene_features() | |
| 258 my $PROTEIN_TYPE = 'polypeptide'; # for noCDSkeep; | |
| 259 # protein = flybase chado usage; GMOD Perls use 'polypeptide' with software support | |
| 260 | |
| 261 my $FORMAT="GenBank"; # swiss ; embl; genbank ; ** guess from SOURCEID ** | |
| 262 my $SOURCEID= $FORMAT; # "UniProt" "GenBank" "EMBL" should work | |
| 263 # other Bio::SeqIO formats may work. TEST: EntrezGene < problematic tags; InterPro KEGG | |
| 264 | |
| 265 | |
| 266 my %TAG_MAP = ( | |
| 267 db_xref => 'Dbxref', | |
| 268 name => 'Name', | |
| 269 note => 'Note', # also pull GO: ids into Ontology_term | |
| 270 synonym => 'Alias', | |
| 271 symbol => 'Alias', # is symbol still used? | |
| 272 # protein_id => 'Dbxref', also seen Dbxref tags: EC_number | |
| 273 # translation: handled in gene_features | |
| 274 ); | |
| 275 | |
| 276 | |
| 277 $| = 1; | |
| 278 my $quiet= !$verbose; | |
| 279 my $ok= GetOptions( 'd|dir|input:s' => \$dir, | |
| 280 'z|zip' => \$zip, | |
| 281 'h|help' => \$help, | |
| 282 's|summary' => \$summary, | |
| 283 'r|noinfer' => \$noinfer, | |
| 284 'i|conf=s' => \$CONF, | |
| 285 'sofile=s' => \$SO_FILE, | |
| 286 'm|manual' => \$MANUAL, | |
| 287 'o|outdir|output:s'=> \$outdir, | |
| 288 'x|filter:s'=> \@filter, | |
| 289 'y|split' => \$split, | |
| 290 "ethresh|e=s"=>\$ethresh, | |
| 291 'c|CDS!' => \$CDSkeep, | |
| 292 'f|format=s' => \$FORMAT, | |
| 293 'typesource=s' => \$source_type, | |
| 294 'GFF_VERSION=s' => \$GFF_VERSION, | |
| 295 'quiet!' => \$quiet, # swap quiet to verbose | |
| 296 'DEBUG!' => \$DEBUG, | |
| 297 'n|nolump' => \$nolump); | |
| 298 | |
| 299 my $lump = 1 unless $nolump || $split; | |
| 300 $verbose= !$quiet; | |
| 301 | |
| 302 # look for help request | |
| 303 pod2usage(2) if $help || !$ok; | |
| 304 | |
| 305 # keep SOURCEID as-is and change FORMAT for SeqIO types; | |
| 306 # note SeqIO uses file.suffix to guess type; not useful here | |
| 307 $SOURCEID= $FORMAT; | |
| 308 $FORMAT = "swiss" if $FORMAT =~/UniProt|trembl/; | |
| 309 $verbose =1 if($DEBUG); | |
| 310 | |
| 311 # initialize handlers | |
| 312 my $unflattener = Bio::SeqFeature::Tools::Unflattener->new; # for ensembl genomes (-trust_grouptag=>1); | |
| 313 $unflattener->error_threshold($ethresh) if $ethresh; | |
| 314 $unflattener->verbose(1) if($DEBUG); | |
| 315 # $unflattener->group_tag('gene') if($FORMAT =~ /embl/i) ; #? ensembl only? | |
| 316 # ensembl parsing is still problematic, forget this | |
| 317 | |
| 318 my $tm = Bio::SeqFeature::Tools::TypeMapper->new; | |
| 319 my $idh = Bio::SeqFeature::Tools::IDHandler->new; | |
| 320 | |
| 321 # dgg | |
| 322 $source_type ||= "region"; # should really parse from FT.source contents below | |
| 323 | |
| 324 #my $FTSOmap = $tm->FT_SO_map(); | |
| 325 my $FTSOmap; | |
| 326 my $FTSOsynonyms; | |
| 327 | |
| 328 if (defined($SO_FILE) && $SO_FILE eq 'live') { | |
| 329 print "\nDownloading the latest SO file from ".SO_URL."\n\n"; | |
| 330 use LWP::UserAgent; | |
| 331 my $ua = LWP::UserAgent->new(timeout => 30); | |
| 332 my $request = HTTP::Request->new(GET => SO_URL); | |
| 333 my $response = $ua->request($request); | |
| 334 | |
| 335 | |
| 336 if ($response->status_line =~ /200/) { | |
| 337 use File::Temp qw/ tempfile /; | |
| 338 my ($fh, $fn) = tempfile(); | |
| 339 print $fh $response->content; | |
| 340 $SO_FILE = $fn; | |
| 341 } else { | |
| 342 print "Couldn't download SO file online...skipping validation.\n" | |
| 343 . "HTTP Status was " . $response->status_line . "\n" | |
| 344 and undef $SO_FILE | |
| 345 } | |
| 346 } | |
| 347 | |
| 348 if ($SO_FILE) { | |
| 349 | |
| 350 | |
| 351 my (%terms, %syn); | |
| 352 | |
| 353 my $parser = Bio::OntologyIO->new( -format => "obo", -file => $SO_FILE ); | |
| 354 $ONTOLOGY = $parser->next_ontology(); | |
| 355 | |
| 356 for ($ONTOLOGY->get_all_terms) { | |
| 357 my $feat = $_; | |
| 358 | |
| 359 $terms{$feat->name} = $feat->name; | |
| 360 #$terms{$feat->name} = $feat; | |
| 361 | |
| 362 my @syn = $_->each_synonym; | |
| 363 | |
| 364 push @{$syn{$_}}, $feat->name for @syn; | |
| 365 #push @{$syn{$_}}, $feat for @syn; | |
| 366 } | |
| 367 | |
| 368 $FTSOmap = \%terms; | |
| 369 $FTSOsynonyms = \%syn; | |
| 370 | |
| 371 my %hardTerms = %{ $tm->FT_SO_map() }; | |
| 372 map { $FTSOmap->{$_} ||= $hardTerms{$_} } keys %hardTerms; | |
| 373 | |
| 374 } else { | |
| 375 my %terms = %{ $tm->FT_SO_map() }; | |
| 376 while (my ($k,$v) = each %terms) { | |
| 377 $FTSOmap->{$k} = ref($v) ? shift @$v : $v; | |
| 378 } | |
| 379 } | |
| 380 | |
| 381 $TYPE_MAP = $FTSOmap; | |
| 382 $SYN_MAP = $FTSOsynonyms; | |
| 383 | |
| 384 | |
| 385 # #convert $FTSOmap undefined to valid SO : moved to TypeMapper->map_types( -undefined => "region") | |
| 386 | |
| 387 # stringify filter list if applicable | |
| 388 my $filter = join ' ', @filter if @filter; | |
| 389 | |
| 390 # determine input files | |
| 391 my $stdin=0; # dgg: let dir == stdin == '-' for pipe use | |
| 392 if ($dir && ($dir eq '-' || $dir eq 'stdin')) { | |
| 393 $stdin=1; $dir=''; @files=('stdin'); | |
| 394 | |
| 395 } elsif ( $dir ) { | |
| 396 if ( -d $dir ) { | |
| 397 opendir DIR, $dir or die "could not open $dir for reading: $!"; | |
| 398 @files = map { "$dir/$_";} grep { /\.gb.*/ } readdir DIR; | |
| 399 closedir DIR; | |
| 400 } | |
| 401 else { | |
| 402 die "$dir is not a directory\n"; | |
| 403 } | |
| 404 } | |
| 405 else { | |
| 406 @files = @ARGV; | |
| 407 $dir = ''; | |
| 408 } | |
| 409 | |
| 410 # we should have some files by now | |
| 411 pod2usage(2) unless @files; | |
| 412 | |
| 413 | |
| 414 my $stdout=0; # dgg: let outdir == stdout == '-' for pipe use | |
| 415 if($outdir && ($outdir eq '-' || $outdir eq 'stdout')) { | |
| 416 warn("std. output chosen: cannot split\n") if($split); | |
| 417 warn("std. output chosen: cannot zip\n") if($zip); | |
| 418 warn("std. output chosen: cannot nolump\n") if($nolump); | |
| 419 $stdout=1; $lump=1; $split= 0; $zip= 0; # unless we pipe stdout thru gzip | |
| 420 | |
| 421 } elsif ( $outdir && !-e $outdir ) { | |
| 422 mkdir($outdir) or die "could not create directory $outdir: $!\n"; | |
| 423 } | |
| 424 elsif ( !$outdir ) { | |
| 425 $outdir = $dir || '.'; | |
| 426 } | |
| 427 | |
| 428 for my $file ( @files ) { | |
| 429 # dgg ; allow 'stdin' / '-' input ? | |
| 430 chomp $file; | |
| 431 die "$! $file" unless($stdin || -e $file); | |
| 432 print "# Input: $file\n" if($verbose); | |
| 433 | |
| 434 my ($lump_fh, $lumpfa_fh, $outfile, $outfa); | |
| 435 if ($stdout) { | |
| 436 $lump_fh= *STDOUT; $lump="stdout$$"; | |
| 437 $outfa= "stdout$$.fa"; # this is a temp file ... see below | |
| 438 open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n"; | |
| 439 | |
| 440 } elsif ( $lump ) { | |
| 441 my ($vol,$dirs,$fileonly) = File::Spec->splitpath($file); | |
| 442 $lump = File::Spec->catfile($outdir, $fileonly.'.gff'); | |
| 443 ($outfa = $lump) =~ s/\.gff/\.fa/; | |
| 444 open $lump_fh, ">$lump" or die "Could not create a lump outfile called ($lump) because ($!)\n"; | |
| 445 open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n"; | |
| 446 | |
| 447 } | |
| 448 | |
| 449 # open input file, unzip if req'd | |
| 450 if ($stdin) { | |
| 451 *FH = *STDIN; | |
| 452 } elsif ( $file =~ /\.gz/ ) { | |
| 453 open FH, "gunzip -c $file |"; | |
| 454 } | |
| 455 else { | |
| 456 open FH, '<', $file; | |
| 457 } | |
| 458 | |
| 459 my $in = Bio::SeqIO->new(-fh => \*FH, -format => $FORMAT, -debug=>$DEBUG); | |
| 460 my $gffio = Bio::Tools::GFF->new( -noparse => 1, -gff_version => $GFF_VERSION ); | |
| 461 | |
| 462 while ( my $seq = $in->next_seq() ) { | |
| 463 my $seq_name = $seq->accession_number; | |
| 464 my $end = $seq->length; | |
| 465 my @to_print; | |
| 466 | |
| 467 # arrange disposition of GFF output | |
| 468 $outfile = $lump || File::Spec->catfile($outdir, $seq_name.'.gff'); | |
| 469 my $out; | |
| 470 | |
| 471 if ( $lump ) { | |
| 472 $outfile = $lump; | |
| 473 $out = $lump_fh; | |
| 474 } | |
| 475 else { | |
| 476 $outfile = File::Spec->catfile($outdir, $seq_name.'.gff'); | |
| 477 open $out, ">$outfile"; | |
| 478 } | |
| 479 | |
| 480 # filter out unwanted features | |
| 481 my $source_feat= undef; | |
| 482 my @source= filter($seq); $source_feat= $source[0]; | |
| 483 | |
| 484 ($source_type,$source_feat)= | |
| 485 getSourceInfo( $seq, $source_type, $source_feat ) ; | |
| 486 # always; here we build main prot $source_feat; # if @source; | |
| 487 | |
| 488 # abort if there are no features | |
| 489 warn "$seq_name has no features, skipping\n" and next | |
| 490 if !$seq->all_SeqFeatures; | |
| 491 | |
| 492 | |
| 493 $FTSOmap->{'source'} = $source_type; | |
| 494 ## $FTSOmap->{'CDS'}= $PROTEIN_TYPE; # handle this in gene_features | |
| 495 | |
| 496 # construct a GFF header | |
| 497 # add: get source_type from attributes of source feature? chromosome=X tag | |
| 498 # also combine 1st ft line here with source ft from $seq .. | |
| 499 my($header,$info)= gff_header($seq_name, $end, $source_type, $source_feat); | |
| 500 print $out $header; | |
| 501 print "# working on $info\n" if($verbose); | |
| 502 | |
| 503 # unflatten gene graphs, apply SO types, etc; this also does TypeMapper .. | |
| 504 unflatten_seq($seq); | |
| 505 | |
| 506 # Note that we use our own get_all_SeqFeatures function | |
| 507 # to rescue cloned exons | |
| 508 | |
| 509 @GFF_LINE_FEAT = (); | |
| 510 for my $feature ( get_all_SeqFeatures($seq) ) { | |
| 511 | |
| 512 my $method = $feature->primary_tag; | |
| 513 next if($SOURCEID =~/UniProt|swiss|trembl/i && $method ne $source_type); | |
| 514 | |
| 515 $feature->seq_id($seq->id) unless($feature->seq_id); | |
| 516 $feature->source_tag($SOURCEID); | |
| 517 | |
| 518 # dgg; need to convert some Genbank to GFF tags: note->Note; db_xref->Dbxref; | |
| 519 ## also, pull any GO:000 ids from /note tag and put into Ontology_term | |
| 520 maptags2gff($feature); | |
| 521 | |
| 522 # current gene name. The unflattened gene features should be in order so any | |
| 523 # exons, CDSs, etc that follow will belong to this gene | |
| 524 my $gene_name; | |
| 525 if ( $method eq 'gene' || $method eq 'pseudogene' ) { | |
| 526 @to_print= print_held($out, $gffio, \@to_print); | |
| 527 $gene_id = $gene_name= gene_name($feature); | |
| 528 } else { | |
| 529 $gene_name= gene_name($feature); | |
| 530 } | |
| 531 | |
| 532 #?? should gene_name from /locus_tag,/gene,/product,/transposon=xxx | |
| 533 # be converted to or added as Name=xxx (if not ID= or as well) | |
| 534 ## problematic: convert_to_name ($feature); # drops /locus_tag,/gene, tags | |
| 535 convert_to_name($feature); | |
| 536 | |
| 537 ## dgg: extended to protein|polypeptide | |
| 538 ## this test ($feature->has_tag('gene') ||) is not good: repeat_regions over genes | |
| 539 ## in yeast have that genbank tag; why? | |
| 540 ## these include pseudogene ... | |
| 541 | |
| 542 ## Note we also have mapped types to SO, so these RNA's are now transcripts: | |
| 543 # pseudomRNA => "pseudogenic_transcript", | |
| 544 # pseudotranscript" => "pseudogenic_transcript", | |
| 545 # misc_RNA=>'processed_transcript', | |
| 546 | |
| 547 warn "#at: $method $gene_id/$gene_name\n" if $DEBUG; | |
| 548 | |
| 549 if ( $method =~ /(gene|RNA|CDS|exon|UTR|protein|polypeptide|transcript)/ | |
| 550 || ( $gene_id && $gene_name eq $gene_id ) ) { | |
| 551 | |
| 552 my $action = gene_features($feature, $gene_id, $gene_name); # -1, 0, 1, 2 result | |
| 553 if ($action == GM_DUP_PART) { | |
| 554 # ignore, this is dupl. exon with new parent ... | |
| 555 | |
| 556 } elsif ($action == GM_NOT_PART) { | |
| 557 add_generic_id( $feature, $gene_name, "nocount"); | |
| 558 my $gff = $gffio->gff_string($feature); | |
| 559 push @GFF_LINE_FEAT, $feature; | |
| 560 #print $out "$gff\n" if $gff; | |
| 561 | |
| 562 } elsif ($action > 0) { | |
| 563 # hold off print because exon etc. may get 2nd, 3rd parents | |
| 564 @to_print= print_held($out, $gffio, \@to_print) if ($action == GM_NEW_TOPLEVEL); | |
| 565 push(@to_print, $feature); | |
| 566 } | |
| 567 | |
| 568 } | |
| 569 | |
| 570 # otherwise handle as generic feats with IDHandler labels | |
| 571 else { | |
| 572 add_generic_id( $feature, $gene_name, ""); | |
| 573 my $gff= $gffio->gff_string($feature); | |
| 574 push @GFF_LINE_FEAT, $feature; | |
| 575 #print $out "$gff\n" if $gff; | |
| 576 } | |
| 577 } | |
| 578 | |
| 579 # don't like doing this after others; do after each new gene id? | |
| 580 @to_print= print_held($out, $gffio, \@to_print); | |
| 581 | |
| 582 gff_validate(@GFF_LINE_FEAT); | |
| 583 | |
| 584 for my $feature (@GFF_LINE_FEAT) { | |
| 585 my $gff= $gffio->gff_string($feature); | |
| 586 print $out "$gff\n" if $gff; | |
| 587 } | |
| 588 | |
| 589 # deal with the corresponding DNA | |
| 590 my ($fa_out,$fa_outfile); | |
| 591 my $dna = $seq->seq; | |
| 592 if($dna || %proteinfa) { | |
| 593 $method{'RESIDUES'} += length($dna); | |
| 594 $dna =~ s/(\S{60})/$1\n/g; | |
| 595 $dna .= "\n"; | |
| 596 | |
| 597 if ($split) { | |
| 598 $fa_outfile = $outfile; | |
| 599 $fa_outfile =~ s/gff$/fa/; | |
| 600 open $fa_out, ">$fa_outfile" or die $!; | |
| 601 print $fa_out ">$seq_name\n$dna" if $dna; | |
| 602 foreach my $aid (sort keys %proteinfa) { | |
| 603 my $aa= delete $proteinfa{$aid}; | |
| 604 $method{'RESIDUES(tr)'} += length($aa); | |
| 605 $aa =~ s/(\S{60})/$1\n/g; | |
| 606 print $fa_out ">$aid\n$aa\n"; | |
| 607 } | |
| 608 | |
| 609 } | |
| 610 else { | |
| 611 ## problem here when multiple GB Seqs in one file; all FASTA needs to go at end of $out | |
| 612 ## see e.g. Mouse: mm_ref_chr19.gbk has NT_082868 and NT_039687 parts in one .gbk | |
| 613 ## maybe write this to temp .fa then cat to end of lumped gff $out | |
| 614 print $lumpfa_fh ">$seq_name\n$dna" if $dna; | |
| 615 foreach my $aid (sort keys %proteinfa) { | |
| 616 my $aa= delete $proteinfa{$aid}; | |
| 617 $method{'RESIDUES(tr)'} += length($aa); | |
| 618 $aa =~ s/(\S{60})/$1\n/g; | |
| 619 print $lumpfa_fh ">$aid\n$aa\n"; | |
| 620 } | |
| 621 } | |
| 622 | |
| 623 %proteinfa=(); | |
| 624 } | |
| 625 | |
| 626 if ( $zip && !$lump ) { | |
| 627 system "gzip -f $outfile"; | |
| 628 system "gzip -f $fa_outfile" if($fa_outfile); | |
| 629 $outfile .= '.gz'; | |
| 630 $fa_outfile .= '.gz' if $split; | |
| 631 } | |
| 632 | |
| 633 # print "\n>EOF\n" if($stdout); #?? need this if summary goes to stdout after FASTA | |
| 634 print "# GFF3 saved to $outfile" unless( !$verbose || $stdout || $lump); | |
| 635 print ($split ? "; DNA saved to $fa_outfile\n" : "\n") unless($stdout|| $lump); | |
| 636 | |
| 637 # dgg: moved to after all inputs; here it prints cumulative sum for each record | |
| 638 #if ( $summary ) { | |
| 639 # print "# Summary:\n# Feature\tCount\n# -------\t-----\n"; | |
| 640 # | |
| 641 # for ( keys %method ) { | |
| 642 # print "# $_ $method{$_}\n"; | |
| 643 # } | |
| 644 # print "# \n"; | |
| 645 # } | |
| 646 | |
| 647 } | |
| 648 | |
| 649 print "# GFF3 saved to $outfile\n" if( $verbose && $lump); | |
| 650 if ( $summary ) { | |
| 651 print "# Summary:\n# Feature\tCount\n# -------\t-----\n"; | |
| 652 for ( keys %method ) { | |
| 653 print "# $_ $method{$_}\n"; | |
| 654 } | |
| 655 print "# \n"; | |
| 656 } | |
| 657 | |
| 658 ## FIXME for piped output w/ split FA files ... | |
| 659 close($lumpfa_fh) if $lumpfa_fh; | |
| 660 if (!$split && $outfa && $lump_fh) { | |
| 661 print $lump_fh "##FASTA\n"; # GFF3 spec | |
| 662 open $lumpfa_fh, $outfa or warn "reading FA $outfa: $!"; | |
| 663 while( <$lumpfa_fh>) { | |
| 664 print $lump_fh $_; | |
| 665 } # is $lump_fh still open? | |
| 666 close($lumpfa_fh); | |
| 667 unlink($outfa); | |
| 668 } | |
| 669 | |
| 670 | |
| 671 if ( $zip && $lump ) { | |
| 672 system "gzip -f $lump"; | |
| 673 } | |
| 674 | |
| 675 close FH; | |
| 676 } | |
| 677 | |
| 678 | |
| 679 | |
| 680 | |
| 681 | |
| 682 sub typeorder { | |
| 683 return 1 if ($_[0] =~ /gene/); | |
| 684 return 2 if ($_[0] =~ /RNA|transcript/); | |
| 685 return 3 if ($_[0] =~ /protein|peptide/); | |
| 686 return 4 if ($_[0] =~ /exon|CDS/); | |
| 687 return 3; # default before exon (smallest part) | |
| 688 } | |
| 689 | |
| 690 sub sort_by_feattype { | |
| 691 my($at,$bt)= ($a->primary_tag, $b->primary_tag); | |
| 692 return (typeorder($at) <=> typeorder($bt)) | |
| 693 or ($at cmp $bt); | |
| 694 ## or ($a->name() cmp $b->name()); | |
| 695 } | |
| 696 | |
| 697 sub print_held { | |
| 698 my($out,$gffio,$to_print)= @_; | |
| 699 return unless(@$to_print); | |
| 700 @$to_print = sort sort_by_feattype @$to_print; # put exons after mRNA, otherwise chado loader chokes | |
| 701 while ( my $feature = shift @$to_print) { | |
| 702 my $gff= $gffio->gff_string($feature); # $gff =~ s/\'/./g; # dang bug in encode | |
| 703 push @GFF_LINE_FEAT, $feature; | |
| 704 #print $out "$gff\n"; | |
| 705 } | |
| 706 return (); # @to_print | |
| 707 } | |
| 708 | |
| 709 sub maptags2gff { | |
| 710 my $f = shift; | |
| 711 ## should copy/move locus_tag to Alias, if not ID/Name/Alias already | |
| 712 # but see below /gene /locus_tag usage | |
| 713 foreach my $tag (keys %TAG_MAP) { | |
| 714 if ($f->has_tag($tag)) { | |
| 715 my $newtag= $TAG_MAP{$tag}; | |
| 716 my @v= $f->get_tag_values($tag); | |
| 717 $f->remove_tag($tag); | |
| 718 $f->add_tag_value($newtag,@v); | |
| 719 | |
| 720 ## also, pull any GO:000 ids from /note tag and put into Ontology_term | |
| 721 ## ncbi syntax in CDS /note is now '[goid GO:0005886]' OR '[goid 0005624]' | |
| 722 if ($tag eq 'note') { | |
| 723 map { s/\[goid (\d+)/\[goid GO:$1/g; } @v; | |
| 724 my @go= map { m/(GO:\d+)/g } @v; | |
| 725 $f->add_tag_value('Ontology_term',@go) if(@go); | |
| 726 } | |
| 727 | |
| 728 } | |
| 729 } | |
| 730 } | |
| 731 | |
| 732 | |
| 733 sub getSourceInfo { | |
| 734 my ($seq, $source_type, $sf) = @_; | |
| 735 | |
| 736 my $is_swiss= ($SOURCEID =~/UniProt|swiss|trembl/i); | |
| 737 my $is_gene = ($SOURCEID =~/entrezgene/i); | |
| 738 my $is_rich = (ref($seq) =~ /RichSeq/); | |
| 739 my $seq_name= $seq->accession_number(); | |
| 740 | |
| 741 unless($sf) { # make one | |
| 742 $source_type= $is_swiss ? $PROTEIN_TYPE | |
| 743 : $is_gene ? "eneg" # "gene" # "region" # | |
| 744 : $is_rich ? $seq->molecule : $source_type; | |
| 745 $sf = Bio::SeqFeature::Generic->direct_new(); | |
| 746 | |
| 747 my $len = $seq->length(); $len=1 if($len<1); my $start = 1; ##$start= $len if ($len<1); | |
| 748 my $loc= $seq->can('location') ? $seq->location() | |
| 749 : new Bio::Location::Simple( -start => $start, -end => $len); | |
| 750 $sf->location( $loc ); | |
| 751 $sf->primary_tag($source_type); | |
| 752 $sf->source_tag($SOURCEID); | |
| 753 $sf->seq_id( $seq_name); | |
| 754 #? $sf->display_name($seq->id()); ## Name or Alias ? | |
| 755 $sf->add_tag_value( Alias => $seq->id()); # unless id == accession | |
| 756 $seq->add_SeqFeature($sf); | |
| 757 ## $source_feat= $sf; | |
| 758 } | |
| 759 | |
| 760 if ($sf->has_tag("chromosome")) { | |
| 761 $source_type= "chromosome"; | |
| 762 my ($chrname) = $sf->get_tag_values("chromosome"); | |
| 763 ## PROBLEM with Name <> ID, RefName for Gbrowse; use Alias instead | |
| 764 ## e.g. Mouse chr 19 has two IDs in NCBI genbank now | |
| 765 $sf->add_tag_value( Alias => $chrname ); | |
| 766 } | |
| 767 | |
| 768 # pull GB Comment, Description for source ft ... | |
| 769 # add reference - can be long, not plain string... | |
| 770 warn "# $SOURCEID:$seq_name fields = ", join(",", $seq->annotation->get_all_annotation_keys()),"\n" if $DEBUG; | |
| 771 # GenBank fields: keyword,comment,reference,date_changed | |
| 772 # Entrezgene fields 850293 =ALIAS_SYMBOL,RefSeq status,chromosome,SGD,dblink,Entrez Gene Status,OntologyTerm,LOCUS_SYNONYM | |
| 773 | |
| 774 # is this just for main $seq object or for all seqfeatures ? | |
| 775 my %AnnotTagMap= ( | |
| 776 'gene_name' => 'Alias', | |
| 777 'ALIAS_SYMBOL' => 'Alias', # Entrezgene | |
| 778 'LOCUS_SYNONYM' => 'Alias', #? | |
| 779 'symbol' => 'Alias', | |
| 780 'synonym' => 'Alias', | |
| 781 'dblink' => 'Dbxref', | |
| 782 'product' => 'product', | |
| 783 'Reference' => 'reference', | |
| 784 'OntologyTerm' => 'Ontology_term', | |
| 785 'comment' => 'Note', | |
| 786 'comment1' => 'Note', | |
| 787 # various map-type locations | |
| 788 # gene accession tag is named per source db !?? | |
| 789 # 'Index terms' => keywords ?? | |
| 790 ); | |
| 791 | |
| 792 | |
| 793 my ($desc)= $seq->annotation->get_Annotations("desc") || ( $seq->desc() ); | |
| 794 my ($date)= $seq->annotation->get_Annotations("dates") | |
| 795 || $seq->annotation->get_Annotations("update-date") | |
| 796 || $is_rich ? $seq->get_dates() : (); | |
| 797 my ($comment)= $seq->annotation->get_Annotations("comment"); | |
| 798 my ($species)= $seq->annotation->get_Annotations("species"); | |
| 799 if (!$species | |
| 800 && $seq->can('species') | |
| 801 && defined $seq->species() | |
| 802 && $seq->species()->can('binomial') ) { | |
| 803 $species= $seq->species()->binomial(); | |
| 804 } | |
| 805 | |
| 806 # update source feature with main GB fields | |
| 807 $sf->add_tag_value( ID => $seq_name ) unless $sf->has_tag('ID'); | |
| 808 $sf->add_tag_value( Note => $desc ) if($desc && ! $sf->has_tag('Note')); | |
| 809 $sf->add_tag_value( organism => $species ) if($species && ! $sf->has_tag('organism')); | |
| 810 $sf->add_tag_value( comment1 => $comment ) if(!$is_swiss && $comment && ! $sf->has_tag('comment1')); | |
| 811 $sf->add_tag_value( date => $date ) if($date && ! $sf->has_tag('date')); | |
| 812 | |
| 813 $sf->add_tag_value( Dbxref => $SOURCEID.':'.$seq_name ) if $is_swiss || $is_gene; | |
| 814 | |
| 815 foreach my $atag (sort keys %AnnotTagMap) { | |
| 816 my $gtag= $AnnotTagMap{$atag}; next unless($gtag); | |
| 817 my @anno = map{ | |
| 818 if (ref $_ && $_->can('get_all_values')) { | |
| 819 split( /[,;] */, join ";", $_->get_all_values) | |
| 820 } | |
| 821 elsif (ref $_ && $_->can('display_text')) { | |
| 822 split( /[,;] */, $_->display_text) | |
| 823 } | |
| 824 elsif (ref $_ && $_->can('value')) { | |
| 825 split( /[,;] */, $_->value) | |
| 826 } | |
| 827 else { | |
| 828 (); | |
| 829 } | |
| 830 } $seq->annotation->get_Annotations($atag); | |
| 831 foreach(@anno) { $sf->add_tag_value( $gtag => $_ ); } | |
| 832 } | |
| 833 | |
| 834 #my @genes = map{ split( /[,;] */, "$_"); } $seq->annotation->get_Annotations('gene_name'); | |
| 835 #$sf->add_tag_value( Alias => $_ ) foreach(@genes); | |
| 836 # | |
| 837 #my @dblink= map { "$_"; } $seq->annotation->get_Annotations("dblink"); # add @all | |
| 838 #$sf->add_tag_value( Dbxref => $_ ) foreach(@dblink); | |
| 839 | |
| 840 return (wantarray)? ($source_type,$sf) : $source_type; #? | |
| 841 } | |
| 842 | |
| 843 | |
| 844 sub gene_features { | |
| 845 my ($f, $gene_id, $genelinkID) = @_; | |
| 846 local $_ = $f->primary_tag; | |
| 847 $method{$_}++; | |
| 848 | |
| 849 if ( /gene/ ) { | |
| 850 $f->add_tag_value( ID => $gene_id ) unless($f->has_tag('ID')); # check is same value!? | |
| 851 $tnum = $rnum= 0; $ncrna_id= $rna_id = ''; | |
| 852 return GM_NEW_TOPLEVEL; | |
| 853 | |
| 854 } elsif ( /mRNA/ ) { | |
| 855 return GM_NOT_PART unless $gene_id; | |
| 856 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); | |
| 857 ($rna_id = $gene_id ) =~ s/gene/mRNA/; | |
| 858 $rna_id .= '.t0' . ++$tnum; | |
| 859 $f->add_tag_value( ID => $rna_id ); | |
| 860 $f->add_tag_value( Parent => $gene_id ); | |
| 861 | |
| 862 } elsif ( /RNA|transcript/) { | |
| 863 ## misc_RNA here; missing exons ... flattener problem? | |
| 864 # all of {t,nc,sn}RNA can have gene models now | |
| 865 ## but problem in Worm chr: mRNA > misc_RNA > CDS with same locus tag | |
| 866 ## CDS needs to use mRNA, not misc_RNA, rna_id ... | |
| 867 ## also need to fix cases where tRNA,... lack a 'gene' parent: make this one top-level | |
| 868 | |
| 869 if($gene_id) { | |
| 870 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); | |
| 871 ($ncrna_id = $gene_id) =~ s/gene/ncRNA/; | |
| 872 $ncrna_id .= '.r0' . ++$rnum; | |
| 873 $f->add_tag_value( Parent => $gene_id ); | |
| 874 $f->add_tag_value( ID => $ncrna_id ); | |
| 875 } else { | |
| 876 unless ($f->has_tag('ID')) { | |
| 877 if($genelinkID) { | |
| 878 $f->add_tag_value( ID => $genelinkID ) ; | |
| 879 } else { | |
| 880 $idh->generate_unique_persistent_id($f); | |
| 881 } | |
| 882 } | |
| 883 ($ncrna_id)= $f->get_tag_values('ID'); | |
| 884 return GM_NEW_TOPLEVEL; | |
| 885 # this feat now acts as gene-top-level; need to print @to_print to flush prior exons? | |
| 886 } | |
| 887 | |
| 888 } elsif ( /exon/ ) { # can belong to any kind of RNA | |
| 889 return GM_NOT_PART unless ($rna_id||$ncrna_id); | |
| 890 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); | |
| 891 ## we are getting duplicate Parents here, which chokes chado loader, with reason... | |
| 892 ## problem is when mRNA and ncRNA have same exons, both ids are active, called twice | |
| 893 ## check all Parents | |
| 894 for my $expar ($rna_id, $ncrna_id) { | |
| 895 next unless($expar); | |
| 896 if ( $exonpar{$expar} and $f->has_tag('Parent') ) { | |
| 897 my @vals = $f->get_tag_values('Parent'); | |
| 898 next if (grep {$expar eq $_} @vals); | |
| 899 } | |
| 900 $exonpar{$expar}++; | |
| 901 $f->add_tag_value( Parent => $expar); | |
| 902 # last; #? could be both | |
| 903 } | |
| 904 # now we can skip cloned exons | |
| 905 # dgg note: multiple parents get added and printed for each unique exon | |
| 906 return GM_DUP_PART if ++$seen{$f} > 1; | |
| 907 | |
| 908 } elsif ( /CDS|protein|polypeptide/ ) { | |
| 909 return GM_NOT_PART unless $rna_id; ## ignore $ncrna_id ?? | |
| 910 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); #?? | |
| 911 (my $pro_id = $rna_id) =~ s/\.t/\.p/; | |
| 912 | |
| 913 if( ! $CDSkeep && /CDS/) { | |
| 914 $f->primary_tag($PROTEIN_TYPE); | |
| 915 | |
| 916 ## duplicate problem is Location .. | |
| 917 if ($f->location->isa("Bio::Location::SplitLocationI")) { | |
| 918 # my($b,$e)=($f->start, $f->end); # is this all we need? | |
| 919 my($b,$e)=(-1,0); | |
| 920 foreach my $l ($f->location->each_Location) { | |
| 921 $b = $l->start if($b<0 || $b > $l->start); | |
| 922 $e = $l->end if($e < $l->end); | |
| 923 } | |
| 924 $f->location( Bio::Location::Simple->new( | |
| 925 -start => $b, -end => $e, -strand => $f->strand) ); | |
| 926 } | |
| 927 | |
| 928 $f->add_tag_value( Derives_from => $rna_id ); | |
| 929 } | |
| 930 else { | |
| 931 $f->add_tag_value( Parent => $rna_id ); | |
| 932 } | |
| 933 | |
| 934 $f->add_tag_value( ID => $pro_id ); | |
| 935 | |
| 936 move_translation_fasta($f, $pro_id); | |
| 937 #if( $f->has_tag('translation')) { | |
| 938 # my ($aa) = $f->get_tag_values("translation"); | |
| 939 # $proteinfa{$pro_id}= $aa; | |
| 940 # $f->remove_tag("translation"); | |
| 941 # $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem | |
| 942 #} | |
| 943 } elsif ( /region/ ) { | |
| 944 $f->primary_tag('gene_component_region'); | |
| 945 $f->add_tag_value( Parent => $gene_id ); | |
| 946 } else { | |
| 947 return GM_NOT_PART unless $gene_id; | |
| 948 $f->add_tag_value( Parent => $gene_id ); | |
| 949 } | |
| 950 | |
| 951 ## return GM_DUP_PART if /exon/ && ++$seen{$f} > 1; | |
| 952 | |
| 953 return GM_NEW_PART; | |
| 954 } | |
| 955 | |
| 956 ## was generic_features > add_generic_id | |
| 957 sub add_generic_id { | |
| 958 my ($f, $ft_name, $flags) = @_; | |
| 959 my $method = $f->primary_tag; | |
| 960 $method{$method}++ unless($flags =~ /nocount/); ## double counts GM_NOT_PART from above | |
| 961 | |
| 962 if ($f->has_tag('ID')) { | |
| 963 | |
| 964 } | |
| 965 elsif ( $f->has_tag($method) ) { | |
| 966 my ($name) = $f->get_tag_values($method); | |
| 967 $f->add_tag_value( ID => "$method:$name" ); | |
| 968 } | |
| 969 elsif($ft_name) { # is this unique ? | |
| 970 $f->add_tag_value( ID => $ft_name ); | |
| 971 } | |
| 972 else { | |
| 973 $idh->generate_unique_persistent_id($f); | |
| 974 } | |
| 975 | |
| 976 move_translation_fasta( $f, ($f->get_tag_values("ID"))[0] ) | |
| 977 if($method =~ /CDS/); | |
| 978 | |
| 979 # return $io->gff_string($f); | |
| 980 } | |
| 981 | |
| 982 sub move_translation_fasta { | |
| 983 my ($f, $ft_id) = @_; | |
| 984 if( $ft_id && $f->has_tag('translation') ) { | |
| 985 my ($aa) = $f->get_tag_values("translation"); | |
| 986 if($aa && $aa !~ /^length/) { | |
| 987 $proteinfa{$ft_id}= $aa; | |
| 988 $f->remove_tag("translation"); | |
| 989 $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem | |
| 990 } | |
| 991 } | |
| 992 } | |
| 993 | |
| 994 sub gff_header { | |
| 995 my ($name, $end, $source_type, $source_feat) = @_; | |
| 996 $source_type ||= "region"; | |
| 997 | |
| 998 my $info = "$source_type:$name"; | |
| 999 my $head = "##gff-version $GFF_VERSION\n". | |
| 1000 "##sequence-region $name 1 $end\n". | |
| 1001 "# conversion-by bp_genbank2gff3.pl\n"; | |
| 1002 if ($source_feat) { | |
| 1003 ## dgg: these header comment fields are not useful when have multi-records, diff organisms | |
| 1004 for my $key (qw(organism Note date)) { | |
| 1005 my $value; | |
| 1006 if ($source_feat->has_tag($key)) { | |
| 1007 ($value) = $source_feat->get_tag_values($key); | |
| 1008 } | |
| 1009 if ($value) { | |
| 1010 $head .= "# $key $value\n"; | |
| 1011 $info .= ", $value"; | |
| 1012 } | |
| 1013 } | |
| 1014 $head = "" if $didheader; | |
| 1015 } else { | |
| 1016 $head .= "$name\t$SOURCEID\t$source_type\t1\t$end\t.\t.\t.\tID=$name\n"; | |
| 1017 } | |
| 1018 $didheader++; | |
| 1019 return (wantarray) ? ($head,$info) : $head; | |
| 1020 } | |
| 1021 | |
| 1022 sub unflatten_seq { | |
| 1023 my $seq = shift; | |
| 1024 | |
| 1025 ## print "# working on $source_type:", $seq->accession, "\n"; | |
| 1026 my $uh_oh = "Possible gene unflattening error with" . $seq->accession_number . | |
| 1027 ": consult STDERR\n"; | |
| 1028 | |
| 1029 eval { | |
| 1030 $unflattener->unflatten_seq( -seq => $seq, | |
| 1031 -noinfer => $noinfer, | |
| 1032 -use_magic => 1 ); | |
| 1033 }; | |
| 1034 | |
| 1035 # deal with unflattening errors | |
| 1036 if ( $@ ) { | |
| 1037 warn $seq->accession_number . " Unflattening error:\n"; | |
| 1038 warn "Details: $@\n"; | |
| 1039 print "# ".$uh_oh; | |
| 1040 } | |
| 1041 | |
| 1042 return 0 if !$seq || !$seq->all_SeqFeatures; | |
| 1043 | |
| 1044 # map feature types to the sequence ontology | |
| 1045 ## $tm->map_types_to_SO( -seq => $seq ); | |
| 1046 #$tm->map_types( -seq => $seq, -type_map => $FTSOmap, -undefined => "region" ); #dgg | |
| 1047 | |
| 1048 map_types( | |
| 1049 $tm, | |
| 1050 -seq => $seq, | |
| 1051 -type_map => $FTSOmap, | |
| 1052 -syn_map => $FTSOsynonyms, | |
| 1053 -undefined => "region" | |
| 1054 ); #nml | |
| 1055 | |
| 1056 } | |
| 1057 | |
| 1058 sub filter { | |
| 1059 my $seq = shift; | |
| 1060 ## return unless $filter; | |
| 1061 my @feats; | |
| 1062 my @sources; # dgg; pick source features here; only 1 always? | |
| 1063 if ($filter) { | |
| 1064 for my $f ( $seq->remove_SeqFeatures ) { | |
| 1065 my $m = $f->primary_tag; | |
| 1066 push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ? | |
| 1067 push @feats, $f unless $filter =~ /$m/i; | |
| 1068 } | |
| 1069 $seq->add_SeqFeature($_) foreach @feats; | |
| 1070 } else { | |
| 1071 for my $f ( $seq->get_SeqFeatures ){ | |
| 1072 my $m = $f->primary_tag; | |
| 1073 push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ? | |
| 1074 } | |
| 1075 } | |
| 1076 | |
| 1077 return @sources; | |
| 1078 } | |
| 1079 | |
| 1080 | |
| 1081 # The default behaviour of Bio::FeatureHolderI:get_all_SeqFeatures | |
| 1082 # changed to filter out cloned features. We have to implement the old | |
| 1083 # method. These two subroutines were adapted from the v1.4 Bio::FeatureHolderI | |
| 1084 sub get_all_SeqFeatures { | |
| 1085 my $seq = shift; | |
| 1086 my @flatarr; | |
| 1087 | |
| 1088 foreach my $feat ( $seq->get_SeqFeatures ){ | |
| 1089 push(@flatarr,$feat); | |
| 1090 _add_flattened_SeqFeatures(\@flatarr,$feat); | |
| 1091 } | |
| 1092 return @flatarr; | |
| 1093 } | |
| 1094 | |
| 1095 sub gene_name { | |
| 1096 my $g = shift; | |
| 1097 my $gene_id = ''; # zero it; | |
| 1098 | |
| 1099 if ($g->has_tag('locus_tag')) { | |
| 1100 ($gene_id) = $g->get_tag_values('locus_tag'); | |
| 1101 } | |
| 1102 | |
| 1103 elsif ($g->has_tag('gene')) { | |
| 1104 ($gene_id) = $g->get_tag_values('gene'); | |
| 1105 } | |
| 1106 elsif ($g->has_tag('ID')) { # for non-Genbank > Entrezgene | |
| 1107 ($gene_id) = $g->get_tag_values('ID'); | |
| 1108 } | |
| 1109 | |
| 1110 ## See Unflattener comment: | |
| 1111 # on rare occasions, records will have no /gene or /locus_tag | |
| 1112 # but it WILL have /product tags. These serve the same purpose | |
| 1113 # for grouping. For an example, see AY763288 (also in t/data) | |
| 1114 # eg. product=tRNA-Asp ; product=similar to crooked neck protein | |
| 1115 elsif ($g->has_tag('product')) { | |
| 1116 my ($name)= $g->get_tag_values('product'); | |
| 1117 ($gene_id) = $name unless($name =~ / /); # a description not name | |
| 1118 } | |
| 1119 | |
| 1120 ## dgg; also handle transposon=xxxx ID/name | |
| 1121 # ID=GenBank:repeat_region:NC_004353:1278337:1281302;transposon=HeT-A{}1685;Dbxref=FLYBASE:FBti0059746 | |
| 1122 elsif ($g->has_tag('transposon')) { | |
| 1123 my ($name)= $g->get_tag_values('transposon'); | |
| 1124 ($gene_id) = $name unless($name =~ / /); # a description not name | |
| 1125 } | |
| 1126 | |
| 1127 return $gene_id; | |
| 1128 } | |
| 1129 | |
| 1130 # same list as gene_name .. change tag to generic Name | |
| 1131 sub convert_to_name { | |
| 1132 my $g = shift; | |
| 1133 my $gene_id = ''; # zero it; | |
| 1134 | |
| 1135 if ($g->has_tag('gene')) { | |
| 1136 ($gene_id) = $g->get_tag_values('gene'); | |
| 1137 $g->remove_tag('gene'); | |
| 1138 $g->add_tag_value('Name', $gene_id); | |
| 1139 } | |
| 1140 elsif ($g->has_tag('locus_tag')) { | |
| 1141 ($gene_id) = $g->get_tag_values('locus_tag'); | |
| 1142 $g->remove_tag('locus_tag'); | |
| 1143 $g->add_tag_value('Name', $gene_id); | |
| 1144 } | |
| 1145 | |
| 1146 elsif ($g->has_tag('product')) { | |
| 1147 my ($name)= $g->get_tag_values('product'); | |
| 1148 ($gene_id) = $name unless($name =~ / /); # a description not name | |
| 1149 ## $g->remove_tag('product'); | |
| 1150 $g->add_tag_value('Name', $gene_id); | |
| 1151 } | |
| 1152 | |
| 1153 elsif ($g->has_tag('transposon')) { | |
| 1154 my ($name)= $g->get_tag_values('transposon'); | |
| 1155 ($gene_id) = $name unless($name =~ / /); # a description not name | |
| 1156 ## $g->remove_tag('transposon'); | |
| 1157 $g->add_tag_value('Name', $gene_id); | |
| 1158 } | |
| 1159 elsif ($g->has_tag('ID')) { | |
| 1160 my ($name)= $g->get_tag_values('ID'); | |
| 1161 $g->add_tag_value('Name', $name); | |
| 1162 } | |
| 1163 return $gene_id; | |
| 1164 } | |
| 1165 | |
| 1166 | |
| 1167 sub _add_flattened_SeqFeatures { | |
| 1168 my ($arrayref,$feat) = @_; | |
| 1169 my @subs = (); | |
| 1170 | |
| 1171 if ($feat->isa("Bio::FeatureHolderI")) { | |
| 1172 @subs = $feat->get_SeqFeatures; | |
| 1173 } | |
| 1174 elsif ($feat->isa("Bio::SeqFeatureI")) { | |
| 1175 @subs = $feat->sub_SeqFeature; | |
| 1176 } | |
| 1177 else { | |
| 1178 warn ref($feat)." is neither a FeatureHolderI nor a SeqFeatureI. ". | |
| 1179 "Don't know how to flatten."; | |
| 1180 } | |
| 1181 | |
| 1182 for my $sub (@subs) { | |
| 1183 push(@$arrayref,$sub); | |
| 1184 _add_flattened_SeqFeatures($arrayref,$sub); | |
| 1185 } | |
| 1186 | |
| 1187 } | |
| 1188 | |
| 1189 sub map_types { | |
| 1190 | |
| 1191 my ($self, @args) = @_; | |
| 1192 | |
| 1193 my($sf, $seq, $type_map, $syn_map, $undefmap) = | |
| 1194 $self->_rearrange([qw(FEATURE | |
| 1195 SEQ | |
| 1196 TYPE_MAP | |
| 1197 SYN_MAP | |
| 1198 UNDEFINED | |
| 1199 )], | |
| 1200 @args); | |
| 1201 | |
| 1202 if (!$sf && !$seq) { | |
| 1203 $self->throw("you need to pass in either -feature or -seq"); | |
| 1204 } | |
| 1205 | |
| 1206 my @sfs = ($sf); | |
| 1207 if ($seq) { | |
| 1208 $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI"); | |
| 1209 @sfs = $seq->get_all_SeqFeatures; | |
| 1210 } | |
| 1211 $type_map = $type_map || $self->typemap; # dgg: was type_map; | |
| 1212 foreach my $feat (@sfs) { | |
| 1213 | |
| 1214 $feat->isa("Bio::SeqFeatureI") || $self->throw("$feat NOT A SeqFeatureI"); | |
| 1215 $feat->isa("Bio::FeatureHolderI") || $self->throw("$feat NOT A FeatureHolderI"); | |
| 1216 | |
| 1217 my $primary_tag = $feat->primary_tag; | |
| 1218 | |
| 1219 #if ($primary_tag =~ /^pseudo(.*)$/) { | |
| 1220 # $primary_tag = $1; | |
| 1221 # $feat->primary_tag($primary_tag); | |
| 1222 #} | |
| 1223 | |
| 1224 my $mtype = $type_map->{$primary_tag}; | |
| 1225 if ($mtype) { | |
| 1226 if (ref($mtype)) { | |
| 1227 if (ref($mtype) eq 'ARRAY') { | |
| 1228 my $soID; | |
| 1229 ($mtype, $soID) = @$mtype; | |
| 1230 | |
| 1231 if ($soID && ref($ONTOLOGY)) { | |
| 1232 my ($term) = $ONTOLOGY->find_terms(-identifier => $soID); | |
| 1233 $mtype = $term->name if $term; | |
| 1234 } | |
| 1235 # if SO ID is undefined AND we have an ontology to search, we want to delete | |
| 1236 # the feature type hash entry in order to force a fuzzy search | |
| 1237 elsif (! defined $soID && ref($ONTOLOGY)) { | |
| 1238 undef $mtype; | |
| 1239 delete $type_map->{$primary_tag}; | |
| 1240 } | |
| 1241 elsif ($undefmap && $mtype eq 'undefined') { # dgg | |
| 1242 $mtype= $undefmap; | |
| 1243 } | |
| 1244 | |
| 1245 $type_map->{$primary_tag} = $mtype if $mtype; | |
| 1246 } | |
| 1247 elsif (ref($mtype) eq 'CODE') { | |
| 1248 $mtype = $mtype->($feat); | |
| 1249 } | |
| 1250 else { | |
| 1251 $self->throw('must be scalar or CODE ref'); | |
| 1252 } | |
| 1253 } | |
| 1254 elsif ($undefmap && $mtype eq 'undefined') { # dgg | |
| 1255 $mtype= $undefmap; | |
| 1256 } | |
| 1257 $feat->primary_tag($mtype); | |
| 1258 } | |
| 1259 | |
| 1260 if ($CONF) { | |
| 1261 conf_read(); | |
| 1262 my %perfect_matches; | |
| 1263 while (my ($p_tag,$rules) = each %$YAML) { | |
| 1264 RULE: | |
| 1265 for my $rule (@$rules) { | |
| 1266 for my $tags (@$rule) { | |
| 1267 while (my ($tag,$values) = each %$tags) { | |
| 1268 for my $value (@$values) { | |
| 1269 if ($feat->has_tag($tag)) { | |
| 1270 for ($feat->get_tag_values($tag)) { | |
| 1271 next RULE unless $_ =~ /\Q$value\E/; | |
| 1272 } | |
| 1273 } elsif ($tag eq 'primary_tag') { | |
| 1274 next RULE unless $value eq | |
| 1275 $feat->primary_tag; | |
| 1276 } elsif ($tag eq 'location') { | |
| 1277 next RULE unless $value eq | |
| 1278 $feat->start.'..'.$feat->end; | |
| 1279 } else { next RULE } | |
| 1280 } | |
| 1281 } | |
| 1282 } | |
| 1283 $perfect_matches{$p_tag}++; | |
| 1284 } | |
| 1285 } | |
| 1286 if (scalar(keys %perfect_matches) == 1) { | |
| 1287 $mtype = $_ for keys %perfect_matches; | |
| 1288 } elsif (scalar(keys %perfect_matches) > 1) { | |
| 1289 warn "There are conflicting rules in the config file for the" . | |
| 1290 " following types: "; | |
| 1291 warn "\t$_\n" for keys %perfect_matches; | |
| 1292 warn "Until conflict resolution is built into the converter," . | |
| 1293 " you will have to manually edit the config file to remove the" . | |
| 1294 " conflict. Sorry :(. Skipping user preference for this entry"; | |
| 1295 sleep(2); | |
| 1296 } | |
| 1297 } | |
| 1298 | |
| 1299 if ( ! $mtype && $syn_map) { | |
| 1300 if ($feat->has_tag('note')) { | |
| 1301 | |
| 1302 my @all_matches; | |
| 1303 | |
| 1304 my @note = $feat->each_tag_value('note'); | |
| 1305 | |
| 1306 for my $k (keys %$syn_map) { | |
| 1307 | |
| 1308 if ($k =~ /"(.+)"/) { | |
| 1309 | |
| 1310 my $syn = $1; | |
| 1311 | |
| 1312 for my $note (@note) { | |
| 1313 | |
| 1314 # look through the notes to see if the description | |
| 1315 # is an exact match for synonyms | |
| 1316 if ( $syn eq $note ) { | |
| 1317 | |
| 1318 my @map = @{$syn_map->{$k}}; | |
| 1319 | |
| 1320 | |
| 1321 my $best_guess = $map[0]; | |
| 1322 | |
| 1323 unshift @{$all_matches[-1]}, [$best_guess]; | |
| 1324 | |
| 1325 $mtype = $MANUAL | |
| 1326 ? manual_curation($feat, $best_guess, \@all_matches) | |
| 1327 : $best_guess; | |
| 1328 | |
| 1329 print '#' x 78 . "\nGuessing the proper SO term for GenBank" | |
| 1330 . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n" | |
| 1331 . '#' x 78 . "\n\n"; | |
| 1332 | |
| 1333 } else { | |
| 1334 # check both primary tag and and note against | |
| 1335 # SO synonyms for best matching description | |
| 1336 | |
| 1337 SO_fuzzy_match( $k, $primary_tag, $note, $syn, \@all_matches); | |
| 1338 } | |
| 1339 | |
| 1340 } | |
| 1341 } | |
| 1342 } | |
| 1343 | |
| 1344 #unless ($mtype) { | |
| 1345 for my $note (@note) { | |
| 1346 for my $name (values %$type_map) { | |
| 1347 # check primary tag against SO names for best matching | |
| 1348 # descriptions //NML also need to check against | |
| 1349 # definition && camel case split terms | |
| 1350 | |
| 1351 SO_fuzzy_match($name, $primary_tag, $note, $name, \@all_matches); | |
| 1352 } | |
| 1353 } | |
| 1354 #} | |
| 1355 | |
| 1356 if (scalar(@all_matches) && !$mtype) { | |
| 1357 | |
| 1358 my $top_matches = first { defined $_ } @{$all_matches[-1]}; | |
| 1359 | |
| 1360 my $best_guess = $top_matches->[0]; | |
| 1361 | |
| 1362 | |
| 1363 | |
| 1364 # if guess has quotes, it is a synonym term. we need to | |
| 1365 # look up the corresponding name term | |
| 1366 # otherwise, guess is a name, so we can use it directly | |
| 1367 if ($best_guess =~ /"(.+)"/) { | |
| 1368 | |
| 1369 $best_guess = $syn_map->{$best_guess}->[0]; | |
| 1370 | |
| 1371 } | |
| 1372 | |
| 1373 @RETURN = @all_matches; | |
| 1374 $mtype = $MANUAL | |
| 1375 ? manual_curation($feat, $best_guess, \@all_matches) | |
| 1376 : $best_guess; | |
| 1377 | |
| 1378 print '#' x 78 . "\nGuessing the proper SO term for GenBank" | |
| 1379 . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n" | |
| 1380 . '#' x 78 . "\n\n"; | |
| 1381 | |
| 1382 } | |
| 1383 } | |
| 1384 $mtype ||= $undefmap; | |
| 1385 $feat->primary_tag($mtype); | |
| 1386 } | |
| 1387 } | |
| 1388 | |
| 1389 | |
| 1390 } | |
| 1391 | |
| 1392 sub SO_fuzzy_match { | |
| 1393 | |
| 1394 my $candidate = shift; | |
| 1395 my $primary_tag = shift; | |
| 1396 my $note = shift; | |
| 1397 my $SO_terms = shift; | |
| 1398 my $best_matches_ref = shift; | |
| 1399 my $modifier = shift; | |
| 1400 | |
| 1401 $modifier ||= ''; | |
| 1402 | |
| 1403 my @feat_terms; | |
| 1404 | |
| 1405 for ( split(" |_", $primary_tag) ) { | |
| 1406 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g; | |
| 1407 my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g; | |
| 1408 push @feat_terms, @camelCase; | |
| 1409 } | |
| 1410 | |
| 1411 for ( split(" |_", $note) ) { | |
| 1412 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g; | |
| 1413 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g; | |
| 1414 (my $word = $_) =~ s/[;:.,]//g; | |
| 1415 push @feat_terms, $word; | |
| 1416 } | |
| 1417 | |
| 1418 | |
| 1419 my @SO_terms = split(" |_", $SO_terms); | |
| 1420 | |
| 1421 # fuzzy match works on a simple point system. When 2 words match, | |
| 1422 # the $plus counter adds one. When they don't, the $minus counter adds | |
| 1423 # one. This is used to sort similar matches together. Better matches | |
| 1424 # are found at the end of the array, near the top. | |
| 1425 | |
| 1426 # NML: can we improve best match by using synonym tags | |
| 1427 # EXACT,RELATED,NARROW,BROAD? | |
| 1428 | |
| 1429 my ($plus, $minus) = (0, 0); | |
| 1430 my %feat_terms; | |
| 1431 my %SO_terms; | |
| 1432 | |
| 1433 #unique terms | |
| 1434 map {$feat_terms{$_} = 1} @feat_terms; | |
| 1435 map {$SO_terms{$_} = 1} @SO_terms; | |
| 1436 | |
| 1437 for my $st (keys %SO_terms) { | |
| 1438 for my $ft (keys %feat_terms) { | |
| 1439 | |
| 1440 ($st =~ m/$modifier\Q$ft\E/) ? $plus++ : $minus++; | |
| 1441 | |
| 1442 } | |
| 1443 } | |
| 1444 | |
| 1445 push @{$$best_matches_ref[$plus][$minus]}, $candidate if $plus; | |
| 1446 | |
| 1447 } | |
| 1448 | |
| 1449 sub manual_curation { | |
| 1450 | |
| 1451 my ($feat, $default_opt, $all_matches) = @_; | |
| 1452 | |
| 1453 my @all_matches = @$all_matches; | |
| 1454 | |
| 1455 # convert all SO synonyms into names and filter | |
| 1456 # all matches into unique term list because | |
| 1457 # synonyms can map to multiple duplicate names | |
| 1458 | |
| 1459 my (@unique_SO_terms, %seen); | |
| 1460 for (reverse @all_matches) { | |
| 1461 for (@$_) { | |
| 1462 for (@$_) { | |
| 1463 #my @names; | |
| 1464 if ($_ =~ /"(.+)"/) { | |
| 1465 for (@{$SYN_MAP->{$_}}) { | |
| 1466 push @unique_SO_terms, $_ unless $seen{$_}; | |
| 1467 $seen{$_}++; | |
| 1468 } | |
| 1469 } else { | |
| 1470 push @unique_SO_terms, $_ unless $seen{$_}; | |
| 1471 $seen{$_}++; | |
| 1472 } | |
| 1473 } | |
| 1474 } | |
| 1475 } | |
| 1476 | |
| 1477 my $s = scalar(@unique_SO_terms); | |
| 1478 | |
| 1479 my $choice = 0; | |
| 1480 | |
| 1481 my $more = | |
| 1482 "[a]uto : automatic input (selects best guess for remaining entries)\r" . | |
| 1483 "[f]ind : search for other SO terms matching your query (e.g. f gene)\r" . | |
| 1484 "[i]nput : add a specific term\r" . | |
| 1485 "[r]eset : reset to the beginning of matches\r" . | |
| 1486 "[s]kip : skip this entry (selects best guess for this entry)\r" | |
| 1487 ; | |
| 1488 | |
| 1489 $more .= | |
| 1490 "[n]ext : view the next ".OPTION_CYCLE." terms\r" . | |
| 1491 "[p]rev : view the previous ".OPTION_CYCLE." terms" if ($s > OPTION_CYCLE); | |
| 1492 | |
| 1493 my $msg = #"\n\n" . '-' x 156 . "\n" | |
| 1494 "The converter found $s possible matches for the following GenBank entry: "; | |
| 1495 | |
| 1496 my $directions = | |
| 1497 "Type a number to select the SO term that best matches" | |
| 1498 . " the genbank entry, or use any of the following options:\r" . '_' x 76 . "\r$more"; | |
| 1499 | |
| 1500 | |
| 1501 # lookup filtered list to pull out definitions | |
| 1502 my @options = map { | |
| 1503 my $term = $_; | |
| 1504 my %term; | |
| 1505 for (['name', 'name'], ['def', 'definition'], ['synonym', | |
| 1506 'each_synonym']) { | |
| 1507 my ($label, $method) = @$_; | |
| 1508 $term{$label} = \@{[$term->$method]}; | |
| 1509 } | |
| 1510 [++$choice, $_->name, ($_->definition || 'none'), \%term, $_->each_synonym ]; | |
| 1511 } map { $ONTOLOGY->find_terms(-name => $_) } @unique_SO_terms; | |
| 1512 | |
| 1513 | |
| 1514 my $option = options_cycle(0, OPTION_CYCLE, $msg, $feat, $directions, | |
| 1515 $default_opt, @options); | |
| 1516 | |
| 1517 if ($option eq 'skip') { return $default_opt | |
| 1518 } elsif ($option eq 'auto') { | |
| 1519 $MANUAL = 0; | |
| 1520 return $default_opt; | |
| 1521 } else { return $option } | |
| 1522 | |
| 1523 } | |
| 1524 | |
| 1525 sub options_cycle { | |
| 1526 | |
| 1527 my ($start, $stop, $msg, $feat, $directions, $best_guess, @opt) = @_; | |
| 1528 | |
| 1529 #NML: really should only call GenBank_entry once. Will need to change | |
| 1530 #method to return array & shift off header | |
| 1531 my $entry = GenBank_entry($feat, "\r"); | |
| 1532 | |
| 1533 my $total = scalar(@opt); | |
| 1534 | |
| 1535 ($start,$stop) = (0, OPTION_CYCLE) | |
| 1536 if ( ($start < 0) && ($stop > 0) ); | |
| 1537 | |
| 1538 ($start,$stop) = (0, OPTION_CYCLE) | |
| 1539 if ( ( ($stop - $start) < OPTION_CYCLE ) && $stop < $total); | |
| 1540 | |
| 1541 ($start,$stop) = ($total - OPTION_CYCLE, $total) if $start < 0; | |
| 1542 ($start,$stop) = (0, OPTION_CYCLE) if $start >= $total; | |
| 1543 | |
| 1544 $stop = $total if $stop > $total; | |
| 1545 | |
| 1546 my $dir_copy = $directions; | |
| 1547 my $msg_copy = $msg; | |
| 1548 my $format = "format STDOUT = \n" . | |
| 1549 '-' x 156 . "\n" . | |
| 1550 '^' . '<' x 77 . '| Available Commands:' . "\n" . | |
| 1551 '$msg_copy' . "\n" . | |
| 1552 '-' x 156 . "\n" . | |
| 1553 ' ' x 78 . "|\n" . | |
| 1554 '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" . | |
| 1555 '$entry' . ' ' x 74 . '$dir_copy,' . "\n" . | |
| 1556 (' ' x 20 . '^' . '<' x 57 . '| ^' . '<' x 75 . '~' . "\n" . | |
| 1557 ' ' x 20 . '$entry,' . ' ' x 53 . '$dir_copy,' . "\n") x 1000 . ".\n"; | |
| 1558 | |
| 1559 { | |
| 1560 # eval throws redefined warning that breaks formatting. | |
| 1561 # Turning off warnings just for the eval to fix this. | |
| 1562 no warnings 'redefine'; | |
| 1563 eval $format; | |
| 1564 } | |
| 1565 | |
| 1566 write; | |
| 1567 | |
| 1568 print '-' x 156 . "\n" . | |
| 1569 'Showing results ' . ( $stop ? ( $start + 1 ) : $start ) . | |
| 1570 " - $stop of possible SO term matches: (best guess is \"$best_guess\")" . | |
| 1571 "\n" . '-' x 156 . "\n"; | |
| 1572 | |
| 1573 for (my $i = $start; $i < $stop; $i+=2) { | |
| 1574 | |
| 1575 my ($left, $right) = @opt[$i,$i+1]; | |
| 1576 | |
| 1577 my ($nL, $nmL, $descL, $termL, @synL) = @$left; | |
| 1578 | |
| 1579 #odd numbered lists can cause fatal undefined errors, so check | |
| 1580 #to make sure we have data | |
| 1581 | |
| 1582 my ($nR, $nmR, $descR, $termR, @synR) = ref($right) ? @$right : (undef, undef, undef); | |
| 1583 | |
| 1584 | |
| 1585 my $format = "format STDOUT = \n"; | |
| 1586 | |
| 1587 $format .= | |
| 1588 ' ' x 78 . "|\n" . | |
| 1589 | |
| 1590 '@>>: name: ^' . '<' x 64 . '~' . ' |' . | |
| 1591 ( ref($right) ? ('@>>: name: ^' . '<' x 64 . '~' ) : '' ) . "\n" . | |
| 1592 '$nL,' . ' ' x 7 . '$nmL,' . | |
| 1593 ( ref($right) ? (' ' x 63 . '$nR,' . ' ' x 7 . "\$nmR,") : '' ) . "\n" . | |
| 1594 | |
| 1595 ' ' x 11 . '^' . '<' x 61 . '...~' . ' |' . | |
| 1596 (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" . | |
| 1597 ' ' x 11 . '$nmL,' . | |
| 1598 (ref($right) ? (' ' x 74 . '$nmR,') : '') . "\n" . | |
| 1599 #' ' x 78 . '|' . "\n" . | |
| 1600 | |
| 1601 | |
| 1602 ' def: ^' . '<' x 65 . ' |' . | |
| 1603 (ref($right) ? (' def: ^' . '<' x 64 . '~') : '') . "\n" . | |
| 1604 ' ' x 11 . '$descL,' . | |
| 1605 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" . | |
| 1606 | |
| 1607 | |
| 1608 (' ^' . '<' x 65 . ' |' . | |
| 1609 (ref($right) ? (' ^' . '<' x 64 . '~') : '') . "\n" . | |
| 1610 ' ' x 11 . '$descL,' . | |
| 1611 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n") x 5 . | |
| 1612 | |
| 1613 | |
| 1614 ' ^' . '<' x 61 . '...~ |' . | |
| 1615 (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" . | |
| 1616 ' ' x 11 . '$descL,' . | |
| 1617 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" . | |
| 1618 | |
| 1619 ".\n"; | |
| 1620 | |
| 1621 { | |
| 1622 # eval throws redefined warning that breaks formatting. | |
| 1623 # Turning off warnings just for the eval to fix this. | |
| 1624 no warnings 'redefine'; | |
| 1625 eval $format; | |
| 1626 } | |
| 1627 write; | |
| 1628 | |
| 1629 } | |
| 1630 print '-' x 156 . "\nenter a command:"; | |
| 1631 | |
| 1632 while (<STDIN>) { | |
| 1633 | |
| 1634 (my $input = $_) =~ s/\s+$//; | |
| 1635 | |
| 1636 if ($input =~ /^\d+$/) { | |
| 1637 if ( $input && defined $opt[$input-1] ) { | |
| 1638 return $opt[$input-1]->[1] | |
| 1639 } else { | |
| 1640 print "\nThat number is not an option. Please enter a valid number.\n:"; | |
| 1641 } | |
| 1642 } elsif ($input =~ /^n/i | $input =~ /next/i ) { | |
| 1643 return options_cycle($start + OPTION_CYCLE, $stop + OPTION_CYCLE, $msg, | |
| 1644 $feat, $directions, $best_guess, @opt) | |
| 1645 } elsif ($input =~ /^p/i | $input =~ /prev/i ) { | |
| 1646 return options_cycle($start - OPTION_CYCLE, $stop - OPTION_CYCLE, $msg, | |
| 1647 $feat, $directions, $best_guess, @opt) | |
| 1648 } elsif ( $input =~ /^s/i || $input =~ /skip/i ) { return 'skip' | |
| 1649 } elsif ( $input =~ /^a/i || $input =~ /auto/i ) { return 'auto' | |
| 1650 } elsif ( $input =~ /^r/i || $input =~ /reset/i ) { | |
| 1651 return manual_curation($feat, $best_guess, \@RETURN ); | |
| 1652 } elsif ( $input =~ /^f/i || $input =~ /find/i ) { | |
| 1653 | |
| 1654 my ($query, @query_results); | |
| 1655 | |
| 1656 if ($input =~ /(?:^f|find)\s+?(.*?)$/) { $query = $1; | |
| 1657 } else { | |
| 1658 | |
| 1659 #do a SO search | |
| 1660 print "Type your search query\n:"; | |
| 1661 while (<STDIN>) { chomp($query = $_); last } | |
| 1662 } | |
| 1663 | |
| 1664 for (keys(%$TYPE_MAP), keys(%$SYN_MAP)) { | |
| 1665 SO_fuzzy_match($_, $query, '', $_, \@query_results, '(?i)'); | |
| 1666 } | |
| 1667 | |
| 1668 return manual_curation($feat, $best_guess, \@query_results); | |
| 1669 | |
| 1670 } elsif ( $input =~ /^i/i || $input =~ /input/i ) { | |
| 1671 | |
| 1672 #NML fast input for later | |
| 1673 #my $query; | |
| 1674 #if ($input =~ /(?:^i|input)\s+?(.*?)$/) { $query = $1 }; | |
| 1675 | |
| 1676 #manual input | |
| 1677 print "Type the term you want to use\n:"; | |
| 1678 while (<STDIN>) { | |
| 1679 chomp(my $input = $_); | |
| 1680 | |
| 1681 if (! $TYPE_MAP->{$input}) { | |
| 1682 | |
| 1683 print "\"$input\" doesn't appear to be a valid SO term. Are ". | |
| 1684 "you sure you want to use it? (y or n)\n:"; | |
| 1685 | |
| 1686 while (<STDIN>) { | |
| 1687 | |
| 1688 chomp(my $choice = $_); | |
| 1689 | |
| 1690 if ($choice eq 'y') { | |
| 1691 print | |
| 1692 "\nWould you like to save your preference for " . | |
| 1693 "future use (so you don't have to redo manual " . | |
| 1694 "curation for this feature everytime you run " . | |
| 1695 "the converter)? (y or n)\n"; | |
| 1696 | |
| 1697 #NML: all these while loops are a mess. Really should condense it. | |
| 1698 while (<STDIN>) { | |
| 1699 | |
| 1700 chomp(my $choice = $_); | |
| 1701 | |
| 1702 if ($choice eq 'y') { | |
| 1703 curation_save($feat, $input); | |
| 1704 return $input; | |
| 1705 } elsif ($choice eq 'n') { | |
| 1706 return $input | |
| 1707 } else { | |
| 1708 print "\nDidn't recognize that command. Please " . | |
| 1709 "type y or n.\n:" | |
| 1710 } | |
| 1711 } | |
| 1712 | |
| 1713 | |
| 1714 } elsif ($choice eq 'n') { | |
| 1715 return options_cycle($start, $stop, $msg, $feat, | |
| 1716 $directions, $best_guess, @opt) | |
| 1717 } else { | |
| 1718 print "\nDidn't recognize that command. Please " . | |
| 1719 "type y or n.\n:" | |
| 1720 } | |
| 1721 } | |
| 1722 | |
| 1723 } else { | |
| 1724 print | |
| 1725 "\nWould you like to save your preference for " . | |
| 1726 "future use (so you don't have to redo manual " . | |
| 1727 "curation for this feature everytime you run " . | |
| 1728 "the converter)? (y or n)\n"; | |
| 1729 | |
| 1730 #NML: all these while loops are a mess. Really should condense it. | |
| 1731 while (<STDIN>) { | |
| 1732 | |
| 1733 chomp(my $choice = $_); | |
| 1734 | |
| 1735 if ($choice eq 'y') { | |
| 1736 curation_save($feat, $input); | |
| 1737 return $input; | |
| 1738 } elsif ($choice eq 'n') { | |
| 1739 return $input | |
| 1740 } else { | |
| 1741 print "\nDidn't recognize that command. Please " . | |
| 1742 "type y or n.\n:" | |
| 1743 } | |
| 1744 } | |
| 1745 | |
| 1746 } | |
| 1747 | |
| 1748 } | |
| 1749 } else { | |
| 1750 print "\nDidn't recognize that command. Please re-enter your choice.\n:" | |
| 1751 } | |
| 1752 } | |
| 1753 | |
| 1754 } | |
| 1755 | |
| 1756 sub GenBank_entry { | |
| 1757 my ($f, $delimiter, $num) = @_; | |
| 1758 | |
| 1759 $delimiter ||= "\n"; | |
| 1760 | |
| 1761 | |
| 1762 my $entry = | |
| 1763 | |
| 1764 ($num ? ' [1] ' : ' ' x 5) . $f->primary_tag | |
| 1765 . ($num | |
| 1766 ? ' ' x (12 - length $f->primary_tag ) . ' [2] ' | |
| 1767 : ' ' x (15 - length $f->primary_tag) | |
| 1768 ) | |
| 1769 . $f->start.'..'.$f->end | |
| 1770 | |
| 1771 . "$delimiter"; | |
| 1772 | |
| 1773 if ($num) { | |
| 1774 words_tag($f, \$entry); | |
| 1775 } else { | |
| 1776 for my $tag ($f->all_tags) { | |
| 1777 for my $val ( $f->each_tag_value($tag) ) { | |
| 1778 $entry .= ' ' x 20; | |
| 1779 #$entry .= "/$tag=\"$val\"$delimiter"; | |
| 1780 $entry .= $val eq '_no_value' | |
| 1781 ? "/$tag$delimiter" | |
| 1782 : "/$tag=\"$val\"$delimiter"; | |
| 1783 } | |
| 1784 } | |
| 1785 | |
| 1786 } | |
| 1787 | |
| 1788 return $entry; | |
| 1789 } | |
| 1790 | |
| 1791 | |
| 1792 sub gff_validate { | |
| 1793 warn "Validating GFF file\n" if $DEBUG; | |
| 1794 my @feat = @_; | |
| 1795 | |
| 1796 my (%parent2child, %all_ids, %descendants, %reserved); | |
| 1797 | |
| 1798 for my $f (@feat) { | |
| 1799 for my $aTags (['Parent', \%parent2child], ['ID', \%all_ids]) { | |
| 1800 map { push @{$$aTags[1]->{$_}}, $f } $f->get_tag_values($$aTags[0]) | |
| 1801 if $f->has_tag($$aTags[0]); | |
| 1802 } | |
| 1803 } | |
| 1804 | |
| 1805 if ($SO_FILE) { | |
| 1806 while (my ($parentID, $aChildren) = each %parent2child) { | |
| 1807 parent_validate($parentID, $aChildren, \%all_ids, \%descendants, \%reserved); | |
| 1808 } | |
| 1809 } | |
| 1810 | |
| 1811 id_validate(\%all_ids, \%reserved); | |
| 1812 } | |
| 1813 | |
| 1814 sub parent_validate { | |
| 1815 my ($parentID, $aChildren, $hAll, $hDescendants, $hReserved) = @_; | |
| 1816 | |
| 1817 my $aParents = $hAll->{$parentID}; | |
| 1818 | |
| 1819 map { | |
| 1820 my $child = $_; | |
| 1821 $child->add_tag_value( validation_error => | |
| 1822 "feature tried to add Parent tag, but no Parent found with ID $parentID" | |
| 1823 ); | |
| 1824 my %parents; | |
| 1825 map { $parents{$_} = 1 } $child->get_tag_values('Parent'); | |
| 1826 delete $parents{$parentID}; | |
| 1827 my @parents = keys %parents; | |
| 1828 | |
| 1829 $child->remove_tag('Parent'); | |
| 1830 | |
| 1831 unless ($child->has_tag('ID')) { | |
| 1832 my $id = gene_name($child); | |
| 1833 $child->add_tag_value('ID', $id); | |
| 1834 push @{$hAll->{$id}}, $child | |
| 1835 } | |
| 1836 | |
| 1837 $child->add_tag_value('Parent', @parents) if @parents; | |
| 1838 | |
| 1839 } @$aChildren and return unless scalar(@$aParents); | |
| 1840 | |
| 1841 my $par = join(',', map { $_->primary_tag } @$aParents); | |
| 1842 warn scalar(@$aParents)." POSSIBLE PARENT(S): $par" if $DEBUG; | |
| 1843 | |
| 1844 #NML manual curation needs to happen here | |
| 1845 | |
| 1846 | |
| 1847 my %parentsToRemove; | |
| 1848 | |
| 1849 CHILD: | |
| 1850 for my $child (@$aChildren) { | |
| 1851 my $childType = $child->primary_tag; | |
| 1852 | |
| 1853 warn "WORKING ON $childType at ".$child->start.' to '.$child->end | |
| 1854 if $DEBUG; | |
| 1855 | |
| 1856 for (my $i = 0; $i < scalar(@$aParents); $i++) { | |
| 1857 my $parent = $aParents->[$i]; | |
| 1858 my $parentType = $parent->primary_tag; | |
| 1859 | |
| 1860 warn "CHECKING $childType against $parentType" if $DEBUG; | |
| 1861 | |
| 1862 #cache descendants so we don't have to do repeat searches | |
| 1863 unless ($hDescendants->{$parentType}) { | |
| 1864 | |
| 1865 for my $term ($ONTOLOGY->find_terms( | |
| 1866 -name => $parentType | |
| 1867 ) ) { | |
| 1868 | |
| 1869 map { | |
| 1870 $hDescendants->{$parentType}{$_->name}++ | |
| 1871 } $ONTOLOGY->get_descendant_terms($term); | |
| 1872 | |
| 1873 } | |
| 1874 | |
| 1875 # NML: hopefully temporary fix. | |
| 1876 # SO doesn't consider exon/CDS to be a child of mRNA | |
| 1877 # even though common knowledge dictates that they are | |
| 1878 # This cheat fixes the false positives for now | |
| 1879 if ($parentType eq 'mRNA') { | |
| 1880 $hDescendants->{$parentType}{'exon'} = 1; | |
| 1881 $hDescendants->{$parentType}{'CDS'} = 1; | |
| 1882 } | |
| 1883 | |
| 1884 } | |
| 1885 | |
| 1886 warn "\tCAN $childType at " . $child->start . ' to ' . $child->end . | |
| 1887 " be a child of $parentType?" if $DEBUG; | |
| 1888 if ($hDescendants->{$parentType}{$childType}) { | |
| 1889 warn "\tYES, $childType can be a child of $parentType" if $DEBUG; | |
| 1890 | |
| 1891 #NML need to deal with multiple children matched to multiple different | |
| 1892 #parents. This model only assumes the first parent id that matches a child will | |
| 1893 #be the reserved feature. | |
| 1894 | |
| 1895 $hReserved->{$parentID}{$parent}{'parent'} = $parent; | |
| 1896 push @{$hReserved->{$parentID}{$parent}{'children'}}, $child; | |
| 1897 | |
| 1898 #mark parent for later removal from all IDs | |
| 1899 #so we don't accidentally change any parents | |
| 1900 | |
| 1901 $parentsToRemove{$i}++; | |
| 1902 | |
| 1903 next CHILD; | |
| 1904 } | |
| 1905 } | |
| 1906 | |
| 1907 | |
| 1908 | |
| 1909 #NML shouldn't have to check this; somehow child can lose Parent | |
| 1910 #it's happening W3110 | |
| 1911 #need to track this down | |
| 1912 if ( $child->has_tag('Parent') ) { | |
| 1913 | |
| 1914 warn "\tNO, @{[$child->primary_tag]} cannot be a child of $parentID" | |
| 1915 if $DEBUG; | |
| 1916 | |
| 1917 my %parents; | |
| 1918 | |
| 1919 map { $parents{$_} = 1 } $child->get_tag_values('Parent'); | |
| 1920 | |
| 1921 delete $parents{$parentID}; | |
| 1922 my @parents = keys %parents; | |
| 1923 | |
| 1924 warn 'VALIDATION ERROR '.$child->primary_tag." at ".$child->start . | |
| 1925 ' to ' . $child->end . " cannot be a child of ID $parentID" | |
| 1926 if $DEBUG; | |
| 1927 | |
| 1928 $child->add_tag_value( validation_error => | |
| 1929 "feature cannot be a child of $parentID"); | |
| 1930 | |
| 1931 $child->remove_tag('Parent'); | |
| 1932 | |
| 1933 unless ($child->has_tag('ID')) { | |
| 1934 my $id = gene_name($child); | |
| 1935 $child->add_tag_value('ID', $id); | |
| 1936 push @{$hAll->{$id}}, $child | |
| 1937 } | |
| 1938 | |
| 1939 $child->add_tag_value('Parent', @parents) if @parents; | |
| 1940 } | |
| 1941 | |
| 1942 } | |
| 1943 | |
| 1944 #delete $aParents->[$_] for keys %parentsToRemove; | |
| 1945 splice(@$aParents, $_, 1) for keys %parentsToRemove; | |
| 1946 } | |
| 1947 | |
| 1948 sub id_validate { | |
| 1949 my ($hAll, $hReserved) = @_; | |
| 1950 | |
| 1951 | |
| 1952 for my $id (keys %$hAll) { | |
| 1953 | |
| 1954 #since 1 feature can have this id, | |
| 1955 #let's just shift it off and uniquify | |
| 1956 #the rest (unless it's reserved) | |
| 1957 | |
| 1958 shift @{$hAll->{$id}} unless $hReserved->{$id}; | |
| 1959 for my $feat (@{$hAll->{$id}}) { | |
| 1960 id_uniquify(0, $id, $feat, $hAll); | |
| 1961 } | |
| 1962 } | |
| 1963 | |
| 1964 for my $parentID (keys %$hReserved) { | |
| 1965 | |
| 1966 my @keys = keys %{$hReserved->{$parentID}}; | |
| 1967 | |
| 1968 shift @keys; | |
| 1969 | |
| 1970 for my $k (@keys) { | |
| 1971 my $parent = $hReserved->{$parentID}{$k}{'parent'}; | |
| 1972 my $aChildren= $hReserved->{$parentID}{$k}{'children'}; | |
| 1973 | |
| 1974 my $value = id_uniquify(0, $parentID, $parent, $hAll); | |
| 1975 for my $child (@$aChildren) { | |
| 1976 | |
| 1977 my %parents; | |
| 1978 map { $parents{$_}++ } $child->get_tag_values('Parent'); | |
| 1979 $child->remove_tag('Parent'); | |
| 1980 delete $parents{$parentID}; | |
| 1981 $parents{$value}++; | |
| 1982 my @parents = keys %parents; | |
| 1983 $child->add_tag_value('Parent', @parents); | |
| 1984 } | |
| 1985 | |
| 1986 } | |
| 1987 } | |
| 1988 } | |
| 1989 | |
| 1990 sub id_uniquify { | |
| 1991 my ($count, $value, $feat, $hAll) = @_; | |
| 1992 | |
| 1993 warn "UNIQUIFYING $value" if $DEBUG; | |
| 1994 | |
| 1995 if (! $count) { | |
| 1996 $feat->add_tag_value(Alias => $value); | |
| 1997 $value .= ('.' . $feat->primary_tag) | |
| 1998 } elsif ($count == 1) { | |
| 1999 $value .= ".$count" | |
| 2000 } else { | |
| 2001 chop $value; | |
| 2002 $value .= $count | |
| 2003 } | |
| 2004 $count++; | |
| 2005 | |
| 2006 warn "ENDED UP WITH $value" if $DEBUG; | |
| 2007 if ( $hAll->{$value} ) { | |
| 2008 warn "$value IS ALREADY TAKEN" if $DEBUG; | |
| 2009 id_uniquify($count, $value, $feat, $hAll); | |
| 2010 } else { | |
| 2011 #warn "something's breaking ".$feat->primary_tag.' at '.$feat->start.' to '.$feat->end; | |
| 2012 $feat->remove_tag('ID'); | |
| 2013 $feat->add_tag_value('ID', $value); | |
| 2014 push @{$hAll->{$value}}, $value; | |
| 2015 } | |
| 2016 | |
| 2017 $value; | |
| 2018 } | |
| 2019 | |
| 2020 sub conf_read { | |
| 2021 | |
| 2022 print "\nCannot read $CONF. Change file permissions and retry, " . | |
| 2023 "or enter another file\n" and conf_locate() unless -r $CONF; | |
| 2024 | |
| 2025 print "\nCannot write $CONF. Change file permissions and retry, " . | |
| 2026 "or enter another file\n" and conf_locate() unless -w $CONF; | |
| 2027 | |
| 2028 $YAML = LoadFile($CONF); | |
| 2029 | |
| 2030 } | |
| 2031 | |
| 2032 sub conf_create { | |
| 2033 | |
| 2034 my ($path, $input) = @_; | |
| 2035 | |
| 2036 print "Cannot write to $path. Change directory permissions and retry " . | |
| 2037 "or enter another save path\n" and conf_locate() unless -w $path; | |
| 2038 | |
| 2039 $CONF = $input; | |
| 2040 | |
| 2041 open(FH, '>', $CONF); | |
| 2042 close(FH); | |
| 2043 conf_read(); | |
| 2044 | |
| 2045 | |
| 2046 } | |
| 2047 | |
| 2048 sub conf_write { DumpFile($CONF, $YAML) } | |
| 2049 | |
| 2050 sub conf_locate { | |
| 2051 | |
| 2052 print "\nEnter the location of a previously saved config, or a new " . | |
| 2053 "path and file name to create a new config (this step is " . | |
| 2054 "necessary to save any preferences)"; | |
| 2055 | |
| 2056 print "\n\nenter a command:"; | |
| 2057 | |
| 2058 while (<STDIN>) { | |
| 2059 chomp(my $input = $_); | |
| 2060 my ($fn, $path, $suffix) = fileparse($input, qr/\.[^.]*/); | |
| 2061 | |
| 2062 if (-e $input && (! -d $input)) { | |
| 2063 | |
| 2064 print "\nReading $input...\n"; | |
| 2065 $CONF = $input; | |
| 2066 | |
| 2067 conf_read(); | |
| 2068 last; | |
| 2069 | |
| 2070 } elsif (! -d $input && $fn.$suffix) { | |
| 2071 | |
| 2072 print "Creating $input...\n"; | |
| 2073 conf_create($path, $input); | |
| 2074 last; | |
| 2075 | |
| 2076 } elsif (-e $input && -d $input) { | |
| 2077 print "You only entered a directory. " . | |
| 2078 "Please enter BOTH a directory and filename\n"; | |
| 2079 } else { | |
| 2080 print "$input does not appear to be a valid path. Please enter a " . | |
| 2081 "valid directory and filename\n"; | |
| 2082 } | |
| 2083 print "\nenter a command:"; | |
| 2084 } | |
| 2085 } | |
| 2086 | |
| 2087 sub curation_save { | |
| 2088 | |
| 2089 my ($feat, $input) = @_; | |
| 2090 | |
| 2091 #my $error = "Enter the location of a previously saved config, or a new " . | |
| 2092 # "path and file name to create a new config (this step is " . | |
| 2093 # "necessary to save any preferences)\n"; | |
| 2094 | |
| 2095 if (!$CONF) { | |
| 2096 print "\n\n"; | |
| 2097 conf_locate(); | |
| 2098 } elsif (! -e $CONF) { | |
| 2099 print "\n\nThe config file you have chosen doesn't exist.\n"; | |
| 2100 conf_locate(); | |
| 2101 } else { conf_read() } | |
| 2102 | |
| 2103 my $entry = GenBank_entry($feat, "\r", 1); | |
| 2104 | |
| 2105 my $msg = "Term entered: $input"; | |
| 2106 my $directions = "Please select any/all tags that provide evidence for the term you | |
| 2107 have entered. You may enter multiple tags by separating them by commas/dashes | |
| 2108 (e.g 1,3,5-7). For tags with more than one word value (i.e 'note'), you have | |
| 2109 the option of either selecting the entire note as evidence, or specific | |
| 2110 keywords. If a tag has multiple keywords, they will be tagged alphabetically | |
| 2111 for selection. To select a specific keyword in a tag field, you must enter the | |
| 2112 tag number followed by the keyword letter (e.g 3a). Multiple keywords may be | |
| 2113 selected by entering each letter separated by commas/dashes (e.g 3b,f,4a-c). The more tags you select, the more specific the GenBank entry will have | |
| 2114 to be to match your curation. To match the GenBank entry exactly as it | |
| 2115 appears, type every number (start-end), or just type 'all'. Remember, once the converter saves your | |
| 2116 preference, you will no longer be prompted to choose a feature type for any | |
| 2117 matching entries until you edit the curation.ini file."; | |
| 2118 my $msg_copy = $msg; | |
| 2119 my $dir_copy = $directions; | |
| 2120 | |
| 2121 my $format = "format STDOUT = \n" . | |
| 2122 '-' x 156 . "\n" . | |
| 2123 '^' . '<' x 77 . '| Directions:' . "\n" . | |
| 2124 '$msg_copy' . "\n" . | |
| 2125 '-' x 156 . "\n" . | |
| 2126 ' ' x 78 . "|\n" . | |
| 2127 '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" . | |
| 2128 '$entry' . ' ' x 74 . '$dir_copy,' . "\n" . | |
| 2129 (' ' x 15 . '^' . '<' x 62 . '| ^' . '<' x 75 . '~' . "\n" . | |
| 2130 ' ' x 15 . '$entry,' . ' ' x 58 . '$dir_copy,' . "\n") x 20 . ".\n"; | |
| 2131 | |
| 2132 { | |
| 2133 # eval throws redefined warning that breaks formatting. | |
| 2134 # Turning off warnings just for the eval to fix this. | |
| 2135 no warnings 'redefine'; | |
| 2136 eval $format; | |
| 2137 } | |
| 2138 | |
| 2139 write; | |
| 2140 print '-' x 156 . "\nenter a command:"; | |
| 2141 | |
| 2142 my @tags = words_tag($feat); | |
| 2143 | |
| 2144 my $final = {}; | |
| 2145 my $choices; | |
| 2146 while (<STDIN>) { | |
| 2147 | |
| 2148 chomp(my $choice = $_); | |
| 2149 | |
| 2150 if (scalar(keys %$final) && $choice =~ /^y/i) { last | |
| 2151 | |
| 2152 } elsif (scalar(keys %$final) && $choice =~ /^n/i) { curation_save($feat, $input) | |
| 2153 | |
| 2154 } elsif (scalar(keys %$final)) { print "\nInvalid selection. Please try again\n"; | |
| 2155 | |
| 2156 } elsif ($choice eq 'all') { | |
| 2157 | |
| 2158 $choice = ''; | |
| 2159 for (my $i=1; $i < scalar(@tags); $i++) { | |
| 2160 $choice .= "$i,"; | |
| 2161 } | |
| 2162 chop $choice; | |
| 2163 } | |
| 2164 #print "CHOICE [$choice]"; | |
| 2165 | |
| 2166 my @selections; | |
| 2167 for (split(/(?<=\w)[^[:alnum:]\-]+(?=\d)/, $choice)) { | |
| 2168 if ($_ =~ /(\d+)(?:\D*)-(\d+)(.*)/) { | |
| 2169 | |
| 2170 for ($1..$2) { push @selections, $_ } | |
| 2171 | |
| 2172 my $dangling_alphas = $3; | |
| 2173 alpha_expand($dangling_alphas, \@selections); | |
| 2174 | |
| 2175 } else { | |
| 2176 alpha_expand($_, \@selections); | |
| 2177 } | |
| 2178 } | |
| 2179 | |
| 2180 foreach my $numbers (@selections) { | |
| 2181 | |
| 2182 my @c = split(/(?=[\w])/, $numbers); | |
| 2183 s/\W+//g foreach @c; | |
| 2184 my $num; | |
| 2185 | |
| 2186 { | |
| 2187 $^W = 0; | |
| 2188 $num = 0 + shift @c; | |
| 2189 } | |
| 2190 | |
| 2191 my $tag = $tags[$num]; | |
| 2192 if (ref $tag && scalar(@c)) { | |
| 2193 my $no_value; | |
| 2194 foreach (@c) { | |
| 2195 if (defined $tag->{$_}) { | |
| 2196 $choices .= "${num}[$_] "; | |
| 2197 my ($t,$v) = @{$tag->{$_}}; | |
| 2198 push @{${$final->{$input}}[0]{$t}}, $v; | |
| 2199 | |
| 2200 } else { $no_value++ } | |
| 2201 } | |
| 2202 | |
| 2203 if ($no_value) { | |
| 2204 _selection_add($tag,$final,$input,\$choices,$num); | |
| 2205 #my ($t,$v) = @{$tag->{'all'}}; | |
| 2206 #unless (defined ${$final->{$input}}[0]{$t}) { | |
| 2207 #$choices .= "$num, "; | |
| 2208 #push @{${$final->{$input}}[0]{$t}}, $v | |
| 2209 #} | |
| 2210 } | |
| 2211 | |
| 2212 $choices = substr($choices, 0, -2); | |
| 2213 $choices .= ', '; | |
| 2214 | |
| 2215 } elsif (ref $tag) { | |
| 2216 _selection_add($tag,$final,$input,\$choices,$num); | |
| 2217 #my ($t,$v) = @{$tag->{'all'}}; | |
| 2218 #unless (defined ${$final->{$input}}[0]{$t}) { | |
| 2219 #$choices .= "$num, "; | |
| 2220 #push @{${$final->{$input}}[0]{$t}}, $v | |
| 2221 #} | |
| 2222 } | |
| 2223 } | |
| 2224 $choices = substr($choices, 0, -2) if $choices; | |
| 2225 if ($final) { | |
| 2226 print "\nYou have chosen the following tags:\n$choices\n"; | |
| 2227 print "This will be written to the config file as:\n"; | |
| 2228 print Dump $final; | |
| 2229 print "\nIs this correct? (y or n)\n"; | |
| 2230 } else { print "\nInvalid selection. Please try again\n" } | |
| 2231 } | |
| 2232 push @{$YAML->{$input}}, $final->{$input}; | |
| 2233 conf_write(); | |
| 2234 } | |
| 2235 | |
| 2236 # words_tag() splits each tag value string into multiple words so that the | |
| 2237 # user can select the parts he/she wants to use for curation | |
| 2238 # it can tag 702 (a - zz) separate words; this should be enough | |
| 2239 | |
| 2240 sub words_tag { | |
| 2241 | |
| 2242 my ($feat, $entry) = @_; | |
| 2243 | |
| 2244 my @tags; | |
| 2245 | |
| 2246 @tags[1,2] = ({'all' => ['primary_tag', $feat->primary_tag]}, {'all' => ['location', $feat->start.'..'.$feat->end]}); | |
| 2247 my $i = 3; | |
| 2248 foreach my $tag ($feat->all_tags) { | |
| 2249 foreach my $value ($feat->each_tag_value($tag)) { | |
| 2250 | |
| 2251 my ($string, $tagged_string); | |
| 2252 | |
| 2253 my @words = split(/(?=\w+?)/, $value); | |
| 2254 | |
| 2255 my $pos = 0; | |
| 2256 | |
| 2257 | |
| 2258 foreach my $word (@words) { | |
| 2259 | |
| 2260 (my $sanitized_word = $word) =~ s/\W+?//g; | |
| 2261 $string .= $word; | |
| 2262 | |
| 2263 my $lead = int($pos/ALPHABET_DIVISOR); | |
| 2264 my $lag = $pos % ALPHABET_DIVISOR; | |
| 2265 | |
| 2266 my $a = $lead ? ${(ALPHABET)}[$lead-1] : ''; | |
| 2267 $a .= $lag ? ${(ALPHABET)}[$lag] : 'a'; | |
| 2268 | |
| 2269 $tagged_string .= " ($a) $word"; | |
| 2270 | |
| 2271 $tags[$i]{$a} = [$tag, $sanitized_word]; | |
| 2272 $pos++; | |
| 2273 } | |
| 2274 | |
| 2275 $value = $tagged_string if scalar(@words) > 1; | |
| 2276 | |
| 2277 $$entry .= "[$i] /$tag=\"$value\"\r"; | |
| 2278 | |
| 2279 $tags[$i]{'all'} = [$tag, $string]; | |
| 2280 } | |
| 2281 $i++; | |
| 2282 } | |
| 2283 | |
| 2284 return @tags; | |
| 2285 | |
| 2286 } | |
| 2287 | |
| 2288 sub alpha_expand { | |
| 2289 | |
| 2290 my ($dangling_alphas, $selections) = @_; | |
| 2291 | |
| 2292 if (defined($dangling_alphas) && $dangling_alphas =~ /(\d)*([[:alpha:]]+)-([[:alpha:]]+)/) { | |
| 2293 | |
| 2294 my $digit = $1; | |
| 2295 push @$selections, $digit if $digit; | |
| 2296 | |
| 2297 my $start = $2; | |
| 2298 my $stop = $3; | |
| 2299 | |
| 2300 my @starts = split('', $start); | |
| 2301 my @stops = split('', $stop); | |
| 2302 | |
| 2303 my ($final_start, $final_stop); | |
| 2304 | |
| 2305 for ([\$final_start, \@starts], [\$final_stop, \@stops]) { | |
| 2306 | |
| 2307 my ($final, $splits) = @$_; | |
| 2308 | |
| 2309 my $int = ${(ALPHABET_TO_NUMBER)}{$$splits[0]}; | |
| 2310 my $rem; | |
| 2311 | |
| 2312 | |
| 2313 if ($$splits[1]) { | |
| 2314 $rem = ${(ALPHABET_TO_NUMBER)}{$$splits[1]}; | |
| 2315 $int++ | |
| 2316 } else { | |
| 2317 $rem = $int; | |
| 2318 $int = 0; | |
| 2319 } | |
| 2320 | |
| 2321 | |
| 2322 $$final = $int * ALPHABET_DIVISOR; | |
| 2323 $$final += $rem; | |
| 2324 | |
| 2325 } | |
| 2326 | |
| 2327 my $last_number = pop @$selections; | |
| 2328 for my $pos ($final_start..$final_stop) { | |
| 2329 my $lead = int($pos/ALPHABET_DIVISOR); | |
| 2330 my $lag = $pos % ALPHABET_DIVISOR; | |
| 2331 my $alpha = $lead ? ${(ALPHABET)}[$lead-1] : ''; | |
| 2332 $alpha .= $lag ? ${(ALPHABET)}[$lag] : 'a'; | |
| 2333 push @$selections, $last_number.$alpha; | |
| 2334 } | |
| 2335 | |
| 2336 } elsif (defined($dangling_alphas)) { | |
| 2337 if ($dangling_alphas =~ /^\d/) { | |
| 2338 push @$selections, $dangling_alphas; | |
| 2339 } elsif ($dangling_alphas =~ /^\D/) { | |
| 2340 #print "$dangling_alphas ".Dumper @$selections; | |
| 2341 my $last_number = pop @$selections; | |
| 2342 $last_number ||= ''; | |
| 2343 push @$selections, $last_number.$dangling_alphas; | |
| 2344 #$$selections[-1] .= $dangling_alphas; | |
| 2345 } | |
| 2346 } | |
| 2347 | |
| 2348 } | |
| 2349 | |
| 2350 sub _selection_add { | |
| 2351 | |
| 2352 my ($tag, $final, $input, $choices, $num) = @_; | |
| 2353 my ($t,$v) = @{$tag->{'all'}}; | |
| 2354 unless (defined ${$final->{$input}}[0]{$t}) { | |
| 2355 $$choices .= "$num, "; | |
| 2356 push @{${$final->{$input}}[0]{$t}}, $v | |
| 2357 } | |
| 2358 | |
| 2359 } |
