Mercurial > repos > davidmurphy > codonlogo
diff corebio/seq_io/__init__.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
line wrap: on
line diff

#  Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com>
#  Copyright (c) 2006, The Regents of the University of California, through
#  Lawrence Berkeley National Laboratory (subject to receipt of any required
#  approvals from the U.S. Dept. of Energy).  All rights reserved.

#  This software is distributed under the new BSD Open Source License.
#  <http://www.opensource.org/licenses/bsd-license.html>
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions are met:
#
#  (1) Redistributions of source code must retain the above copyright notice,
#  this list of conditions and the following disclaimer.
#
#  (2) Redistributions in binary form must reproduce the above copyright
#  notice, this list of conditions and the following disclaimer in the
#  documentation and or other materials provided with the distribution.
#
#  (3) Neither the name of the University of California, Lawrence Berkeley
#  National Laboratory, U.S. Dept. of Energy nor the names of its contributors
#  may be used to endorse or promote products derived from this software
#  without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
#  POSSIBILITY OF SUCH DAMAGE.


""" Sequence file reading and writing.

Biological sequence data is stored and transmitted using a wide variety of
different file formats. This package provides convenient methods to read and
write several of these file formats.

CoreBio is often capable of guessing the correct file type, either from the
file extension or the structure of the file:
>>> import corebio.seq_io
>>> afile = open("test_corebio/data/cap.fa")
>>> seqs = corebio.seq_io.read(afile)

Alternatively, each sequence file type has a separate module named FILETYPE_io
(e.g. fasta_io, clustal_io).
>>> import corebio.seq_io.fasta_io
>>> afile = open("test_corebio/data/cap.fa")
>>> seqs = corebio.seq_io.fasta_io.read( afile )

Sequence data can also be written back to files:
>>> fout = open("out.fa", "w")
>>> corebio.seq_io.fasta_io.write( fout, seqs )


Supported File Formats
----------------------

Module              Name            Extension  read    write   features
---------------------------------------------------------------------------
array_io            array, flatfile             yes     yes     none
clustal_io          clustalw        aln         yes     yes
fasta_io            fasta, Pearson  fa          yes     yes     none
genbank_io          genbank         gb          yes
intelligenetics_io  intelligenetics ig          yes     yes
msf_io              msf             msf         yes
nbrf_io             nbrf, pir       pir         yes
nexus_io            nexus           nexus       yes
phylip_io           phylip          phy         yes
plain_io            plain, raw      txt         yes     yes     none
table_io            table           tbl         yes     yes     none

Each IO module defines one or more of the following functions and variables:

read(afile, alphabet=None)
    Read a file of sequence data and return a SeqList, a collection
    of Seq's (Alphabetic strings) and features.

read_seq(afile, alphabet=None)
    Read a single sequence from a file.

iter_seq(afile, alphabet=None)
    Iterate over the sequences in a file.

index(afile, alphabet=None)
    Instead of loading all of the sequences into memory, scan the file and
    return an index map that will load sequences on demand. Typically not
    implemented for formats with interleaved sequences.

write(afile, seqlist)
    Write a collection of sequences to the specified file.

write_seq(afile, seq)
    Write one sequence to the file. Only implemented for non-interleaved,
    headerless formats, such as fasta and plain.

example
    A string containing a short example of the file format

names
    A list of synonyms for the file format. e.g. for fasta_io, ( 'fasta',
    'pearson', 'fa'). The first entry is the preferred format name.

extensions
    A list of file name extensions used for this file format. e.g.
    fasta_io.extensions is ('fa', 'fasta', 'fast', 'seq', 'fsa', 'fst', 'nt',
    'aa','fna','mpfa'). The preferred or standard extension is first in the
    list.


Attributes :
- formats   -- Available seq_io format parsers
- format_names -- A map between format names and format parsers.
- format_extensions -- A map between filename extensions and parsers.

"""

# Dev. References :
#
# - http://iubio.bio.indiana.edu/soft/molbio/readseq/java/Readseq2-help.html
# - http://www.ebi.ac.uk/help/formats_frame.html
# - http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html
# - http://bioperl.org/HOWTOs/html/SeqIO.html
# - http://emboss.sourceforge.net/docs/themes/SequenceFormats.html
# - http://www.cse.ucsc.edu/research/compbio/a2m-desc.html (a2m)
# - http://www.genomatix.de/online_help/help/sequence_formats.html

from corebio.seq import *

import clustal_io
import fasta_io
import msf_io
import nbrf_io
import nexus_io
import plain_io
import phylip_io
#import null_io
import stockholm_io
import intelligenetics_io
import table_io
import array_io
import genbank_io

__all__ = [
    'clustal_io',
    'fasta_io',
    'msf_io',
    'nbrf_io',
    'nexus_io',
    'plain_io',
    'phylip_io',
    'null_io',
    'stockholm_io',
    'intelligenetics_io',
    'table_io',
    'array_io',
    'genbank_io',
    'read',
    'formats',
    'format_names',
    'format_extensions',
    ]

formats = (clustal_io, fasta_io, plain_io, msf_io, genbank_io, nbrf_io,
           nexus_io, phylip_io, stockholm_io, intelligenetics_io, table_io,
           array_io)
"""Available seq_io formats"""


def format_names():
    """Return a map between format names and format modules.

    The map is rebuilt on every call from the ``names`` attribute of each
    module listed in ``formats``.

    raises :
        AssertionError - If two format modules claim the same name.
    """
    fnames = {}
    for f in formats:
        for name in f.names:
            # Sanity check: a name must identify exactly one format module.
            assert name not in fnames
            fnames[name] = f
    return fnames


def format_extensions():
    """Return a map between filename extensions and sequence file types.

    The map is rebuilt on every call from the ``extensions`` attribute of
    each module listed in ``formats``.

    raises :
        AssertionError - If two format modules claim the same extension.
    """
    fext = {}
    for f in formats:
        for ext in f.extensions:
            # Sanity check: an extension must identify exactly one module.
            assert ext not in fext
            fext[ext] = f
    return fext


# seq_io._parsers is an ordered list of sequence parsers that are tried, in
# turn, on files of unknown format. Each parser must raise an exception when
# fed a format further down the list.
#
# The general trend is most common to least common file format. However,
# 'nbrf_io' is before 'fasta_io' because nbrf looks like fasta with extras, and
# 'array_io' is last, since it is very general.
#
# Note that plain_io and intelligenetics_io are in 'formats' but not in this
# list: they are only tried when the file name points to them (see
# _get_parsers).
_parsers = (nbrf_io, fasta_io, clustal_io, phylip_io, genbank_io,
            stockholm_io, msf_io, nexus_io, table_io, array_io)


def _get_parsers(fin):
    """Return the list of parser modules to try on 'fin', best guess first.

    The result is a fresh copy of ``_parsers``. If 'fin' has a ``name``
    attribute containing a '.', the text after the last '.' is looked up
    (case-sensitively) first among the format names and then among the format
    extensions; a match is moved, or inserted, at the front of the list.
    Without a match the default order is returned unchanged.
    """
    fnames = format_names()
    fext = format_extensions()
    parsers = list(_parsers)
    best_guess = parsers[0]

    # If a filename is supplied use the extension to guess the format.
    if hasattr(fin, "name") and '.' in fin.name:
        extension = fin.name.split('.')[-1]
        if extension in fnames:
            best_guess = fnames[extension]
        elif extension in fext:
            best_guess = fext[extension]

    # The guess may be a module that is absent from the default list (e.g.
    # plain_io), so only remove it when present, but always put it first.
    if best_guess in parsers:
        parsers.remove(best_guess)
    parsers.insert(0, best_guess)

    return parsers


def read(fin, alphabet=None):
    """ Read a sequence file and attempt to guess its format.
    First the filename extension (if available) is used to infer the format.
    If that fails, then we attempt to parse the file using several common
    formats.

    arguments :
        fin      - An open file-like object. It must support seek(0), since
                   the stream is rewound after each failed parse attempt.
        alphabet - Passed through Alphabet() and then on to each parser's
                   read(); None is accepted.
    returns :
        SeqList
    raises :
        ValueError - If the file cannot be parsed.
        ValueError - Sequence do not conform to the alphabet.
    """
    alphabet = Alphabet(alphabet)
    # Compute the candidate list once; it drives both the attempts below and
    # the error message.
    parsers = _get_parsers(fin)

    for p in parsers:
        try:
            return p.read(fin, alphabet)
        except ValueError:
            pass
        # Rewind so that the next parser sees the file from the start.
        fin.seek(0)  # FIXME. Non seekable stdin?

    names = ", ".join([p.names[0] for p in parsers])
    raise ValueError("Cannot parse sequence file: Tried %s " % names)