annotate corebio/seq_io/table_io.py @ 14:778f03497adb

Uploaded
author davidmurphy
date Fri, 24 Feb 2012 11:37:26 -0500
parents c55bdc2fb9fa
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
1 #!/usr/bin/env python
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
2
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
3 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com>
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
4 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
5 # This software is distributed under the MIT Open Source License.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
6 # <http://www.opensource.org/licenses/mit-license.html>
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
7 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
8 # Permission is hereby granted, free of charge, to any person obtaining a
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
9 # copy of this software and associated documentation files (the "Software"),
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
10 # to deal in the Software without restriction, including without limitation
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
11 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
12 # and/or sell copies of the Software, and to permit persons to whom the
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
13 # Software is furnished to do so, subject to the following conditions:
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
14 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
15 # The above copyright notice and this permission notice shall be included
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
16 # in all copies or substantial portions of the Software.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
17 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
19 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
20 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
21 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
22 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
23 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
24 # THE SOFTWARE.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
25 #
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
26
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
27 """Read and write sequence information in tab delimited format.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
28
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
29 This very simple format has two columns per line. The first column is a sequence name, the second column is the sequence itself. The columns are separated by a single tab ("\\t") character.
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
30
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
31 """
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
32 from corebio.utils import *
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
33 from corebio.seq import *
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
34 from corebio.seq_io import *
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
35
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
36
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
# Names by which this format is known.
# NOTE(review): presumably consulted by the corebio.seq_io format registry -- confirm.
names = ('table', 'tab')

# File name extensions associated with this format.
# The trailing comma matters: ('tbl') is just the string 'tbl', so iterating
# it would yield the single characters 't', 'b' and 'l' instead of 'tbl'.
extensions = ('tbl',)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
39
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
40
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
# Ten sample records in the two-column layout this module reads and writes:
# a sequence name, one tab character, then the sequence itself.
# The separator is written as an explicit "\t" escape so that it cannot be
# silently turned into spaces by an editor or by copy-and-paste; iterseq()
# rejects any non-blank line that does not split into two tab-separated columns.
example = """
EC0001\tMKRISTTITTTITITTGNGAG
EC0002\tMRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAM
EC0003\tMVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLG
EC0004\tMKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEMLKLD
EC0005\tMKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYWDGGH
EC0006\tMLILISPAKTLDYQSPLTTTRYTLPELLDNSQQLIHEARKLTPPQISTLM
EC0007\tMPDFFSFINSVLWGSVMIYLLFGAGCWFTFRTGFVQFRYIRQFGKSLKNS
EC0008\tMTDKLTSLRQYTTVVADTGDIAAMKLYQPQDATTNPSLILNAAQIPEYRK
EC0009\tMNTLRIGLVSISDRASSGVYQDKGIPALEEWLTSALTTPFELETRLIPDE
EC0010\tMGNTKLANPAPLGLMGFGMTTILLNLHNVGYFALDGIILAMGIFYGGIAQ
"""
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
53
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
54
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
55
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
56
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def read(fin, alphabet=None):
    """Parse a whole tab delimited file into a SeqList.

    Every sequence produced by iterseq() is collected, in file order,
    and the collection is wrapped in a SeqList.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data, if given
    Returns:
        SeqList -- A list of sequences
    Raises:
        ValueError -- If the file is unparsable
    """
    return SeqList(list(iterseq(fin, alphabet)))
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
70
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
71
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def iterseq(fin, alphabet=None):
    """Parse a tab delimited file and generate sequences.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped. Every remaining line must consist of
    exactly two columns separated by a single tab: the sequence name
    followed by the sequence itself.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data, if given
    Yields:
        Seq -- One alphabetic sequence at a time.
    Raises:
        ValueError -- If a line does not contain exactly two tab separated
            columns. The message reports the 1-based line number.
    """
    alphabet = Alphabet(alphabet)

    for lineno, line in enumerate(fin):
        line = line.strip()
        if line == '':
            continue

        columns = line.split('\t')
        if len(columns) != 2:
            # enumerate() counts from zero, but people (and editors) count
            # lines from one, so report lineno + 1.
            raise ValueError("Parse failed on line %d: did not find two "
                             "columns seperated by a tab." % (lineno + 1))
        yield Seq(columns[1], alphabet=alphabet, name=columns[0])
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
94
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
95
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def write(fout, seqs):
    """Write sequences as a two column, tab delimited file.

    Each sequence is handed to writeseq() in turn, in iteration order.

    Args:
        fout -- A writable stream.
        seqs -- A list of Seq's
    """
    for record in seqs:
        writeseq(fout, record)
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
104
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
105
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
def writeseq(fout, seq):
    """Write a single sequence as one line of a tab delimited table.

    The line consists of the sequence name, a single tab character, the
    sequence itself, and a terminating newline. A sequence whose name is
    empty or None is written with an empty name column.

    Args:
        fout -- A writable stream.
        seq -- A Seq instance
    """
    name = seq.name or ''
    # A Python 2 "print >>fout, a, b, c" statement puts a space between the
    # items, which would pad both columns ("name \t seq") and break the
    # single-tab format. Writing the formatted line directly avoids that and
    # behaves the same on Python 2 and Python 3.
    fout.write("%s\t%s\n" % (name, seq))
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
116
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
117
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
118
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
119
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
120
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
121
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
122
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
123
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
124
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
125
c55bdc2fb9fa Uploaded
davidmurphy
parents:
diff changeset
126