comparison kmersvm/scripts/libkmersvm.py @ 0:7fe1103032f7 draft

Uploaded
author cafletezbrant
date Mon, 20 Aug 2012 18:07:22 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7fe1103032f7
1 """
2 libkmersvm.py; common library for kmersvm_train.py and kmersvm_classify.py
3 Copyright (C) 2011 Dongwon Lee
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 """
18
19 import sys
20 import os
21 import os.path
22 import optparse
23
24 from bitarray import bitarray
25
26 def bitarray_fromfile(filename):
27 """
28 """
29 fh = open(filename, 'rb')
30 bits = bitarray()
31 bits.fromfile(fh)
32
33 return bits, fh
34
35 def generate_kmers(kmerlen):
36 """make a full list of k-mers
37
38 Arguments:
39 kmerlen -- integer, length of k-mer
40
41 Return:
42 a list of the full set of k-mers
43 """
44
45 nts = ['A', 'C', 'G', 'T']
46 kmers = []
47 kmers.append('')
48 l = 0
49 while l < kmerlen:
50 imers = []
51 for imer in kmers:
52 for nt in nts:
53 imers.append(imer+nt)
54 kmers = imers
55 l += 1
56
57 return kmers
58
59
60 def revcomp(seq):
61 """get reverse complement DNA sequence
62
63 Arguments:
64 seq -- string, DNA sequence
65
66 Return:
67 the reverse complement sequence of the given sequence
68 """
69 rc = {'A':'T', 'G':'C', 'C':'G', 'T':'A'}
70 return ''.join([rc[seq[i]] for i in xrange(len(seq)-1, -1, -1)])
71
72
73 def generate_rcmap_table(kmerlen, kmers):
74 """make a lookup table for reverse complement k-mer ids for speed
75
76 Arguments:
77 kmerlen -- integer, length of k-mer
78 kmers -- list, a full set of k-mers generated by generate_kmers
79
80 Return:
81 a dictionary containing the mapping table
82 """
83 revcomp_func = revcomp
84
85 kmer_id_dict = {}
86 for i in xrange(len(kmers)):
87 kmer_id_dict[kmers[i]] = i
88
89 revcomp_mapping_table = []
90 for kmerid in xrange(len(kmers)):
91 rc_id = kmer_id_dict[revcomp_func(kmers[kmerid])]
92 if rc_id < kmerid:
93 revcomp_mapping_table.append(rc_id)
94 else:
95 revcomp_mapping_table.append(kmerid)
96
97 return revcomp_mapping_table
98
99
100 def read_fastafile(filename, subs=True):
101 """Read sequences from a file in FASTA format
102
103 Arguments:
104 filename -- string, the name of the sequence file in FASTA format
105 subs -- bool, substitute 'N' with 'A' if set true
106
107 Return:
108 list of sequences, list of sequence ids
109 """
110
111 sids = []
112 seqs = []
113
114 try:
115 f = open(filename, 'r')
116 lines = f.readlines()
117 f.close()
118
119 except IOError, (errno, strerror):
120 print "I/O error(%d): %s" % (errno, strerror)
121 sys.exit(0)
122
123 seq = []
124 for line in lines:
125 if line[0] == '>':
126 sids.append(line[1:].rstrip('\n').split()[0])
127 if seq != []: seqs.append("".join(seq))
128 seq = []
129 else:
130 if subs:
131 seq.append(line.rstrip('\n').upper().replace('N', 'A'))
132 else:
133 seq.append(line.rstrip('\n').upper())
134
135 if seq != []:
136 seqs.append("".join(seq))
137
138 return seqs, sids