Mercurial > repos > cafletezbrant > kmersvm
comparison kmersvm/scripts/libkmersvm.py @ 0:7fe1103032f7 draft
Uploaded
author | cafletezbrant |
---|---|
date | Mon, 20 Aug 2012 18:07:22 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7fe1103032f7 |
---|---|
1 """ | |
2 libkmersvm.py; common library for kmersvm_train.py and kmersvm_classify.py | |
3 Copyright (C) 2011 Dongwon Lee | |
4 | |
5 This program is free software: you can redistribute it and/or modify | |
6 it under the terms of the GNU General Public License as published by | |
7 the Free Software Foundation, either version 3 of the License, or | |
8 (at your option) any later version. | |
9 | |
10 This program is distributed in the hope that it will be useful, | |
11 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 GNU General Public License for more details. | |
14 | |
15 You should have received a copy of the GNU General Public License | |
16 along with this program. If not, see <http://www.gnu.org/licenses/>. | |
17 """ | |
18 | |
19 import sys | |
20 import os | |
21 import os.path | |
22 import optparse | |
23 | |
24 from bitarray import bitarray | |
25 | |
26 def bitarray_fromfile(filename): | |
27 """ | |
28 """ | |
29 fh = open(filename, 'rb') | |
30 bits = bitarray() | |
31 bits.fromfile(fh) | |
32 | |
33 return bits, fh | |
34 | |
35 def generate_kmers(kmerlen): | |
36 """make a full list of k-mers | |
37 | |
38 Arguments: | |
39 kmerlen -- integer, length of k-mer | |
40 | |
41 Return: | |
42 a list of the full set of k-mers | |
43 """ | |
44 | |
45 nts = ['A', 'C', 'G', 'T'] | |
46 kmers = [] | |
47 kmers.append('') | |
48 l = 0 | |
49 while l < kmerlen: | |
50 imers = [] | |
51 for imer in kmers: | |
52 for nt in nts: | |
53 imers.append(imer+nt) | |
54 kmers = imers | |
55 l += 1 | |
56 | |
57 return kmers | |
58 | |
59 | |
60 def revcomp(seq): | |
61 """get reverse complement DNA sequence | |
62 | |
63 Arguments: | |
64 seq -- string, DNA sequence | |
65 | |
66 Return: | |
67 the reverse complement sequence of the given sequence | |
68 """ | |
69 rc = {'A':'T', 'G':'C', 'C':'G', 'T':'A'} | |
70 return ''.join([rc[seq[i]] for i in xrange(len(seq)-1, -1, -1)]) | |
71 | |
72 | |
73 def generate_rcmap_table(kmerlen, kmers): | |
74 """make a lookup table for reverse complement k-mer ids for speed | |
75 | |
76 Arguments: | |
77 kmerlen -- integer, length of k-mer | |
78 kmers -- list, a full set of k-mers generated by generate_kmers | |
79 | |
80 Return: | |
81 a dictionary containing the mapping table | |
82 """ | |
83 revcomp_func = revcomp | |
84 | |
85 kmer_id_dict = {} | |
86 for i in xrange(len(kmers)): | |
87 kmer_id_dict[kmers[i]] = i | |
88 | |
89 revcomp_mapping_table = [] | |
90 for kmerid in xrange(len(kmers)): | |
91 rc_id = kmer_id_dict[revcomp_func(kmers[kmerid])] | |
92 if rc_id < kmerid: | |
93 revcomp_mapping_table.append(rc_id) | |
94 else: | |
95 revcomp_mapping_table.append(kmerid) | |
96 | |
97 return revcomp_mapping_table | |
98 | |
99 | |
100 def read_fastafile(filename, subs=True): | |
101 """Read sequences from a file in FASTA format | |
102 | |
103 Arguments: | |
104 filename -- string, the name of the sequence file in FASTA format | |
105 subs -- bool, substitute 'N' with 'A' if set true | |
106 | |
107 Return: | |
108 list of sequences, list of sequence ids | |
109 """ | |
110 | |
111 sids = [] | |
112 seqs = [] | |
113 | |
114 try: | |
115 f = open(filename, 'r') | |
116 lines = f.readlines() | |
117 f.close() | |
118 | |
119 except IOError, (errno, strerror): | |
120 print "I/O error(%d): %s" % (errno, strerror) | |
121 sys.exit(0) | |
122 | |
123 seq = [] | |
124 for line in lines: | |
125 if line[0] == '>': | |
126 sids.append(line[1:].rstrip('\n').split()[0]) | |
127 if seq != []: seqs.append("".join(seq)) | |
128 seq = [] | |
129 else: | |
130 if subs: | |
131 seq.append(line.rstrip('\n').upper().replace('N', 'A')) | |
132 else: | |
133 seq.append(line.rstrip('\n').upper()) | |
134 | |
135 if seq != []: | |
136 seqs.append("".join(seq)) | |
137 | |
138 return seqs, sids |