annotate lib/seqtools.py @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
2 import logging
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
3 logger = logging.getLogger(__name__)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
4 import itertools
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
5 import os
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
6 import sys
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
7 import random
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
8 import sqlite3
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
9 import subprocess
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
10 import shlex # for command line arguments split
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
11 from collections import namedtuple
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
12 from lib.utils import format_query
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
13 from lib.parallel.parallel import parallel2 as parallel
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
14 from lib.parallel.parallel import get_max_proc
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
15 REQUIRED_VERSION = (3, 4)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
16 MAX_PRINT = 10
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
17
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
18 if sys.version_info < REQUIRED_VERSION:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
19 raise Exception("\n\npython 3.4 or higher is required!\n")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
20
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
21 # additional functions
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
22
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
23
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
24
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
25 def _hitsort_worker(query, blastdb, output, options):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
26
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
27 # output from blast is parsed from stdout
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
28 cmd = options.all2all_search_params.format(query = query, blastdb = blastdb)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
29 print(cmd)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
30 min_lcov, min_pid, min_ovl, min_scov, evalue_max = options.filtering_threshold
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
31 pair2insert = ''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
32 signs ={'plus':'+', 'minus':'-'}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
33 with open(output, 'w') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
34 with subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE) as p:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
35 for i in p.stdout:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
36 items = i.decode('utf-8').split()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
37 if options.filter_self_hits:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
38 if items[4] == items[0]:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
39 continue
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
40 evalue = float(items[10])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
41 ovl_q = abs(float(items[2]) - float(items[3])) + 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
42 ovl_h = abs(float(items[6]) - float(items[7])) + 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
43 if (ovl_q >= min_ovl or ovl_h >= min_ovl) and float(items[8]) >= min_pid:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
44 if float(items[1]) >= float(items[5]):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
45 lcov = ovl_q * 100.0 / float(items[1])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
46 scov = ovl_h * 100.0 / float(items[5])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
47 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
48 lcov = ovl_q * 100.0 / float(items[5])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
49 scov = ovl_h * 100.0 / float(items[1])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
50 # positive line:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
51 if lcov >= min_lcov and scov >= min_scov and evalue_max > evalue:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
52 pr = [items[0], items[4]]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
53 # previous HSP
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
54 if pair2insert != "{0}\t{1}".format(pr[0], pr[1]):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
55 pair2insert = "{0}\t{1}".format(pr[0], pr[1])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
56 if items[11] in ['plus', 'minus']:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
57 items[11] = signs[items[11]]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
58 f.write("{0}\t{1}\t{2}\n".format(pair2insert, items[9], "\t".join(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
59 items[k] for k in [1, 2, 3, 5, 6, 7, 8, 10, 11])))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
60
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
61 def blast_with_filter(fasta_file_filter, blastdb):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
62 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
63 Return list of sequences query id which has similarity to filter
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
64 It uses mgblast for search
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
65 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
66 params = '-p 85 -W18 -UT -X40 -KT -JF -F "m D" -v100000000 -b100000000 -D4 -C 30 -H 30'
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
67 positive_hits = set()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
68 min_pid = 90
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
69 min_ovl_perc = 90
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
70 cmd = " ".join(['mgblast',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
71 '-i', fasta_file_filter,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
72 '-d', blastdb,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
73 params
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
74 ])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
75 with subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE) as p:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
76 for i in p.stdout:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
77 items = i.decode('utf-8').split()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
78 ovl_perc = (abs(float(items[6]) - float(items[7])) + 1) / float(items[5]) * 100
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
79 pid = float(items[8])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
80 if pid > min_pid and ovl_perc > min_ovl_perc:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
81 positive_hits.add(items[4])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
82 return(positive_hits)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
83
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
84
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
85
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
86 # TODO test task
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
87 # predefine blast params
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
88 def blast_worker(query, blastdb, output, params):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
89 if params['program'] in ['blastx', 'blastn']:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
90 default_columns = ' -outfmt "6 {}"'.format(params['output_columns'])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
91 cmd = "{} -query {} -db {} {} {}".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
92 params['program'], query, blastdb, params['args'], default_columns)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
93 elif params['program']=='diamond blastx':
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
94 # diomand have slightly different format than blastx
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
95 default_columns = ' --outfmt 6 {}'.format(params['output_columns'])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
96 cmd = "{} -q {} -d {} {} {}".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
97 params['program'], query, blastdb, params['args'], default_columns)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
98 print(cmd)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
99 preceeding_pair = ['', '']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
100 BlastValues = namedtuple("BlastValues", params['output_columns'])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
101 with open(output, 'wb') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
102 with subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE) as p:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
103 for i in p.stdout:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
104 items = i.decode('utf-8').split()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
105 blast_values = BlastValues(*[f(i) for i, f in zip(items, params['column_types'])])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
106 #qseqid, sseqid, qlen, slen, length, ppos, bitscore = [
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
107 # f(i) for i, f in zip(items, params['column_types'])]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
108 if params['filter_function'](blast_values):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
109 if preceeding_pair != [blast_values.qseqid, blast_values.sseqid]:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
110 f.write(i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
111 preceeding_pair = [blast_values.qseqid, blast_values.sseqid]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
112
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
113
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
114 class Sequence:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
115
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
116 def __init__(self, seq, name, paired=False):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
117 # the mode os seq storage can be changed later to make it more
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
118 # memory efficient
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
119 self._seq = bytes(str(seq), "ascii")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
120 self.name = str(name)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
121
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
122 @property
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
123 def seq(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
124 return self._seq.decode("utf-8")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
125
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
126 @seq.setter
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
127 def seq(self, value):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
128 self._seq = bytes(str(value), "ascii")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
129
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
130 def __str__(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
131 return "{0} : {1}".format(self.name, self.seq)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
132
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
133 @staticmethod
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
134 def read_fasta(fasta_file_name):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
135 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
136 generator - reads sequences from fasta file
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
137 return sequence one by one
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
138 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
139 with open(fasta_file_name, 'r') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
140 header = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
141 seqstr = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
142 for rawline in f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
143 line = rawline.strip()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
144 if line == "":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
145 continue
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
146 if line[0] == ">":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
147 if header and seqstr:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
148 yield Sequence(seqstr, header)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
149 # reset
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
150 seqstr = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
151 header = line[1:]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
152 elif seqstr:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
153 Warning("sequence was not preceeded by header")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
154 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
155 header = line[1:]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
156 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
157 seqstr = line if not seqstr else seqstr + line
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
158 # skip empty lines:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
159 if header and seqstr:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
160 yield Sequence(seqstr, header)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
161 return
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
162
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
163 def write2fasta(self, file_object):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
164 file_object.write(">{0}\n{1}\n".format(self.name, self.seq))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
165
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
166
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
167 class SequenceSet:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
168
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
169 def __init__(self, filename=None, source=None, sample_size=0, new=False, paired=False, prefix_length=0, rename=False, seed=123, fasta=None):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
170 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
171 filename: path to sqlite database, if none database is stored in memory
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
172 source: path to fasta file, if none empty database is created
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
173 sample_size: use only sample of sample_size from fasta file, if 0 all is used
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
174 new: should be old database created?
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
175 paired - paired reads
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
176 prefix_length -int
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
177 rename -- use int as names
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
178
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
179 creates SequenceSet, either empty or from fasta file,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
180 it can be stored as dictionary or in shelve in file
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
181 only sample of all sequences can be loaded. if sample_size = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
182 all sequences are loaded
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
183 if source is give, new database is created - if filename exist - it will be deleted!
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
184 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
185 #======================================================================
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
186 # TODO name checking
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
187 # TODO detect unique prefixes
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
188 #======================================================================
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
189 # type of storage
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
190 random.seed(seed)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
191
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
192 self.source = source
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
193 self.sample_size = sample_size
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
194 self.paired = paired
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
195 self.filename = filename
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
196 self.prefix_length = prefix_length
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
197 self.prefix_codes = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
198 self.rename = rename
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
199 self.fasta = fasta
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
200 self._length = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
201 self._original_length = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
202 self.blastdb = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
203 self.blastdb_legacy = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
204 self.chunks = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
205 self.ids_kept = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
206 self.annotations = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
207 self.hitsort = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
208 if filename:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
209 logger.debug("Creating database in file")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
210 if os.path.isfile(filename) and (new or source):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
211 # remove old database if exists
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
212 os.remove(filename)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
213 # sqlite database in file
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
214 self.conn = sqlite3.connect(filename)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
215 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
216 # sqlite database in memory
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
217 logger.debug("Creating database in memory")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
218 self.conn = sqlite3.connect(":memory:")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
219 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
220 c.execute("create table sequences (name text, sequence text)")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
221
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
222 if source:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
223 self._read_from_fasta()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
224 c.execute("CREATE TABLE prefix_codes (prefix TEXT, N INTEGER)")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
225 self.conn.executemany("INSERT INTO prefix_codes (prefix, N) values (?,?)", self.prefix_codes.items())
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
226 self.conn.commit()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
227 self._commit_immediately = True
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
228 self.makeblastdb(legacy=True)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
229 # store some attribudes in database
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
230 self._update_database()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
231 logger.info("sequences loaded")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
232 logger.info(self)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
233
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
234 def _update_database(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
235 # pass all atributes - which are either float, ind and str to extra table
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
236 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
237 stored_attributes = [
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
238 ("sample_size", "integer"),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
239 ("_length", "integer"),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
240 ("_original_length", "integer"),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
241 ("paired", "integer"),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
242 ]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
243 columns = ", ".join(i[0] + " " + i[1] for i in stored_attributes)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
244 try:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
245 c.execute((
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
246 "CREATE TABLE seqinfo ( {} )".format(columns)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
247 ))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
248 except sqlite3.OperationalError:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
249 pass # table already exists
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
250 # get current values
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
251 values = tuple(getattr(self, i[0]) for i in stored_attributes)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
252 placeholder = ", ".join(["?"] * len(values))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
253 c.execute("DELETE FROM seqinfo")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
254 c.execute("INSERT INTO seqinfo VALUES ({})".format(placeholder), values)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
255
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
256
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
257 def _read_from_fasta(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
258 # changed will be commited after whole file is loaded - this is faster
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
259 self._commit_immediately = False
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
260 if self.fasta:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
261 f = open(self.fasta, mode='w')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
262 # define sampling
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
263 if self.sample_size:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
264 logger.debug(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
265 'sampling sequences: {} sample size'.format(self.sample_size))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
266 N = self.fasta_length(self.source)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
267 if self.paired:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
268 s = itertools.chain(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
269 [[i, i + 1] for i in sorted(random.sample(range(1, N, 2), int(self.sample_size / 2)))])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
270
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
271 sample = itertools.chain(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
272 [item for sublist in s for item in sublist])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
273
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
274 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
275 sample = itertools.chain(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
276 sorted(random.sample(range(1, N + 1), self.sample_size)))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
277 logger.debug("sampling unpaired reads")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
278 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
279 # no sampling - use all
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
280 sample = itertools.count(1)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
281
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
282 # define counter:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
283 if self.rename:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
284 if self.paired:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
285 counter = itertools.count(1, 0.5)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
286 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
287 counter = itertools.count(1, 1)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
288 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
289 counter = itertools.cycle([""])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
290 # define pairs:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
291 if self.paired:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
292 pair = itertools.cycle(['f', 'r'])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
293 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
294 pair = itertools.cycle([''])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
295 position = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
296
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
297 seq2write = next(sample)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
298
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
299 for i in Sequence.read_fasta(self.source):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
300 position += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
301 if position == seq2write:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
302
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
303 # create name:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
304 prefix = i.name[0:self.prefix_length] # could be empty ""
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
305 if self.rename:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
306 i.name = "{0}{1}{2}".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
307 prefix,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
308 int(next(counter)),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
309 next(pair)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
310 )
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
311 if self.prefix_length:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
312 if prefix in self.prefix_codes:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
313 self.prefix_codes[prefix] += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
314 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
315 self.prefix_codes[prefix] = 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
316 self[i.name] = i.seq
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
317 if self.fasta:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
318 i.write2fasta(f)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
319 try:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
320 seq2write = next(sample)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
321 except StopIteration:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
322 # no more sequences to sample
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
323 break
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
324
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
325
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
326 self.conn.commit() # save changes
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
327 if self.fasta:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
328 f.close()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
329 self._commit_immediately = True
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
330
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
331 def __getitem__(self, item):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
332 # item cannot be empty!!!
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
333 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
334 c.execute("select * from sequences where name= '{}'".format(item))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
335 # try:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
336 name, sequence = c.fetchone() # should be only one
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
337 # except TypeError:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
338 # return None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
339 return sequence
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
340
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
341 def __setitem__(self, item, value):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
342 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
343 c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
344 "insert into sequences values ( '{0}', '{1}')".format(item, value))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
345 if self._commit_immediately:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
346 self.conn.commit() # save changes
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
347
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
348 # def __iter__(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
349 # self.c.execute("select name from sequences")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
350 # for i in self.c:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
351 # yield i[0]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
352
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
353 def __iter__(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
354 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
355 c.execute("select name from sequences")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
356 for i in c:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
357 yield i[0]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
358
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
359
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
360 # def __iter__(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
361 # for i in range(1,len(self)):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
362 # yield i
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
363 def items(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
364 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
365 c.execute("select name, sequence from sequences")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
366 for name, seq in c:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
367 yield Sequence(seq, name)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
368
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
369 def toDict(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
370 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
371 c.execute("select name, sequence from sequences")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
372 d = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
373 for name, seq in c:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
374 d[name]=seq
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
375 return(d)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
376
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
377 def __len__(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
378 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
379 c.execute("select count(*) from sequences")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
380 length = c.fetchone()[0]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
381 return(length)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
382
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
383 def __str__(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
384 out = ""
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
385 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
386 c.execute("select * from sequences limit {0}".format(MAX_PRINT))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
387 for n, s in c:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
388 out = out + "{0} : {1}\n".format(n, s)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
389 out = out + "...\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
390 out = out + str(len(self)) + " sequence total\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
391 return out
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
392
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
393 def insert(self, sequence, commit=True):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
394 self._commit_immediately = commit
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
395 self[sequence.name] = sequence.seq
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
396 self._commit_immediately = True
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
397
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
398 def keys(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
399 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
400 this will get names of sequences
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
401 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
402 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
403 c.execute('select name from sequences order by rowid')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
404 names = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
405 for i in c.fetchall():
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
406 names.append(i[0])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
407 return names
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
408
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
409 def sample(self, size, seed=123):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
410 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
411 generator - reproducible sampling sequences
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
412 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
413 max_index = len(self)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
414 # sample = seqtools.SequenceSet(source=info.input_sequences, )
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
415 if self.paired:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
416 size = int(size / 2)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
417 step = 2
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
418 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
419 step = 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
420 random.seed(seed)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
421 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
422 for i in sorted(random.sample(range(1, max_index, step), size)):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
423 c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
424 "select name, sequence from sequences where rowid={}".format(i))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
425 name, sequence = c.fetchone()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
426 yield Sequence(sequence, name)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
427 if self.paired:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
428 c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
429 "select name, sequence from sequences where rowid={}".format(i + 1))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
430 name, sequence = c.fetchone()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
431 yield Sequence(sequence, name)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
432
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
433 def sample2db(self, db_file=None, fasta_file_name=None, size=100, seed=123):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
434 if (not db_file) and (not fasta_file_name):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
435 raise IOError("no db_file nor fasta_file_name were defined")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
436 # open files
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
437 if db_file:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
438 db = SequenceSet(filename=db_file, source=None, new=True)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
439 if fasta_file_name:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
440 f = open(fasta_file_name, 'w')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
441 # write in files
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
442 for i in self.sample(size, seed):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
443 if db_file:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
444 db.insert(i, commit=False)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
445 if fasta_file_name:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
446 i.write2fasta(f)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
447 # close files
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
448 if fasta_file_name:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
449 f.close()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
450 if db_file:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
451 db.conn.commit()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
452 return db
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
453
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
454 def save2fasta(self, fasta_file_name, keep=True, subset=None):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
455 if subset:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
456 with open(fasta_file_name, 'w') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
457 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
458 c.execute("select name, sequence from sequences where name in ('{}')".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
459 "','".join(subset)))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
460 for name, sequence in c:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
461 s = Sequence(sequence, name)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
462 s.write2fasta(f)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
463 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
464 with open(fasta_file_name, 'w') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
465 for i in self.items():
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
466 i.write2fasta(f)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
467 if keep:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
468 self.fasta = fasta_file_name
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
469
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
470 def save_annotation(self, annotation_name, subset, dir):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
471 annotation_file = "{}/{}_annotation.csv".format(dir, annotation_name)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
472 with open(annotation_file, "w") as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
473 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
474 c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
475 "select * from {} where name in ('{}')".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
476 annotation_name, "','".join(subset)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
477 )
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
478 )
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
479 header = [description[0] for description in c.description]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
480 f.write("\t".join(str(j) for j in header) + "\n")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
481 for i in c:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
482 f.write("\t".join(str(j) for j in i) + "\n")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
483 return annotation_file
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
484
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
485 def make_chunks(self, file_name=None, chunk_size=5000):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
486 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
487 split SequneceSet to chunks and save to files,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
488 return list of files names
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
489 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
490 logger.debug("creating chunks from fasta file: ")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
491 if not file_name and self.fasta:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
492 file_name = self.fasta
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
493 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
494 raise Exception('file name for chunks is not defined!')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
495
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
496 seqs = iter(self.items())
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
497 fn_chunk_names = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
498 for i in itertools.count():
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
499 try:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
500 fn = "{0}.{1}".format(file_name, i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
501 logger.info("saving chunk {}".format(fn))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
502 for n in range(chunk_size):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
503 s = next(seqs)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
504 if not n:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
505 f = open(fn, 'w')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
506 s.write2fasta(f)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
507 f.close()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
508 fn_chunk_names.append(fn)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
509 except StopIteration:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
510 if not f.closed:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
511 f.close()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
512 fn_chunk_names.append(fn)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
513 break
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
514 self.chunks = fn_chunk_names
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
515 logger.debug(self.chunks)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
516 return(fn_chunk_names)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
517
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
518 def makeblastdb(self, blastdb=None, dbtype='nucl', legacy=False, lastdb = False):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
519 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
520 dbtype eithe 'nucl' of 'prot'
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
521 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
522 logger.info("creating blast database")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
523 # check if fasta file on disk is present
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
524 if not self.fasta:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
525 try:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
526 self.save2fasta(blastdb)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
527 except TypeError:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
528 print("\nEither blastdb or fasta must be ",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
529 "defined when making blast database!!!\n")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
530 raise
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
531 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
532 blastdb = blastdb if blastdb else self.fasta
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
533
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
534 cmd = "makeblastdb -in {0} -out {1} -dbtype {2}".format(self.fasta,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
535 blastdb,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
536 dbtype)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
537 args = shlex.split(cmd)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
538 if 0 == subprocess.check_call(args, stderr=sys.stdout):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
539 self.blastdb = blastdb
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
540 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
541 Warning("makeblastdb failed")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
542
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
543 if legacy:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
544 logger.info("creating legacy blast database")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
545 prot = {'nucl': 'F', 'prot': 'T'}[dbtype]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
546 cmd = "formatdb -i {0} -p {1} -n {2}.legacy".format(self.fasta,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
547 prot,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
548 blastdb
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
549 )
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
550 args = shlex.split(cmd)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
551 if subprocess.check_call(args) == 0:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
552 self.blastdb_legacy = "{}.legacy".format(blastdb)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
553 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
554 Warning("formatdb failed")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
555 if lastdb:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
556 logger.info("creating lastdb database")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
557 cmd = "lastdb -u NEAR {fasta} {fasta}".format(fasta=self.fasta)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
558 args = shlex.split(cmd)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
559 if subprocess.check_call(args) == 0:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
560 self.lastdb = self.fasta
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
561 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
562 Warning("formatdb failed")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
563
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
564
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
565 def create_hitsort(self, options, output=None):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
566 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
567 query is name of files to search against blastdb of SequenceSet object
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
568 query could be multiple files
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
569 this is inteded to be used for tabular output only
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
570 output file must be specified,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
571 if more than one query is used, multiple files are created
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
572 worker - function to execute blast search
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
573 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
574
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
575 logger.info('running all to all blast')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
576 query = self.chunks
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
577 if not output:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
578 output = "{}.blast".format(self.fasta)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
579 database = getattr(self, options.database)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
580 if len(query) > 1:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
581 output_parts = [output + "." + str(i) for i in range(len(query))]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
582 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
583 output_parts = [output]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
584 query = [query]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
585
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
586 parallel(_hitsort_worker, query, [database],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
587 output_parts, [options])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
588 logger.info('all to all blast finished')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
589 logger.info('removing duplicates from all to all blast results')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
590 self._clean_hitsort(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
591 hitsort_files=output_parts, filtered_hitsort=output)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
592 # remove temp files:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
593 for f in output_parts:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
594 os.unlink(f)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
595 self.hitsort = output
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
596 return output
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
597
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
598 def _clean_hitsort(self, hitsort_files, filtered_hitsort):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
599 ''' alterantive hitsort duplicate removal '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
600
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
601 ids = self.keys()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
602 Nfiles = 5 + int(sum([os.path.getsize(i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
603 for i in hitsort_files]) / 5e9)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
604 hitsort_parts = [
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
605 "{0}.{1}.parts".format(filtered_hitsort, i) for i in range(Nfiles)]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
606 # open files
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
607 fs = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
608 for i in hitsort_parts:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
609 fs.append(open(i, 'w'))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
610
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
611 ids_destination = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
612 fs_iter = itertools.cycle(fs)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
613 for i in ids:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
614 #ids_destination[i] = fs_iter.next()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
615 ids_destination[i] = next(fs_iter)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
616 lines_out = lines_total = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
617 temp_open = False
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
618
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
619 for i in hitsort_files:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
620 f = open(i, 'r')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
621 while True:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
622 line = f.readline()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
623 if not line:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
624 break
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
625 lines_total += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
626 line_items = line.split()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
627 # ids_destination[line_items[0]].write(line)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
628 # do not assume that it is sorted!
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
629 ids_destination[min(line_items[0:2])].write(line)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
630 # close all files
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
631 for i in fs:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
632 i.close()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
633
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
634 # load one by one and exclude duplicates:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
635 hit_out = open(filtered_hitsort, 'w')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
636 for i in hitsort_parts:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
637 f = open(i, 'r')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
638 hitsort_shelve = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
639 while True:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
640 line = f.readline()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
641 if not line:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
642 break
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
643 line_items = line.split()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
644 key = "\t".join(sorted(line_items[0:2]))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
645 value = line_items
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
646 bitscore = float(line_items[2])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
647 if key in hitsort_shelve:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
648 if float(hitsort_shelve[key][2]) > bitscore:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
649 hitsort_shelve[key] = value
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
650 hit_out.write("\t".join(hitsort_shelve[key]) + "\n")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
651 del hitsort_shelve[key]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
652 lines_out += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
653 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
654 hitsort_shelve[key] = value
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
655 f.close()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
656 for key in hitsort_shelve:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
657 hit_out.write("\t".join(hitsort_shelve[key]) + "\n")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
658 lines_out += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
659 # clean up
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
660 for i in hitsort_parts:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
661 os.unlink(i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
662 hit_out.close()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
663 return lines_total, lines_out
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
664
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
665 def _check_database(self, database):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
666 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
667 database is path to fastafile, database with same prefix should exist
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
668 it creates database if does not exist as temporal file!
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
669 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
670 pass
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
671 # TODO
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
672
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
673 def remove_sequences_using_filter(self, fasta_file_filter, keep_proportion,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
674 omitted_sequences_file,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
675 kept_sequences_file):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
676 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
677 Run blast with fasta_file_filter against legaci blastdb of SequenceSet to detect sequences
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
678 which are to be removed from SequencesSet. Complete pairsare removed if paired sequences
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
679 are used.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
680 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
681 ids = blast_with_filter(fasta_file_filter, self.blastdb_legacy)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
682 # check against database and remove sequences
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
683
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
684 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
685 if self.paired:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
686 c.execute("SELECT rowid FROM sequences WHERE name IN {}".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
687 format_query(ids)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
688 ))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
689 odd_rowids = set()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
690 for i in c.fetchall():
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
691 if i[0] % 2 == 0:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
692 odd_rowids.add(i[0] - 1)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
693 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
694 odd_rowids.add(i[0])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
695 odd_rowids_keep = random.sample(odd_rowids, round(keep_proportion * len(odd_rowids)))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
696 c.execute("SELECT name FROM sequences WHERE rowid IN {} ORDER BY rowid".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
697 format_query(odd_rowids_keep + [i+1 for i in odd_rowids_keep])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
698 ))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
699 self.ids_kept = [i[0] for i in c.fetchall()]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
700 odd_rowids_omit = list(odd_rowids.difference(odd_rowids_keep))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
701 c.execute("SELECT name FROM sequences WHERE rowid IN {} ORDER BY rowid".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
702 format_query(odd_rowids_omit + [i+1 for i in odd_rowids_omit])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
703 ))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
704 ids_omit = [i[0] for i in c.fetchall()]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
705
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
706 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
707 self.ids_kept = random.sample(ids, round(keep_proportion * len(ids)))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
708 ids_omit = list(ids.difference(self.ids_kept))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
709 self.save2fasta(omitted_sequences_file, keep=False, subset=ids_omit)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
710 self.save2fasta(kept_sequences_file, keep=False, subset=self.ids_kept)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
711 self.drop_sequences(ids_omit)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
712 # save info about omited sequences
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
713 c.execute("CREATE TABLE ids_kept (name TEXT)")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
714 c.executemany("INSERT INTO ids_kept (name) values (?)", [(i,) for i in self.ids_kept])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
715 return len(ids_omit)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
716
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
717 def drop_sequences(self, ids_to_drop):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
718 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
719 remove sequences from sequence set based on the ID
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
720 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
721 self._original_length = len(self)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
722 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
723 idslist = format_query(ids_to_drop)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
724 c.execute("CREATE TABLE omited_sequences AS SELECT * FROM sequences WHERE name IN {}".format(idslist))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
725 c.execute("DELETE FROM sequences WHERE name IN {}".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
726 idslist
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
727 ))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
728 # new databases must be created - with ommited sequences!
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
729 self.save2fasta(fasta_file_name=self.fasta) ## this will replace origninal file!
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
730 self._length = len(self)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
731 self.makeblastdb(legacy=True)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
732 self._update_database()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
733
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
734 def annotate(self, database, annotation_name, directory="", params=""):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
735 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
736 database : path to blast formated database
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
737 method: type of search to use for annotation. it must be
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
738 'blastn' of 'blastx'
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
739 Annotation is save in directory also into the table with name stored in
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
740 annotation_name variable
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
741 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
742 logger.info("annotating reads with {} ".format(annotation_name))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
743 self._check_database(database)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
744 if "parallelize" in params:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
745 parallelize = params['parallelize']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
746 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
747 parallelize = True
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
748 if parallelize:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
749 if not self.chunks:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
750 self.make_chunks()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
751 output = [
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
752 "{}/{}_{}".format(directory, annotation_name,os.path.basename(i)) for i in self.chunks]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
753 parallel(blast_worker, self.chunks, [database], output, [params])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
754 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
755 # no explicit parallelization using parallel
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
756 single_output = "{}/{}_hits".format(directory, annotation_name)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
757 params['args'] = params['args'].format(max_proc=get_max_proc())
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
758 blast_worker(self.fasta, database, single_output,params)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
759 output = [single_output]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
760
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
761 # put into as new attribute and sqlite table
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
762 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
763 c.execute("create table {} (name text, db_id text, length real, bitscore real , pid real)".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
764 annotation_name))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
765 for blast_results in output:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
766 with open(blast_results, 'r') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
767 for i in f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
768 items = i.split()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
769 types = [str, str, float, float, float, float, float]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
770 qseqid, sseqid, qlen, slen, length, pident, bitscore = [
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
771 f(i) for i, f in zip(items, types)]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
772 c.execute('insert into {} values ("{}", "{}", "{}", "{}", "{}")'.format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
773 annotation_name, qseqid, sseqid, length, bitscore, pident))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
774 self.conn.commit()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
775 self.annotations.append(annotation_name)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
776
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
777 @staticmethod
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
778 def fasta_length(fasta_file_name):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
779 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
780 count number of sequences,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
781 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
782 with open(fasta_file_name, 'r') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
783 N = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
784 for i in f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
785 if i.strip()[0] == ">":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
786 N += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
787 return N
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
788
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
789
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
790 # test
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
791 if __name__ == "__main__":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
792 # get root directory
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
793
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
794 MAIN_DIR = os.path.realpath(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
795 os.path.dirname(os.path.realpath(__file__)) + "/..")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
796 print("MAIN_DIR:")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
797 print(MAIN_DIR)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
798 TMP_DIR = "{}/tmp".format(MAIN_DIR)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
799 TEST_DATA_DIR = "{}/test_data".format(MAIN_DIR)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
800
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
801 # some data:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
802 fn = "{}/pair_reads.fasta".format(TEST_DATA_DIR)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
803 os.chdir(TMP_DIR)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
804 # s = SequenceSet(filename='sqlite.db')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
805 print("number of sequences in fasta file:")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
806 print(SequenceSet.fasta_length(fn))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
807
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
808 print("loading sequences...")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
809 s = SequenceSet(source=fn, paired=True, rename=False,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
810 filename='sqlite.db', prefix_length=4, fasta='sample2.fasta')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
811 print(s)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
812
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
813 print("creating blast database")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
814 s.makeblastdb()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
815
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
816 print("creating chunks from fasta file: ")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
817 file_list = s.make_chunks("fasta_chunks", chunk_size=500)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
818 print(file_list)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
819
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
820 print('creating hitsort')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
821 s.create_hitsort(file_list, "hitsort")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
822
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
823 print("saving to fasta file")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
824 s.save2fasta('sampleX.fasta', keep=False)