annotate lib/graphtools.py @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
2 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
3 This module is mainly for large graph (e.i hitsort) storage, parsing and for clustering
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
4 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
5 import os
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
6 import sys
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
7 import sqlite3
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
8 import time
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
9 import subprocess
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
10 import logging
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
11 from collections import defaultdict
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
12 import collections
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
13 import operator
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
14 import math
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
15 import random
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
16 import itertools
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
17 import config
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
18 from lib import r2py
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
19 from lib.utils import FilePath
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
20 from lib.parallel.parallel import parallel2 as parallel
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
21 REQUIRED_VERSION = (3, 4)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
22 MAX_BUFFER_SIZE = 100000
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
23 if sys.version_info < REQUIRED_VERSION:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
24 raise Exception("\n\npython 3.4 or higher is required!\n")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
25 LOGGER = logging.getLogger(__name__)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
26
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
27
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
28 def dfs(start, graph):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
29 """
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
30 helper function for cluster merging.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
31 Does depth-first search, returning a set of all nodes seen.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
32 Takes: a graph in node --> [neighbors] form.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
33 """
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
34 visited, worklist = set(), [start]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
35
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
36 while worklist:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
37 node = worklist.pop()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
38 if node not in visited:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
39 visited.add(node)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
40 # Add all the neighbors to the worklist.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
41 worklist.extend(graph[node])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
42
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
43 return visited
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
44
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
45
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
46 def graph_components(edges):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
47 """
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
48 Given a graph as a list of edges, divide the nodes into components.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
49 Takes a list of pairs of nodes, where the nodes are integers.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
50 """
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
51
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
52 # Construct a graph (mapping node --> [neighbors]) from the edges.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
53 graph = defaultdict(list)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
54 nodes = set()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
55
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
56 for v1, v2 in edges:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
57 nodes.add(v1)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
58 nodes.add(v2)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
59
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
60 graph[v1].append(v2)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
61 graph[v2].append(v1)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
62
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
63 # Traverse the graph to find the components.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
64 components = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
65
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
66 # We don't care what order we see the nodes in.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
67 while nodes:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
68 component = dfs(nodes.pop(), graph)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
69 components.append(component)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
70
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
71 # Remove this component from the nodes under consideration.
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
72 nodes -= component
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
73
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
74 return components
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
75
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
76
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
77 class Graph():
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
78 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
79 create Graph object stored in sqlite database, either in memory or on disk
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
80 structure of table is:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
81 V1 V2 weigth12
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
82 V2 V3 weight23
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
83 V4 V5 weight45
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
84 ...
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
85 ...
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
86 !! this is undirected simple graph - duplicated edges must
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
87 be removed on graph creation
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
88
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
89 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
90 # seed for random number generator - this is necessary for reproducibility between runs
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
91 seed = '123'
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
92
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
93 def __init__(self,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
94 source=None,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
95 filename=None,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
96 new=False,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
97 paired=True,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
98 seqids=None):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
99 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
100 filename : fite where to store database, if not defined it is stored in memory
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
101 source : ncol file from which describe graph
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
102 new : if false and source is not define graph can be loaded from database (filename)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
103
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
104 vertices_name must be in correcti order!!!
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
105 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
106
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
107 self.filename = filename
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
108 self.source = source
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
109 self.paired = paired
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
110 # path to indexed graph - will be set later
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
111 self.indexed_file = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
112 self._cluster_list = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
113 # these two attributes are set after clustering
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
114 # communities before merging
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
115 self.graph_2_community0 = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
116 # communities after merging
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
117 self.graph_2_community = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
118 self.number_of_clusters = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
119 self.binary_file = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
120 self.cluster_sizes = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
121 self.graph_tree = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
122 self.graph_tree_log = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
123 self.weights_file = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
124
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
125 if filename:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
126 if os.path.isfile(filename) and (new or source):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
127 os.remove(filename)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
128 self.conn = sqlite3.connect(filename)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
129 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
130 self.conn = sqlite3.connect(":memory:")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
131 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
132
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
133 c.execute("PRAGMA page_size=8192")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
134 c.execute("PRAGMA cache_size = 2000000 ") # this helps
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
135
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
136 try:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
137 c.execute((
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
138 "create table graph (v1 integer, v2 integer, weight integer, "
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
139 "pair integer, v1length integer, v1start integer, v1end integer, "
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
140 "v2length integer, v2start integer, v2end integer, pid integer,"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
141 "evalue real, strand text )"))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
142 except sqlite3.OperationalError:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
143 pass # table already exist
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
144 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
145 c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
146 "create table vertices (vertexname text primary key, vertexindex integer)")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
147 tables = sorted(c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
148 "SELECT name FROM sqlite_master WHERE type='table'").fetchall())
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
149
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
150 if not [('graph', ), ('vertices', )] == tables:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
151 raise Exception("tables for sqlite for graph are not correct")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
152
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
153 if source:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
154 self._read_from_hitsort()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
155
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
156 if paired and seqids:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
157 # vertices must be defined - create graph of paired reads:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
158 # last character must disinguish pair
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
159 c.execute((
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
160 "create table pairs (basename, vertexname1, vertexname2,"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
161 "v1 integer, v2 integer, cluster1 integer, cluster2 integer)"))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
162 buffer = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
163 for i, k in zip(seqids[0::2], seqids[1::2]):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
164 assert i[:-1] == k[:-1], "problem with pair reads ids"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
165 # some vertices are not in graph - singletons
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
166 try:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
167 index1 = self.vertices[i]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
168 except KeyError:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
169 index1 = -1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
170
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
171 try:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
172 index2 = self.vertices[k]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
173 except KeyError:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
174 index2 = -1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
175
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
176 buffer.append((i[:-1], i, k, index1, index2))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
177
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
178 self.conn.executemany(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
179 "insert into pairs (basename, vertexname1, vertexname2, v1, v2) values (?,?,?,?,?)",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
180 buffer)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
181 self.conn.commit()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
182
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
183 def _read_from_hitsort(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
184
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
185 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
186 c.execute("delete from graph")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
187 buffer = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
188 vertices = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
189 counter = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
190 v_count = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
191 with open(self.source, 'r') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
192 for i in f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
193 edge_index = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
194 items = i.split()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
195 # get or insert vertex index
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
196 for vn in items[0:2]:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
197 if vn not in vertices:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
198 vertices[vn] = v_count
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
199 edge_index[vn] = v_count
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
200 v_count += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
201 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
202 edge_index[vn] = vertices[vn]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
203 if self.paired:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
204 pair = int(items[0][:-1] == items[1][:-1])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
205 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
206 pair = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
207 buffer.append(((edge_index[items[0]], edge_index[items[1]],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
208 items[2], pair) + tuple(items[3:])))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
209 if len(buffer) == MAX_BUFFER_SIZE:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
210 counter += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
211 self.conn.executemany(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
212 "insert or ignore into graph values (?,?,?,?,?,?,?,?,?,?,?,?,?)",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
213 buffer)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
214 buffer = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
215 if buffer:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
216 self.conn.executemany(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
217 "insert or ignore into graph values (?,?,?,?,?,?,?,?,?,?,?,?,?)",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
218 buffer)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
219
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
220 self.conn.commit()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
221 self.vertices = vertices
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
222 self.vertexid2name = {
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
223 vertex: index
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
224 for index, vertex in vertices.items()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
225 }
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
226 self.vcount = len(vertices)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
227 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
228 c.execute("select count(*) from graph")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
229 self.ecount = c.fetchone()[0]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
230 # fill table of vertices
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
231 self.conn.executemany("insert into vertices values (?,?)",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
232 vertices.items())
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
233 self.conn.commit()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
234
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
235 def save_indexed_graph(self, file=None):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
236 if not file:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
237 self.indexed_file = "{}.int".format(self.source)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
238 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
239 self.indexed_file = file
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
240 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
241 with open(self.indexed_file, 'w') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
242 out = c.execute('select v1,v2,weight from graph')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
243 for v1, v2, weight in out:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
244 f.write('{}\t{}\t{}\n'.format(v1, v2, weight))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
245
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
246 def get_subgraph(self, vertices):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
247 pass
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
248
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
249 def _levels(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
250 with open(self.graph_tree_log, 'r') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
251 levels = -1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
252 for i in f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
253 if i[:5] == 'level':
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
254 levels += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
255 return levels
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
256
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
257 def _reindex_community(self, id2com):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
258 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
259 reindex community and superclusters so that biggest cluster is no.1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
260 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
261 self.conn.commit()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
262 _, community, supercluster = zip(*id2com)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
263 (cluster_index, frq, self.cluster_sizes,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
264 self.number_of_clusters) = self._get_index_and_frequency(community)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
265
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
266 supercluster_index, sc_frq, _, _ = self._get_index_and_frequency(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
267 supercluster)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
268 id2com_reindexed = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
269
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
270 for i, _ in enumerate(id2com):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
271 id2com_reindexed.append((id2com[i][0], id2com[i][1], frq[
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
272 i], cluster_index[i], supercluster_index[i], sc_frq[i]))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
273 return id2com_reindexed
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
274
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
275 @staticmethod
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
276 def _get_index_and_frequency(membership):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
277 frequency_table = collections.Counter(membership)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
278 frequency_table_sorted = sorted(frequency_table.items(),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
279 key=operator.itemgetter(1),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
280 reverse=True)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
281 frq = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
282 for i in membership:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
283 frq.append(frequency_table[i])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
284 rank = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
285 index = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
286 for comm, _ in frequency_table_sorted:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
287 index += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
288 rank[comm] = index
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
289 cluster_index = [rank[i] for i in membership]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
290 cluster_sizes = [i[1] for i in frequency_table_sorted]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
291 number_of_clusters = len(frequency_table)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
292 return [cluster_index, frq, cluster_sizes, number_of_clusters]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
293
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
294 def louvain_clustering(self, merge_threshold=0, cleanup=False):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
295 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
296 input - graph
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
297 output - list of clusters
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
298 executables path ??
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
299 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
300 LOGGER.info("converting hitsort to binary format")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
301 self.binary_file = "{}.bin".format(self.indexed_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
302 self.weights_file = "{}.weight".format(self.indexed_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
303 self.graph_tree = "{}.graph_tree".format(self.indexed_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
304 self.graph_tree_log = "{}.graph_tree_log".format(self.indexed_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
305 self.graph_2_community0 = "{}.graph_2_community0".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
306 self.indexed_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
307 self._cluster_list = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
308 self.graph_2_community = "{}.graph_2_community".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
309 self.indexed_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
310 print(["louvain_convert", "-i", self.indexed_file, "-o",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
311 self.binary_file, "-w", self.weights_file])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
312 subprocess.check_call(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
313 ["louvain_convert", "-i", self.indexed_file, "-o",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
314 self.binary_file, "-w", self.weights_file],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
315 timeout=None)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
316
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
317 gt = open(self.graph_tree, 'w')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
318 gtl = open(self.graph_tree_log, 'w')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
319 LOGGER.info("running louvain clustering...")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
320 subprocess.check_call(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
321 ["louvain_community", self.binary_file, "-l", "-1", "-w",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
322 self.weights_file, "-v ", "-s", self.seed],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
323 stdout=gt,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
324 stderr=gtl,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
325 timeout=None)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
326 gt.close()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
327 gtl.close()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
328
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
329 LOGGER.info("creating list of cummunities")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
330 gt2c = open(self.graph_2_community0, 'w')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
331 subprocess.check_call(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
332 ['louvain_hierarchy', self.graph_tree, "-l", str(self._levels())],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
333 stdout=gt2c)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
334 gt2c.close()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
335 if merge_threshold and self.paired:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
336 com2newcom = self.find_superclusters(merge_threshold)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
337 elif self.paired:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
338 com2newcom = self.find_superclusters(config.SUPERCLUSTER_THRESHOLD)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
339 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
340 com2newcom = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
341 # merging of clusters, creatting superclusters
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
342 LOGGER.info("mergings clusters based on mate-pairs ")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
343 # modify self.graph_2_community file
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
344 # rewrite graph2community
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
345 with open(self.graph_2_community0, 'r') as fin:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
346 with open(self.graph_2_community, 'w') as fout:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
347 for i in fin:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
348 # write graph 2 community file in format:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
349 # id communityid supeclusterid
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
350 # if merging - community and superclustwers are identical
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
351 vi, com = i.split()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
352 if merge_threshold:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
353 ## mergin
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
354 if int(com) in com2newcom:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
355 fout.write("{} {} {}\n".format(vi, com2newcom[int(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
356 com)], com2newcom[int(com)]))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
357 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
358 fout.write("{} {} {}\n".format(vi, com, com))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
359 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
360 ## superclusters
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
361 if int(com) in com2newcom:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
362 fout.write("{} {} {}\n".format(vi, com, com2newcom[
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
363 int(com)]))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
364 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
365 fout.write("{} {} {}\n".format(vi, com, com))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
366
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
367 LOGGER.info("loading communities into database")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
368 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
369 c.execute(("create table communities (vertexindex integer primary key,"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
370 "community integer, size integer, cluster integer, "
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
371 "supercluster integer, supercluster_size integer)"))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
372 id2com = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
373 with open(self.graph_2_community, 'r') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
374 for i in f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
375 name, com, supercluster = i.split()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
376 id2com.append((name, com, supercluster))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
377 id2com_reindexed = self._reindex_community(id2com)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
378 c.executemany("insert into communities values (?,?,?,?,?,?)",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
379 id2com_reindexed)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
380 #create table of superclusters - clusters
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
381 c.execute(("create table superclusters as "
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
382 "select distinct supercluster, supercluster_size, "
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
383 "cluster, size from communities;"))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
384 # create view id-index-cluster
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
385 c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
386 ("CREATE VIEW vertex_cluster AS SELECT vertices.vertexname,"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
387 "vertices.vertexindex, communities.cluster, communities.size"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
388 " FROM vertices JOIN communities USING (vertexindex)"))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
389 self.conn.commit()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
390
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
391 # add clustering infor to graph
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
392 LOGGER.info("updating graph table")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
393 t0 = time.time()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
394
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
395 c.execute("alter table graph add c1 integer")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
396 c.execute("alter table graph add c2 integer")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
397 c.execute(("update graph set c1 = (select cluster FROM communities "
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
398 "where communities.vertexindex=graph.v1)"))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
399 c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
400 ("update graph set c2 = (select cluster FROM communities where "
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
401 "communities.vertexindex=graph.v2)"))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
402 self.conn.commit()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
403 t1 = time.time()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
404 LOGGER.info("updating graph table - done in {} seconds".format(t1 -
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
405 t0))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
406
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
407 # identify similarity connections between clusters
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
408 c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
409 "create table cluster_connections as SELECT c1,c2 , count(*) FROM (SELECT c1, c2 FROM graph WHERE c1>c2 UNION ALL SELECT c2 as c1, c1 as c2 FROM graph WHERE c2>c1) GROUP BY c1, c2")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
410 # TODO - remove directionality - summarize -
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
411
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
412 # add cluster identity to pairs table
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
413
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
414 if self.paired:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
415 LOGGER.info("analyzing pairs ")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
416 t0 = time.time()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
417 c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
418 "UPDATE pairs SET cluster1=(SELECT cluster FROM communities WHERE communities.vertexindex=pairs.v1)")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
419 t1 = time.time()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
420 LOGGER.info(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
421 "updating pairs table - cluster1 - done in {} seconds".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
422 t1 - t0))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
423
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
424 t0 = time.time()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
425 c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
426 "UPDATE pairs SET cluster2=(SELECT cluster FROM communities WHERE communities.vertexindex=pairs.v2)")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
427 t1 = time.time()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
428 LOGGER.info(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
429 "updating pairs table - cluster2 - done in {} seconds".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
430 t1 - t0))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
431 # reorder records
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
432
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
433 t0 = time.time()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
434 c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
435 "UPDATE pairs SET cluster1=cluster2, cluster2=cluster1, vertexname1=vertexname2,vertexname2=vertexname1 where cluster1<cluster2")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
436 t1 = time.time()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
437 LOGGER.info("sorting - done in {} seconds".format(t1 - t0))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
438
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
439 t0 = time.time()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
440 c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
441 "create table cluster_mate_connections as select cluster1 as c1, cluster2 as c2, count(*) as N, group_concat(basename) as ids from pairs where cluster1!=cluster2 group by cluster1, cluster2;")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
442 t1 = time.time()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
443 LOGGER.info(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
444 "creating cluster_mate_connections table - done in {} seconds".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
445 t1 - t0))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
446 # summarize
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
447 t0 = time.time()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
448 self._calculate_pair_bond()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
449 t1 = time.time()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
450 LOGGER.info(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
451 "calculating cluster pair bond - done in {} seconds".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
452 t1 - t0))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
453 t0 = time.time()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
454 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
455 # not paired - create empty tables
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
456 self._add_empty_tables()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
457
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
458 self.conn.commit()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
459 t1 = time.time()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
460 LOGGER.info("commiting changes - done in {} seconds".format(t1 - t0))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
461
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
462 if cleanup:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
463 LOGGER.info("cleaning clustering temp files")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
464 os.unlink(self.binary_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
465 os.unlink(self.weights_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
466 os.unlink(self.graph_tree)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
467 os.unlink(self.graph_tree_log)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
468 os.unlink(self.graph_2_community0)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
469 os.unlink(self.graph_2_community)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
470 os.unlink(self.indexed_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
471 self.binary_file = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
472 self.weights_file = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
473 self.graph_tree = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
474 self.graph_tree_log = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
475 self.graph_2_community0 = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
476 self.graph_2_community = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
477 self.indexed_file = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
478
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
479 # calcultate k
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
480
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
481 def find_superclusters(self, merge_threshold):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
482 '''Find superclusters from clustering based on paired reads '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
483 clsdict = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
484 with open(self.graph_2_community0, 'r') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
485 for i in f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
486 vi, com = i.split()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
487 if com in clsdict:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
488 clsdict[com] += [self.vertexid2name[int(vi)][0:-1]]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
489 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
490 clsdict[com] = [self.vertexid2name[int(vi)][0:-1]]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
491 # remove all small clusters - these will not be merged:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
492 small_cls = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
493 for i in clsdict:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
494 if len(clsdict[i]) < config.MINIMUM_NUMBER_OF_READS_FOR_MERGING:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
495 small_cls.append(i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
496 for i in small_cls:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
497 del clsdict[i]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
498 pairs = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
499 for i, j in itertools.combinations(clsdict, 2):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
500 s1 = set(clsdict[i])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
501 s2 = set(clsdict[j])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
502 wgh = len(s1 & s2)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
503 if wgh < config.MINIMUM_NUMBER_OF_SHARED_PAIRS_FOR_MERGING:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
504 continue
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
505 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
506 n1 = len(s1) * 2 - len(clsdict[i])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
507 n2 = len(s2) * 2 - len(clsdict[j])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
508 k = 2 * wgh / (n1 + n2)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
509 if k > merge_threshold:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
510 pairs.append((int(i), int(j)))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
511 # find connected commponents - will be merged
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
512 cls2merge = graph_components(pairs)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
513 com2newcom = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
514 for i in cls2merge:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
515 newcom = min(i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
516 for j in i:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
517 com2newcom[j] = newcom
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
518 return com2newcom
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
519
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
520 def adjust_cluster_size(self, proportion_kept, ids_kept):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
521 LOGGER.info("adjusting cluster sizes")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
522 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
523 c.execute("ALTER TABLE superclusters ADD COLUMN size_uncorrected INTEGER")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
524 c.execute("UPDATE superclusters SET size_uncorrected=size")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
525 if ids_kept:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
526 ids_kept_set = set(ids_kept)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
527 ratio = (1 - proportion_kept)/proportion_kept
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
528 for cl, size in c.execute("SELECT cluster,size FROM superclusters"):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
529 ids = self.get_cluster_reads(cl)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
530 ovl_size = len(ids_kept_set.intersection(ids))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
531 size_adjusted = int(len(ids) + ovl_size * ratio)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
532 if size_adjusted > size:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
533 c.execute("UPDATE superclusters SET size=? WHERE cluster=?",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
534 (size_adjusted, cl))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
535 self.conn.commit()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
536 LOGGER.info("adjusting cluster sizes - done")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
537
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
538 def export_cls(self, path):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
539 with open(path, 'w') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
540 for i in range(1, self.number_of_clusters + 1):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
541 ids = self.get_cluster_reads(i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
542 f.write(">CL{}\t{}\n".format(i, len(ids)))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
543 f.write("\t".join(ids))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
544 f.write("\n")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
545
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
546 def _calculate_pair_bond(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
547 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
548 out = c.execute("select c1, c2, ids from cluster_mate_connections")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
549 buffer = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
550 for c1, c2, ids in out:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
551 w = len(set(ids.split(",")))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
552 n1 = len(set([i[:-1] for i in self.get_cluster_reads(c1)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
553 ])) * 2 - len(self.get_cluster_reads(c1))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
554 n2 = len(set([i[:-1] for i in self.get_cluster_reads(c2)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
555 ])) * 2 - len(self.get_cluster_reads(c2))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
556 buffer.append((c1, c2, n1, n2, w, 2 * w / (n1 + n2)))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
557 c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
558 "CREATE TABLE cluster_mate_bond (c1 INTEGER, c2 INTEGER, n1 INTEGER, n2 INTEGER, w INTEGER, k FLOAT)")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
559 c.executemany(" INSERT INTO cluster_mate_bond values (?,?,?,?,?,?)",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
560 buffer)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
561
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
562 def _add_empty_tables(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
563 '''This is used with reads that are not paired
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
564 - it creates empty mate tables, this is necessary for
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
565 subsequent reporting to work corectly '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
566 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
567 c.execute(("CREATE TABLE cluster_mate_bond (c1 INTEGER, c2 INTEGER, "
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
568 "n1 INTEGER, n2 INTEGER, w INTEGER, k FLOAT)"))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
569 c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
570 "CREATE TABLE cluster_mate_connections (c1 INTEGER, c2 INTEGER, N INTEGER, ids TEXT) ")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
571
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
572 def get_cluster_supercluster(self, cluster):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
573 '''Get supercluster id for suplied cluster '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
574 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
575 out = c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
576 'SELECT supercluster FROM communities WHERE cluster="{0}" LIMIT 1'.format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
577 cluster))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
578 sc = out.fetchone()[0]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
579 return sc
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
580
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
581 def get_cluster_reads(self, cluster):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
582
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
583 if self._cluster_list:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
584 return self._cluster_list[str(cluster)]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
585 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
586 # if queried first time
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
587 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
588 out = c.execute("select cluster, vertexname from vertex_cluster")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
589 cluster_list = collections.defaultdict(list)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
590 for clusterindex, vertexname in out:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
591 cluster_list[str(clusterindex)].append(vertexname)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
592 self._cluster_list = cluster_list
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
593 return self._cluster_list[str(cluster)]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
594
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
595
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
596 def extract_cluster_blast(self, path, index, ids=None):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
597 ''' Extract blast for cluster and save it to path
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
598 return number of blast lines ( i.e. number of graph edges E)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
599 if ids is specified , only subset of blast is used'''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
600 c = self.conn.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
601 if ids:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
602 vertexindex = (
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
603 "select vertexindex from vertices "
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
604 "where vertexname in ({})").format('"' + '","'.join(ids) + '"')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
605
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
606 out = c.execute(("select * from graph where c1={0} and c2={0}"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
607 " and v1 in ({1}) and v2 in ({1})").format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
608 index, vertexindex))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
609 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
610 out = c.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
611 "select * from graph where c1={0} and c2={0}".format(index))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
612 E = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
613 N = len(self.get_cluster_reads(index))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
614 with open(path, 'w') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
615 for i in out:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
616 print(self.vertexid2name[i[0]],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
617 self.vertexid2name[
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
618 i[1]],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
619 i[2],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
620 *i[4:13],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
621 sep='\t',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
622 file=f)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
623 E += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
624 return E
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
625
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
626 def export_clusters_files_multiple(self,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
627 min_size,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
628 directory,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
629 sequences=None,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
630 tRNA_database_path=None,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
631 satellite_model_path=None):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
632 def load_fun(N, E):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
633 ''' estimate mem usage from graph size and density'''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
634 NE = math.log(float(N) * float(E), 10)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
635 if NE > 11.5:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
636 return 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
637 if NE > 11:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
638 return 0.9
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
639 if NE > 10:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
640 return 0.4
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
641 if NE > 9:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
642 return 0.2
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
643 if NE > 8:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
644 return 0.07
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
645 return 0.02
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
646
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
647 def estimate_sample_size(NV, NE, maxv, maxe):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
648 ''' estimat suitable sampling based on the graph density
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
649 NV,NE is |V| and |E| of the graph
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
650 maxv, maxe are maximal |V| and |E|'''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
651
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
652 d = (2 * NE) / (NV * (NV - 1))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
653 eEst = (maxv * (maxv - 1) * d) / 2
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
654 nEst = (d + math.sqrt(d**2 + 8 * d * maxe)) / (2 * d)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
655 if eEst >= maxe:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
656 N = int(nEst)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
657 if nEst >= maxv:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
658 N = int(maxv)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
659 return N
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
660
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
661 clusterindex = 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
662 cluster_input_args = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
663 ppn = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
664 # is is comparative analysis?
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
665 if sequences.prefix_length:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
666 self.conn.execute("CREATE TABLE comparative_counts (clusterindex INTEGER,"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
667 + ", ".join(["[{}] INTEGER".format(i) for i in sequences.prefix_codes.keys()]) + ")")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
668 # do for comparative analysis
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
669
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
670 for cl in range(self.number_of_clusters):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
671 prefix_codes = dict((key, 0) for key in sequences.prefix_codes.keys())
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
672 for i in self.get_cluster_reads(cl):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
673 prefix_codes[i[0:sequences.prefix_length]] += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
674 header = ", ".join(["[" + str(i) + "]" for i in prefix_codes.keys()])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
675 values = ", ".join([str(i) for i in prefix_codes.values()])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
676 self.conn.execute(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
677 "INSERT INTO comparative_counts (clusterindex, {}) VALUES ({}, {})".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
678 header, cl, values))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
679 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
680 prefix_codes = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
681
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
682 while True:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
683 read_names = self.get_cluster_reads(clusterindex)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
684 supercluster = self.get_cluster_supercluster(clusterindex)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
685 N = len(read_names)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
686 print("sequences.ids_kept -2 ")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
687 print(sequences.ids_kept)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
688 if sequences.ids_kept:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
689 N_adjusted = round(len(set(sequences.ids_kept).intersection(read_names)) *
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
690 ((1 - config.FILTER_PROPORTION_OF_KEPT) /
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
691 config.FILTER_PROPORTION_OF_KEPT) + N)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
692 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
693 N_adjusted = N
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
694 if N < min_size:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
695 break
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
696 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
697 LOGGER.info("exporting cluster {}".format(clusterindex))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
698 blast_file = "{dir}/dir_CL{i:04}/hitsort_part.csv".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
699 dir=directory, i=clusterindex)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
700 cluster_dir = "{dir}/dir_CL{i:04}".format(dir=directory,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
701 i=clusterindex)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
702 fasta_file = "{dir}/reads_selection.fasta".format(dir=cluster_dir)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
703 fasta_file_full = "{dir}/reads.fasta".format(dir=cluster_dir)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
704
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
705 os.makedirs(os.path.dirname(blast_file), exist_ok=True)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
706 E = self.extract_cluster_blast(index=clusterindex,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
707 path=blast_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
708 # check if blast must be sampled
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
709 n_sample = estimate_sample_size(NV=N,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
710 NE=E,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
711 maxv=config.CLUSTER_VMAX,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
712 maxe=config.CLUSTER_EMAX)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
713 LOGGER.info("directories created..")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
714 if n_sample < N:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
715 LOGGER.info(("cluster is too large - sampling.."
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
716 "original size: {N}\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
717 "sample size: {NS}\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
718 "").format(N=N, NS=n_sample))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
719 random.seed(self.seed)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
720 read_names_sample = random.sample(read_names, n_sample)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
721 LOGGER.info("reads id sampled...")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
722 blast_file_sample = "{dir}/dir_CL{i:04}/blast_sample.csv".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
723 dir=directory, i=clusterindex)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
724 E_sample = self.extract_cluster_blast(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
725 index=clusterindex,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
726 path=blast_file,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
727 ids=read_names_sample)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
728 LOGGER.info("numner of edges in sample: {}".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
729 E_sample))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
730 sequences.save2fasta(fasta_file, subset=read_names_sample)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
731 sequences.save2fasta(fasta_file_full, subset=read_names)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
732
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
733 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
734 read_names_sample = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
735 E_sample = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
736 blast_file_sample = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
737 n_sample = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
738 sequences.save2fasta(fasta_file_full, subset=read_names)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
739 ## TODO - use symlink instead of :
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
740 sequences.save2fasta(fasta_file, subset=read_names)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
741 # export individual annotations tables:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
742 # annotation is always for full cluster
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
743 LOGGER.info("exporting cluster annotation")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
744 annotations = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
745 annotations_custom = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
746 for n in sequences.annotations:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
747 print("sequences.annotations:", n)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
748 if n.find("custom_db") == 0:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
749 print("custom")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
750 annotations_custom[n] = sequences.save_annotation(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
751 annotation_name=n,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
752 subset=read_names,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
753 dir=cluster_dir)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
754 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
755 print("built in")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
756 annotations[n] = sequences.save_annotation(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
757 annotation_name=n,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
758 subset=read_names,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
759 dir=cluster_dir)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
760
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
761 cluster_input_args.append([
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
762 n_sample, N,N_adjusted, blast_file, fasta_file, fasta_file_full,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
763 clusterindex, supercluster, self.paired,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
764 tRNA_database_path, satellite_model_path, sequences.prefix_codes,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
765 prefix_codes, annotations, annotations_custom
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
766 ])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
767 clusterindex += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
768 ppn.append(load_fun(N, E))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
769
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
770
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
771
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
772 self.conn.commit()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
773
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
774 # run in parallel:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
775 # reorder jobs based on the ppn:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
776 cluster_input_args = [
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
777 x
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
778 for (y, x) in sorted(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
779 zip(ppn, cluster_input_args),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
780 key=lambda pair: pair[0],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
781 reverse=True)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
782 ]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
783 ppn = sorted(ppn, reverse=True)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
784 LOGGER.info("creating clusters in parallel")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
785 clusters_info = parallel(Cluster,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
786 *[list(i) for i in zip(*cluster_input_args)],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
787 ppn=ppn)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
788 # sort it back:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
789 clusters_info = sorted(clusters_info, key=lambda cl: cl.index)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
790 return clusters_info
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
791
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
792
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
793 class Cluster():
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
794 ''' store and show information about cluster properties '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
795
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
796 def __init__(self,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
797 size,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
798 size_real,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
799 size_adjusted,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
800 blast_file,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
801 fasta_file,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
802 fasta_file_full,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
803 index,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
804 supercluster,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
805 paired,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
806 tRNA_database_path,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
807 satellite_model_path,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
808 all_prefix_codes,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
809 prefix_codes,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
810 annotations,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
811 annotations_custom={},
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
812 loop_index_threshold=0.7,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
813 pair_completeness_threshold=0.40,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
814 loop_index_unpaired_threshold=0.85):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
815 if size:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
816 # cluster was scaled down
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
817 self.size = size
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
818 self.size_real = size_real
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
819 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
820 self.size = self.size_real = size_real
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
821 self.size_adjusted = size_adjusted
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
822 self.filtered = True if size_adjusted != size_real else False
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
823 self.all_prefix_codes = all_prefix_codes.keys
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
824 self.prefix_codes = prefix_codes
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
825 self.dir = FilePath(os.path.dirname(blast_file))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
826 self.blast_file = FilePath(blast_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
827 self.fasta_file = FilePath(fasta_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
828 self.fasta_file_full = FilePath(fasta_file_full)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
829 self.index = index
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
830 self.assembly_files = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
831 self.ltr_detection = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
832 self.supercluster = supercluster
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
833 self.annotations_files = annotations
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
834 self.annotations_files_custom = annotations_custom
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
835 self.annotations_summary, self.annotations_table = self._summarize_annotations(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
836 annotations, size_real)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
837 # add annotation
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
838 if len(annotations_custom):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
839 self.annotations_summary_custom, self.annotations_custom_table = self._summarize_annotations(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
840 annotations_custom, size_real)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
841 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
842 self.annotations_summary_custom, self.annotations_custom_table = "", ""
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
843
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
844 self.paired = paired
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
845 self.graph_file = FilePath("{0}/graph_layout.GL".format(self.dir))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
846 self.directed_graph_file = FilePath(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
847 "{0}/graph_layout_directed.RData".format(self.dir))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
848 self.fasta_oriented_file = FilePath("{0}/reads_selection_oriented.fasta".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
849 self.dir))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
850 self.image_file = FilePath("{0}/graph_layout.png".format(self.dir))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
851 self.image_file_tmb = FilePath("{0}/graph_layout_tmb.png".format(self.dir))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
852 self.html_report_main = FilePath("{0}/index.html".format(self.dir))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
853 self.html_report_files = FilePath("{0}/html_files".format(self.dir))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
854 self.supercluster_best_hit = "NA"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
855 TAREAN = r2py.R(config.RSOURCE_tarean)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
856 LOGGER.info("creating graph no.{}".format(self.index))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
857 # if FileType muast be converted to str for rfunctions
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
858 graph_info = eval(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
859 TAREAN.mgblast2graph(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
860 self.blast_file,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
861 seqfile=self.fasta_file,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
862 seqfile_full=self.fasta_file_full,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
863 graph_destination=self.graph_file,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
864 directed_graph_destination=self.directed_graph_file,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
865 oriented_sequences=self.fasta_oriented_file,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
866 image_file=self.image_file,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
867 image_file_tmb=self.image_file_tmb,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
868 repex=True,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
869 paired=self.paired,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
870 satellite_model_path=satellite_model_path,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
871 maxv=config.CLUSTER_VMAX,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
872 maxe=config.CLUSTER_EMAX)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
873 )
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
874 print(graph_info)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
875 self.ecount = graph_info['ecount']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
876 self.vcount = graph_info['vcount']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
877 self.loop_index = graph_info['loop_index']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
878 self.pair_completeness = graph_info['pair_completeness']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
879 self.orientation_score = graph_info['escore']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
880 self.satellite_probability = graph_info['satellite_probability']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
881 self.satellite = graph_info['satellite']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
882 # for paired reads:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
883 cond1 = (self.paired and self.loop_index > loop_index_threshold and
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
884 self.pair_completeness > pair_completeness_threshold)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
885 # no pairs
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
886 cond2 = ((not self.paired) and
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
887 self.loop_index > loop_index_unpaired_threshold)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
888 if (cond1 or cond2) and config.ARGS.options.name != "oxford_nanopore":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
889 self.putative_tandem = True
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
890 self.dir_tarean = FilePath("{}/tarean".format(self.dir))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
891 lock_file = self.dir + "../lock"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
892 out = eval(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
893 TAREAN.tarean(input_sequences=self.fasta_oriented_file,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
894 output_dir=self.dir_tarean,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
895 CPU=1,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
896 reorient_reads=False,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
897 tRNA_database_path=tRNA_database_path,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
898 lock_file=lock_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
899 )
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
900 self.html_tarean = FilePath(out['htmlfile'])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
901 self.tarean_contig_file = out['tarean_contig_file']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
902 self.TR_score = out['TR_score']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
903 self.TR_monomer_length = out['TR_monomer_length']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
904 self.TR_consensus = out['TR_consensus']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
905 self.pbs_score = out['pbs_score']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
906 self.max_ORF_length = out['orf_l']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
907 if (out['orf_l'] > config.ORF_THRESHOLD or
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
908 out['pbs_score'] > config.PBS_THRESHOLD):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
909 self.tandem_rank = 3
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
910 elif self.satellite:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
911 self.tandem_rank = 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
912 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
913 self.tandem_rank = 2
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
914 # some tandems could be rDNA genes - this must be check
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
915 # by annotation
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
916 if self.annotations_table:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
917 rdna_score = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
918 contamination_score = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
919 for i in self.annotations_table:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
920 if 'rDNA/' in i[0]:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
921 rdna_score += i[1]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
922 if 'contamination' in i[0]:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
923 contamination_score += i[1]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
924 if rdna_score > config.RDNA_THRESHOLD:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
925 self.tandem_rank = 4
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
926 if contamination_score > config.CONTAMINATION_THRESHOLD:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
927 self.tandem_rank = 0 # other
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
928
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
929 # by custom annotation - castom annotation has preference
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
930 if self.annotations_custom_table:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
931 print("custom table searching")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
932 rdna_score = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
933 contamination_score = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
934 print(self.annotations_custom_table)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
935 for i in self.annotations_custom_table:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
936 if 'rDNA' in i[0]:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
937 rdna_score += i[1]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
938 if 'contamination' in i[0]:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
939 contamination_score += i[1]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
940 if rdna_score > 0:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
941 self.tandem_rank = 4
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
942 if contamination_score > config.CONTAMINATION_THRESHOLD:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
943 self.tandem_rank = 0 # other
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
944
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
945 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
946 self.putative_tandem = False
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
947 self.dir_tarean = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
948 self.html_tarean = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
949 self.TR_score = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
950 self.TR_monomer_length = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
951 self.TR_consensus = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
952 self.pbs_score = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
953 self.max_ORF_length = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
954 self.tandem_rank = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
955 self.tarean_contig_file = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
956
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
957 def __str__(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
958 out = [
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
959 "cluster no {}:".format(self.index),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
960 "Number of vertices : {}".format(self.size),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
961 "Number of edges : {}".format(self.ecount),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
962 "Loop index : {}".format(self.loop_index),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
963 "Pair completeness : {}".format(self.pair_completeness),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
964 "Orientation score : {}".format(self.orientation_score)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
965 ]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
966 return "\n".join(out)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
967
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
968 def listing(self, asdict=True):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
969 ''' convert attributes to dictionary for printing purposes'''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
970 out = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
971 for i in dir(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
972 # do not show private
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
973 if i[:2] != "__":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
974 value = getattr(self, i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
975 if not callable(value):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
976 # for dictionary
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
977 if isinstance(value, dict):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
978 for k in value:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
979 out[i + "_" + k] = value[k]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
980 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
981 out[i] = value
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
982 if asdict:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
983 return out
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
984 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
985 return {'keys': list(out.keys()), 'values': list(out.values())}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
986
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
987 def detect_ltr(self, trna_database):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
988 '''detection of ltr in assembly files, output of analysis is stored in file'''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
989 CREATE_ANNOTATION = r2py.R(config.RSOURCE_create_annotation, verbose=False)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
990 if self.assembly_files['{}.{}.ace']:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
991 ace_file = self.assembly_files['{}.{}.ace']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
992 print(ace_file, "running LTR detection")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
993 fout = "{}/{}".format(self.dir, config.LTR_DETECTION_FILES['BASE'])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
994 subprocess.check_call([
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
995 config.LTR_DETECTION,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
996 '-i', ace_file,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
997 '-o', fout,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
998 '-p', trna_database])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
999 # evaluate LTR presence
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1000 fn = "{}/{}".format(self.dir, config.LTR_DETECTION_FILES['PBS_BLAST'])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1001 self.ltr_detection = CREATE_ANNOTATION.evaluate_LTR_detection(fn)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1002
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1003
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1004 @staticmethod
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1005 def _summarize_annotations(annotations_files, size):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1006 ''' will tabulate annotation results '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1007 # TODO
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1008 summaries = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1009 # weight is in percentage
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1010 weight = 100 / size
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1011 for i in annotations_files:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1012 with open(annotations_files[i]) as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1013 header = f.readline().split()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1014 id_index = [
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1015 i for i, item in enumerate(header) if item == "db_id"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1016 ][0]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1017 for line in f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1018 classification = line.split()[id_index].split("#")[1]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1019 if classification in summaries:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1020 summaries[classification] += weight
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1021 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1022 summaries[classification] = weight
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1023 # format summaries for printing
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1024 annotation_string = ""
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1025 annotation_table = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1026 for i in sorted(summaries.items(), key=lambda x: x[1], reverse=True):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1027 ## hits with smaller proportion are not shown!
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1028 if i[1] > 0.1:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1029 if i[1] > 1:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1030 annotation_string += "<b>{1:.2f}% {0}</b>\n".format(*i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1031 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1032 annotation_string += "{1:.2f}% {0}\n".format(*i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1033 annotation_table.append(i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1034 return [annotation_string, annotation_table]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1035
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1036 @staticmethod
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1037 def add_cluster_table_to_database(cluster_table, db_path):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1038 '''get column names from Cluster object and create
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1039 correspopnding table in database values from all
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1040 clusters are filled to database'''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1041 column_name_and_type = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1042 column_list = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1043
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1044 # get all atribute names -> they are column names
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1045 # in sqlite table, detect proper sqlite type
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1046 def identity(x):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1047 return (x)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1048
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1049 for i in cluster_table[1]:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1050 t = type(cluster_table[1][i])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1051 if t == int:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1052 sqltype = "integer"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1053 convert = identity
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1054 elif t == float:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1055 sqltype = "real"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1056 convert = identity
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1057 elif t == bool:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1058 sqltype = "boolean"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1059 convert = bool
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1060 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1061 sqltype = "text"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1062 convert = str
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1063 column_name_and_type += ["[{}] {}".format(i, sqltype)]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1064 column_list += [tuple((i, convert))]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1065 header = ", ".join(column_name_and_type)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1066 db = sqlite3.connect(db_path)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1067 c = db.cursor()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1068 print("CREATE TABLE cluster_info ({})".format(header))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1069 c.execute("CREATE TABLE cluster_info ({})".format(header))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1070 # file data to cluster_table
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1071 buffer = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1072 for i in cluster_table:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1073 buffer.append(tuple('{}'.format(fun(i[j])) for j, fun in
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1074 column_list))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1075 wildcards = ",".join(["?"] * len(column_list))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1076 print(buffer)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1077 c.executemany("insert into cluster_info values ({})".format(wildcards),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1078 buffer)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1079 db.commit()