annotate seqclust @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
2 ''' TAndem REpeat ANalyzer '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
3 import os
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
4 import sys
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
5 import shutil
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
6 import subprocess
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
7 import argparse
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
8 from argparse import RawTextHelpFormatter
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
9 import logging
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
10 import shlex
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
11 import multiprocessing
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
12 # config must be loaded before seqtools,...
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
13 import config
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
14 import re
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
15 from lib import seqtools, graphtools, utils, assembly_tools
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
16 from lib import r2py
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
17
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
18 REQUIRED_VERSION = (3, 4)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
19 if sys.version_info < REQUIRED_VERSION:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
20 raise Exception("\n\npython 3.4 or higher is required!\n")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
21
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
22 # append path to louvain clustering and other binaries
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
23 os.environ['PATH'] = "{}:{}:{}".format(config.BINARIES, config.LOUVAIN,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
24 os.environ['PATH'])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
25
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
26 LOGGER = logging.getLogger(__name__)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
27
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
28
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
29 def get_version(path, tarean_mode):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
30 try:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
31 branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
32 shell=True,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
33 cwd=path).decode('ascii').strip()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
34 shorthash = subprocess.check_output(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
35 "git log --pretty=format:'%h' -n 1 ",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
36 shell=True,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
37 cwd=path).decode('ascii').strip()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
38 revcount = len(subprocess.check_output(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
39 "git log --oneline", shell=True,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
40 cwd=path).decode('ascii').split())
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
41 try:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
42 tag = subprocess.check_output("git describe --tags --abbrev=0",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
43 cwd=path,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
44 shell=True).decode('ascii').strip()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
45 except subprocess.CalledProcessError:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
46 tag = " "
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
47
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
48 version_string = (
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
49 "-------------------------------------"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
50 "-------------------------------------\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
51 "PIPELINE VERSION : "
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
52 "{branch}-{tag}-{revcount}({shorthash})\n\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
53 "PROTEIN DATABASE VERSION : {PD}\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
54 " md5 checksum : {PDmd5}\n\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
55 "DNA DATABASE VERSION : {DD}\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
56 " md5 checksum : {DDmd5}\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
57 "-------------------------------------"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
58 "-------------------------------------\n").format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
59 branch=branch,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
60 shorthash=shorthash,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
61 revcount=revcount,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
62 tag=tag,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
63 PD=os.path.basename(config.PROTEIN_DATABASE),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
64 PDmd5=utils.md5checksum(config.PROTEIN_DATABASE + ".psq", fail_if_missing = not tarean_mode),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
65 DD=os.path.basename(config.DNA_DATABASE),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
66 DDmd5=utils.md5checksum(config.DNA_DATABASE + ".nsq"))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
67
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
68 except subprocess.CalledProcessError:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
69 version_string = "version of pipeline not available!"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
70 LOGGER.info(version_string)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
71 return version_string
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
72
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
73
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
74 def valid_database(database_file):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
75 with open(database_file, 'r', encoding='ascii') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
76 for i in f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
77 if i[0] == ">":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
78 if not re.match(">.+#.+/*", i):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
79 # TODO - make edits to correct fomating of custom database???
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
80 return False
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
81 return True
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
82
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
83
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
84 def add_databases(databases, custom_databases_dir, dbtype='nucl'):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
85 '''custom databases are copied to directory tree and blast
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
86 database is created using makeblastdb
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
87 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
88
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
89 databases_ok = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
90 print(databases)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
91 for db_path, db_name in databases:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
92 db_destination = "{}/{}".format(custom_databases_dir, db_name)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
93 shutil.copyfile(db_path, db_destination)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
94 if not valid_database(db_destination):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
95 raise ValueError((
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
96 "\n------------------------------------------------------------\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
97 "Custom database is not valid!\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
98 "Custom database of repeats are DNA sequences in fasta format.\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
99 "The required format for IDs in a custom library is : \n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
100 " '>reapeatname#class/subclass'\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
101 "Reformat the database and try again!\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
102 "-------------------------------------------------------------\n\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
103 ))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
104
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
105 cmd = "makeblastdb -in {0} -out {0} -dbtype {1}".format(db_destination,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
106 dbtype)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
107 print(cmd)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
108 args = shlex.split(cmd)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
109 print(args)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
110 if subprocess.check_call(args, stderr=sys.stdout):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
111 Warning("makeblastdb on {} failed".format(db_name))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
112 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
113 databases_ok.append([db_destination, "custom_db_" + db_name])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
114 if len(databases_ok) == 0:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
115 return None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
116 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
117 return databases_ok
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
118
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
119
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
120 def meminfo():
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
121 ''' detect physical memory and memory usage'''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
122 info = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
123 required_fields = [
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
124 'MemTotal:', 'MemFree:', 'Cached:', 'SwapCached:', 'Buffers:'
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
125 ]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
126 with open('/proc/meminfo', 'r') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
127 for i in f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
128 a = i.split()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
129 if a[0] in required_fields:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
130 info[a[0]] = int(a[1])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
131 return info
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
132
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
133
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
134 def dict2lists(d):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
135 ''' convert dict to nested list
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
136 use the funsction to pass dictionary to R function
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
137 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
138 values = list(d.values())
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
139 keys = list(d.keys())
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
140 return [values, keys]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
141
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
142
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
143 def show_object(obj):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
144 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
145 helper function for printing all public atributes,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
146 does not print callebme atributes e.i. methods..
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
147 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
148
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
149 s = "Configuration--------------->\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
150 for i in dir(obj):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
151 # do not show private
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
152 if i[:2] != "__":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
153 value = getattr(obj, i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
154 if not callable(value):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
155 s += "{} : {}\n".format(i, value)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
156 s += "<---------------configuration\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
157 return s
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
158
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
159
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
160 class DataInfo():
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
161 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
162 stores information state of clustering and data
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
163 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
164
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
165 def __init__(self, args, paths):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
166 LOGGER.info("getting information about input sequences")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
167 self.args = args
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
168 self.working_directory = args.output_dir
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
169 self.input_sequences = args.sequences.name
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
170 self.number_of_input_sequences = seqtools.SequenceSet.fasta_length(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
171 self.input_sequences)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
172 self.paired = args.paired
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
173 self.prefix_length = args.prefix_length
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
174 self.physical_memory = meminfo()['MemTotal:']
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
175 self.edges_max = config.EMAX
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
176 # set max memory
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
177 if args.max_memory:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
178 self.max_memory = args.max_memory
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
179 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
180 self.max_memory = meminfo()["MemTotal:"]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
181 # modify initial setup if number of sequences is low
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
182 if args.automatic_filtering:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
183 config.NUMBER_OF_SEQUENCES_FOR_PRERUN = config.NUMBER_OF_SEQUENCES_FOR_PRERUN_WITH_FILTERING
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
184
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
185 if self.number_of_input_sequences < config.NUMBER_OF_SEQUENCES_FOR_PRERUN:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
186 config.NUMBER_OF_SEQUENCES_FOR_PRERUN = self.number_of_input_sequences
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
187
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
188 # is number of input sequences sufficient
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
189 if self.number_of_input_sequences < config.MINIMUM_NUMBER_OF_INPUT_SEQUENCES:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
190 raise WrongInputDataError(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
191 "provide more sequences for clustering, minumum {} is .required".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
192 config.MINIMUM_NUMBER_OF_INPUT_SEQUENCES))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
193 # these atribudes will be set later after clustering is done
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
194 self.max_annotated_clusters = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
195 self.max_annotated_superclusters = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
196 # the atributes will be set after prerun is performed
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
197 self.prerun_ecount = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
198 self.prerun_ecount_corrected = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
199 self.sample_size = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
200 self.max_number_reads_for_clustering = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
201 self.mincln = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
202 self.number_of_omitted_reads = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
203 LOGGER.info("sampling sequences for prerun analysis")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
204 sample = seqtools.SequenceSet(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
205 source=self.input_sequences,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
206 sample_size=config.NUMBER_OF_SEQUENCES_FOR_PRERUN,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
207 paired=self.paired,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
208 filename=paths.sample_db,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
209 fasta=paths.sample_fasta,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
210 rename=True)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
211 sample.makeblastdb(legacy=args.options.legacy_database, lastdb=args.options.lastdb)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
212 # preliminary clustering
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
213 self.prerun_vcount = len(sample)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
214 # line count
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
215 self._prerun(sample, paths)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
216 # adjust size of chunks:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
217 if self.number_of_reads_for_clustering < config.CHUNK_SIZE * 30:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
218 config.CHUNK_SIZE = round(self.number_of_reads_for_clustering / 40)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
219
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
220 def _prerun(self, sample, paths):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
221 '''Preliminary characterization sequences using
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
222 clustering on small dataset - stored as sample '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
223 sample.make_chunks(chunk_size=1000)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
224 sample.create_hitsort(options=self.args.options)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
225 sample_hitsort = graphtools.Graph(source=sample.hitsort,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
226 paired=self.paired,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
227 seqids=sample.keys())
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
228 sample_hitsort.save_indexed_graph()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
229 sample_hitsort.louvain_clustering(merge_threshold=0.2)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
230 sample_hitsort.export_cls(path=paths.prerun_cls_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
231 sample.annotate(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
232 config.DNA_DATABASE,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
233 annotation_name="dna_database",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
234 directory=paths.prerun,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
235 params=self.args.options.annotation_search_params.blastn)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
236
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
237 selected_tarean_contigs = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
238 ecount_corrected = sample_hitsort.ecount
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
239 vcount_corrected = sample_hitsort.vcount
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
240 if self.args.automatic_filtering:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
241 prerun_cluster_info = sample_hitsort.export_clusters_files_multiple(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
242 min_size=10,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
243 directory=paths.prerun_clusters,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
244 sequences=sample,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
245 tRNA_database_path=config.TRNA_DATABASE,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
246 satellite_model_path=config.SATELLITE_MODEL)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
247 # check of prerun contain clusters with large number of edges
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
248 # these sequences can be used for filtering
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
249 for cl in prerun_cluster_info:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
250 print(cl.ecount, cl.vcount, sample_hitsort.ecount,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
251 cl.tandem_rank)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
252
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
253 if (cl.tandem_rank in config.TANDEM_RANKS[0:2] and
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
254 cl.ecount / sample_hitsort.ecount >
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
255 config.FILTER_MIN_PROP_THRESHOLD and
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
256 cl.vcount > config.FILTER_MIN_SIZE_THRESHOLD):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
257 selected_tarean_contigs.append(cl.tarean_contig_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
258 ecount_corrected -= cl.ecount
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
259 vcount_corrected -= cl.vcount
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
260
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
261 if selected_tarean_contigs:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
262 with open(paths.filter_sequences_file, 'w') as out:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
263 for fname in selected_tarean_contigs:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
264 with open(fname, 'r') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
265 out.write(f.read())
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
266 self.sequence_fiter = paths.filter_sequences_file
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
267 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
268 self.sequence_fiter = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
269
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
270 self.prerun_ecount = sample_hitsort.ecount
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
271 self.prerun_ecount_corrected = ecount_corrected
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
272 self.prerun_vcount_corrected = vcount_corrected
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
273 self.max_number_reads_for_clustering = round((
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
274 ((self.edges_max * self.max_memory) /
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
275 self.prerun_ecount_corrected * self.prerun_vcount**2)**(0.5)) / 2)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
276
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
277 if self.max_number_reads_for_clustering >= self.number_of_input_sequences:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
278 self.sample_size = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
279 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
280 self.sample_size = self.max_number_reads_for_clustering
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
281
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
282 n1 = self.sample_size if self.sample_size != 0 else self.number_of_input_sequences
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
283 n2 = self.args.sample if self.args.sample != 0 else self.number_of_input_sequences
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
284 self.number_of_reads_for_clustering = min(n1, n2)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
285 # minlcn is set either based on mincl or value specified in config,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
286 # whatever is higher
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
287 self.mincln = int(self.number_of_reads_for_clustering *
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
288 self.args.mincl / 100)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
289 if self.mincln < config.MINIMUM_NUMBER_OF_READS_IN_CLUSTER:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
290 self.mincln = config.MINIMUM_NUMBER_OF_READS_IN_CLUSTER
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
291
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
292 def __str__(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
293 s = "Data info------------------->\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
294 for i in dir(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
295 # do not show private
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
296 if i[:2] != "__":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
297 value = getattr(self, i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
298 if not callable(value):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
299 s += "{} : {}\n".format(i, value)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
300 s += "<----------------------Data info\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
301 return s
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
302
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
303
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
304 class DataFiles(object):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
305 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
306 stores location of data files and create directories ...
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
307 atributes are:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
308 - individual directories
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
309 - individual files
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
310 - list of files or directories
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
311
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
312 directories are created if does not exist
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
313 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
314
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
315 def __init__(self, working_dir, subdirs, files):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
316 LOGGER.info("creating directory structure")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
317 self.working_dir = working_dir
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
318 # add and create directories paths
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
319 for i in subdirs:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
320 d = os.path.join(self.working_dir, subdirs[i])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
321 os.makedirs(d, exist_ok=True)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
322 setattr(self, i, d)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
323 setattr(self, i + "__relative", subdirs[i])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
324 # add file paths
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
325 for i in files:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
326 d = os.path.join(self.working_dir, files[i])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
327 setattr(self, i, d)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
328 setattr(self, i + "__relative", files[i])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
329
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
330 def __str__(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
331 s = ""
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
332 for i in dir(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
333 # do not show private
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
334 if i[:2] != "__":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
335 value = getattr(self, i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
336 if not callable(value):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
337 s += "{} : {}\n".format(i, value)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
338 return s
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
339
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
340 def as_list(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
341 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
342 convert attr and vaues to list - suitable for passing values to R functions
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
343 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
344 values = list()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
345 keys = list()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
346 for i in dir(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
347 # do not show private
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
348 if i[:2] != "__":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
349 value = getattr(self, i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
350 if not callable(value):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
351 values.append(value)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
352 keys.append(i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
353 return [values, keys]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
354
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
355 def cleanup(self, paths):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
356 ''' will remove unnecessary files from working directory '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
357 for i in paths:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
358 fn = getattr(self, i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
359 if os.path.exists(fn):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
360 if os.path.isdir(fn):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
361 shutil.rmtree(fn, ignore_errors=False)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
362 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
363 os.remove(fn)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
364
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
365
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
366 class WrongInputDataError(Exception):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
367 '''Custom exception for wrong input
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
368 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
369
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
370 def __init__(self, arg):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
371 super(WrongInputDataError, self).__init__(arg)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
372 self.msg = arg
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
373
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
374
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
375 class Range():
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
376 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
377 This class is used to check float range in argparse
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
378 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
379
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
380 def __init__(self, start, end):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
381 self.start = start
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
382 self.end = end
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
383
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
384 def __eq__(self, other):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
385 return self.start <= other <= self.end
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
386
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
387 def __str__(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
388 return "float range {}..{}".format(self.start, self.end)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
389
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
390 def __repr__(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
391 return "float range {}..{}".format(self.start, self.end)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
392
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
393
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
394 class DirectoryType(object):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
395 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
396 this class is similar to argparse.FileType
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
397 for mode 'w' creates and check the access to the directory
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
398 for mode 'r' check the presence of the dictory and accesibility
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
399 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
400
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
401 def __init__(self, mode='r'):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
402 self._mode = mode
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
403
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
404 def __call__(self, string):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
405 if self._mode == 'w':
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
406 try:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
407 os.makedirs(string, exist_ok=True)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
408 except FileExistsError:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
409 raise argparse.ArgumentTypeError(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
410 "Cannot create directory, '{}' is a file".format(string))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
411 if os.access(string, os.W_OK):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
412 return string
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
413 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
414 raise argparse.ArgumentTypeError(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
415 "Directory '{}' is not writable".format(string))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
416 if self._mode == 'r':
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
417 if not os.path.isdir(string):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
418 raise argparse.ArgumentTypeError(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
419 "'{}' is not a directory".format(string))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
420 if os.access(string, os.R_OK):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
421 return string
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
422 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
423 raise argparse.ArgumentTypeError(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
424 "Directory '{}' is not readable".format(string))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
425
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
426
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
427 def get_cmdline_args():
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
428 '''seqclust command line parser'''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
429
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
430 description = """RepeatExplorer:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
431 Repetitive sequence discovery and clasification from NGS data
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
432
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
433 """
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
434
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
435 # arguments parsing
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
436 parser = argparse.ArgumentParser(description=description,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
437 formatter_class=RawTextHelpFormatter)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
438 parser.add_argument('-p', '--paired', action='store_true', default=False)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
439 parser.add_argument('-A',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
440 '--automatic_filtering',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
441 action='store_true',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
442 default=False)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
443 parser.add_argument(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
444 '-t',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
445 '--tarean_mode',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
446 action='store_true',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
447 default=False,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
448 help="analyze only tandem reapeats without additional classification")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
449 parser.add_argument('sequences', type=argparse.FileType('r'))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
450 parser.add_argument('-l',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
451 '--logfile',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
452 type=argparse.FileType('w'),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
453 default=None,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
454 help='log file, logging goes to stdout if not defines')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
455 parser.add_argument('-m',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
456 '--mincl',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
457 type=float,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
458 choices=[Range(0.0, 100.0)],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
459 default=0.01)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
460 parser.add_argument(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
461 '-M',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
462 '--merge_threshold',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
463 type=float,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
464 choices=[0, Range(0.1, 1)],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
465 default=0,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
466 help=
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
467 "threshold for mate-pair based cluster merging, default 0 - no merging")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
468 parser.add_argument(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
469 '-o',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
470 '--min_lcov',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
471 type=float,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
472 choices=[Range(30.0, 80.0)],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
473 default=55,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
474 help=
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
475 "minimal overlap coverage - relative to longer sequence length, default 55")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
476 parser.add_argument('-c',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
477 '--cpu',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
478 type=int,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
479 default=int(os.environ.get('TAREAN_CPU', 0)),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
480 help="number of cpu to use, if 0 use max available")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
481 parser.add_argument(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
482 '-s',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
483 '--sample',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
484 type=int,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
485 default=0,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
486 help="use only sample of input data[by default max reads is used")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
487 parser.add_argument(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
488 '-P',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
489 '--prefix_length',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
490 type=int,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
491 default=0,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
492 help=("If you wish to keep part of the sequences name,\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
493 " enter the number of characters which should be \n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
494 "kept (1-10) instead of zero. Use this setting if\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
495 " you are doing comparative analysis"))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
496 parser.add_argument('-v',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
497 '--output_dir',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
498 type=DirectoryType('w'),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
499 default="clustering_results")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
500 parser.add_argument(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
501 '-r',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
502 '--max_memory',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
503 type=int,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
504 default=int(os.environ.get('TAREAN_MAX_MEM', 0)),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
505 help=("Maximal amount of available RAM in kB if not set\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
506 "clustering tries to use whole available RAM"))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
507 parser.add_argument(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
508 '-d',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
509 '--database',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
510 default=None,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
511 help="fasta file with database for annotation and name of database",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
512 nargs=2,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
513 action='append')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
514
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
515 parser.add_argument(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
516 "-C",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
517 "--cleanup",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
518 default=False,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
519 action="store_true",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
520 help="remove unncessary large files from working directory")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
521
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
522 parser.add_argument(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
523 "-k",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
524 "--keep_names",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
525 default=False,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
526 action="store_true",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
527 help="keep sequence names, by default sequences are renamed")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
528
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
529 parser.add_argument(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
530 '-a', '--assembly_min',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
531 default=5, type=int,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
532 choices=[2,3,4,5],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
533 help=('Assembly is performed on individual clusters, by default \n'
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
534 'clusters with size less then 5 are not assembled. If you \n'
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
535 'want need assembly of smaller cluster set *assmbly_min* \n'
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
536 'accordingly\n')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
537 )
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
538
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
539 parser.add_argument('-tax',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
540 '--taxon',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
541 default=config.PROTEIN_DATABASE_DEFAULT,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
542 choices=list(config.PROTEIN_DATABASE_OPTIONS.keys()),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
543 help="Select taxon and protein database version"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
544 )
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
545
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
546 parser.add_argument(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
547 '-opt',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
548 '--options',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
549 default="ILLUMINA",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
550 choices=['ILLUMINA','ILLUMINA_DUST_OFF', 'ILLUMINA_SHORT', 'OXFORD_NANOPORE'])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
551
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
552 parser.add_argument(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
553 '-D',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
554 '--domain_search',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
555 default="BLASTX_W3",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
556 choices=['BLASTX_W2', 'BLASTX_W3', 'DIAMOND'],
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
557 help=
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
558 ('Detection of protein domains can be performed by either blastx or\n'
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
559 ' diamond" program. options are:\n'
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
560 ' BLASTX_W2 - blastx with word size 2 (slowest, the most sesitive)\n'
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
561 ' BLASTX_W3 - blastx with word size 3 (default)\n'
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
562 ' DIAMOND - diamond program (significantly faster, less sensitive)\n'
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
563 'To use this option diamond program must be installed in your PATH'))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
564
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
565 args = parser.parse_args()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
566
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
567 # covert option string to namedtuple of options
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
568 args.options = getattr(config, args.options)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
569 # set protein database
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
570 args.options = args.options._replace(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
571 annotation_search_params=
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
572 args.options.annotation_search_params._replace(blastx=getattr(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
573 config, args.domain_search)))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
574 return args
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
575
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
576
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
577 def main():
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
578 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
579 Perform graph based clustering
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
580 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
581 # argument parsing:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
582 args = get_cmdline_args()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
583 config.ARGS = args
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
584 logfile = args.logfile.name if args.logfile else None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
585 logging.basicConfig(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
586 filename=logfile,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
587 format='\n%(asctime)s - %(name)s - %(levelname)s -\n%(message)s\n',
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
588 level=logging.INFO)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
589 config.PROTEIN_DATABASE, config.CLASSIFICATION_HIERARCHY = config.PROTEIN_DATABASE_OPTIONS[
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
590 args.taxon]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
591 # number of CPU to use
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
592 pipeline_version_info = get_version(config.MAIN_DIR, tarean_mode = args.tarean_mode)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
593 config.PROC = args.cpu if args.cpu != 0 else multiprocessing.cpu_count()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
594 # TODO add kmer range specification to config - based on the technology
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
595 r2py.create_connection()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
596 try:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
597 reporting = r2py.R(config.RSOURCE_reporting, verbose=True)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
598 create_annotation = r2py.R(config.RSOURCE_create_annotation,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
599 verbose=True)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
600 LOGGER.info(args)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
601 paths = DataFiles(working_dir=args.output_dir,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
602 subdirs=config.DIRECTORY_TREE,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
603 files=config.FILES)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
604 # files to be included in output
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
605 for src, dest in config.INCLUDE:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
606 shutil.copy(src, os.path.join(paths.working_dir, dest))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
607 # geting information about data
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
608 run_info = DataInfo(args, paths)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
609 LOGGER.info(run_info)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
610 LOGGER.info(show_object(config))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
611 # load all sequences or sample
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
612 sequences = seqtools.SequenceSet(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
613 source=run_info.input_sequences,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
614 sample_size=run_info.number_of_reads_for_clustering,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
615 paired=run_info.paired,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
616 filename=paths.sequences_db,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
617 fasta=paths.sequences_fasta,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
618 prefix_length=run_info.prefix_length,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
619 rename=not run_info.args.keep_names)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
620 if run_info.sequence_fiter:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
621 n = sequences.remove_sequences_using_filter(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
622 run_info.sequence_fiter,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
623 keep_proportion=config.FILTER_PROPORTION_OF_KEPT,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
624 omitted_sequences_file=paths.filter_omitted,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
625 kept_sequences_file=paths.filter_kept
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
626 )
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
627 run_info.number_of_omitted_reads = n
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
628 # add custom databases if provided
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
629 if args.database:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
630 config.CUSTOM_DNA_DATABASE = add_databases(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
631 args.database,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
632 custom_databases_dir=paths.custom_databases)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
633 sequences.makeblastdb(legacy=args.options.legacy_database, lastdb=args.options.lastdb)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
634 LOGGER.info("chunksize: {}".format(config.CHUNK_SIZE))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
635 sequences.make_chunks(chunk_size=config.CHUNK_SIZE)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
636 sequences.create_hitsort(output=paths.hitsort, options=args.options)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
637 hitsort = graphtools.Graph(filename=paths.hitsort_db,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
638 source=paths.hitsort,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
639 paired=run_info.paired,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
640 seqids=sequences.keys())
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
641
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
642 LOGGER.info('hitsort with {} reads and {} edges loaded.'.format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
643 hitsort.vcount, hitsort.ecount))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
644
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
645 hitsort.save_indexed_graph()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
646 LOGGER.info('hitsort index created.')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
647
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
648 hitsort.louvain_clustering(merge_threshold=args.merge_threshold,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
649 cleanup=args.cleanup)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
650 hitsort.export_cls(path=paths.cls_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
651 hitsort.adjust_cluster_size(config.FILTER_PROPORTION_OF_KEPT,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
652 sequences.ids_kept)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
653 sequences.annotate(config.DNA_DATABASE,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
654 annotation_name="dna_database",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
655 directory=paths.blastn,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
656 params=args.options.annotation_search_params.blastn)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
657
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
658 if config.CUSTOM_DNA_DATABASE:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
659 LOGGER.info('annotating with custom database')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
660 for db, db_name in config.CUSTOM_DNA_DATABASE:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
661 sequences.annotate(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
662 db,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
663 annotation_name=db_name,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
664 directory=paths.blastn,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
665 params=args.options.annotation_search_params.blastn)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
666
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
667 if not args.tarean_mode:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
668 # additional analyses - full RE run
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
669 # this must be finished befor creating clusters_info
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
670 sequences.annotate(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
671 config.PROTEIN_DATABASE,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
672 annotation_name="protein_database",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
673 directory=paths.blastx,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
674 params=args.options.annotation_search_params.blastx)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
675
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
676 ## annotating using customa databasesreplace
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
677 LOGGER.info('creating cluster graphs')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
678 clusters_info = hitsort.export_clusters_files_multiple(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
679 min_size=run_info.mincln,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
680 directory=paths.clusters,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
681 sequences=sequences,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
682 tRNA_database_path=config.TRNA_DATABASE,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
683 satellite_model_path=config.SATELLITE_MODEL)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
684 if not args.tarean_mode:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
685 LOGGER.info("assembling..")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
686 assembly_tools.assembly(sequences,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
687 hitsort,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
688 clusters_info,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
689 assembly_dir=paths.assembly,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
690 contigs_file=paths.contigs,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
691 min_size_of_cluster_for_assembly=args.assembly_min)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
692
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
693 LOGGER.info("detecting LTR in assembly..")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
694 for i in clusters_info:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
695 i.detect_ltr(config.TRNA_DATABASE)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
696
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
697 run_info.max_annotated_clusters = max([i.index for i in clusters_info])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
698 run_info.max_annotated_superclusters = max([i.supercluster
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
699 for i in clusters_info])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
700 # make reports
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
701 cluster_listing = [i.listing() for i in clusters_info]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
702 # make path relative to paths.cluster_info
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
703 utils.save_as_table(cluster_listing, paths.clusters_info)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
704 # creates table cluster_info in hitsort database
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
705 graphtools.Cluster.add_cluster_table_to_database(cluster_listing,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
706 paths.hitsort_db)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
707 # export files for consensus sequences, one for each ranks
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
708 consensus_files = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
709 for i in config.TANDEM_RANKS:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
710 consensus_files.append(utils.export_tandem_consensus(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
711 clusters_info,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
712 path=paths.TR_consensus_fasta.format(i),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
713 rank=i))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
714
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
715 if not args.tarean_mode:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
716 LOGGER.info("Creating report for superclusters")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
717 create_annotation.create_all_superclusters_report(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
718 max_supercluster=run_info.max_annotated_superclusters,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
719 paths=paths.as_list(),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
720 libdir=paths.libdir,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
721 superclusters_dir=paths.superclusters,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
722 seqdb=paths.sequences_db,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
723 hitsortdb=paths.hitsort_db,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
724 classification_hierarchy_file=config.CLASSIFICATION_HIERARCHY,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
725 HTML_LINKS=dict2lists(config.HTML_LINKS))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
726
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
727 LOGGER.info("Creating report for individual clusters")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
728 for cluster in clusters_info:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
729 create_annotation.create_cluster_report(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
730 cluster.index,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
731 seqdb=paths.sequences_db,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
732 hitsortdb=paths.hitsort_db,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
733 classification_hierarchy_file=
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
734 config.CLASSIFICATION_HIERARCHY,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
735 HTML_LINKS=dict2lists(config.HTML_LINKS))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
736
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
737 LOGGER.info("Creating main html report")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
738 reporting.create_main_reports(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
739 paths=paths.as_list(),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
740 N_clustering=run_info.number_of_reads_for_clustering,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
741 N_input=run_info.number_of_input_sequences,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
742 N_omit=run_info.number_of_omitted_reads,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
743 merge_threshold=args.merge_threshold,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
744 paired=run_info.paired,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
745 consensus_files=consensus_files,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
746 custom_db=bool(config.CUSTOM_DNA_DATABASE),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
747 tarean_mode=args.tarean_mode,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
748 HTML_LINKS=dict2lists(config.HTML_LINKS),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
749 pipeline_version_info=pipeline_version_info,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
750 max_memory=run_info.max_memory,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
751 max_number_reads_for_clustering=run_info.max_number_reads_for_clustering,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
752 mincln=run_info.mincln
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
753 )
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
754
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
755 LOGGER.info("Html report reports created")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
756
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
757 except:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
758 r2py.shutdown(config.RSERVE_PORT)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
759 raise
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
760 finally:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
761 if args.cleanup:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
762 paths.cleanup(config.FILES_TO_DISCARD_AT_CLEANUP)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
763 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
764 LOGGER.info("copy databases to working directory")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
765 shutil.copy(paths.sequences_db, paths.working_dir)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
766 shutil.copy(paths.hitsort_db, paths.working_dir)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
767 # copy log file inside working directory
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
768 if logfile:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
769 shutil.copyfile(logfile, paths.logfile)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
770
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
771
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
772 if __name__ == "__main__":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
773 main()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
774 # some error handling here: