seqclust @ 0:1d1b9e1b2e2f (draft)

description: Uploaded
author: petr-novak
date: Thu, 19 Dec 2019 10:24:45 -0500
#!/usr/bin/env python3
''' TAndem REpeat ANalyzer '''
import os
import sys
import shutil
import subprocess
import argparse
from argparse import RawTextHelpFormatter
import logging
import shlex
import multiprocessing
# config must be loaded before seqtools, ...
import config
import re
from lib import seqtools, graphtools, utils, assembly_tools
from lib import r2py

REQUIRED_VERSION = (3, 4)
if sys.version_info < REQUIRED_VERSION:
    raise Exception("\n\npython 3.4 or higher is required!\n")

# append path to louvain clustering and other binaries
os.environ['PATH'] = "{}:{}:{}".format(config.BINARIES, config.LOUVAIN,
                                       os.environ['PATH'])

LOGGER = logging.getLogger(__name__)


def get_version(path, tarean_mode):
    try:
        branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
                                         shell=True,
                                         cwd=path).decode('ascii').strip()
        shorthash = subprocess.check_output(
            "git log --pretty=format:'%h' -n 1 ",
            shell=True,
            cwd=path).decode('ascii').strip()
        revcount = len(subprocess.check_output(
            "git log --oneline", shell=True,
            cwd=path).decode('ascii').split())
        try:
            tag = subprocess.check_output("git describe --tags --abbrev=0",
                                          cwd=path,
                                          shell=True).decode('ascii').strip()
        except subprocess.CalledProcessError:
            tag = " "

        version_string = (
            "-------------------------------------"
            "-------------------------------------\n"
            "PIPELINE VERSION : "
            "{branch}-{tag}-{revcount}({shorthash})\n\n"
            "PROTEIN DATABASE VERSION : {PD}\n"
            " md5 checksum : {PDmd5}\n\n"
            "DNA DATABASE VERSION : {DD}\n"
            " md5 checksum : {DDmd5}\n"
            "-------------------------------------"
            "-------------------------------------\n").format(
                branch=branch,
                shorthash=shorthash,
                revcount=revcount,
                tag=tag,
                PD=os.path.basename(config.PROTEIN_DATABASE),
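                # in tarean mode a missing protein DB file is tolerated
                # (fail_if_missing=False), since only tandem repeats are
                # analyzed and the protein database is not needed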
                PDmd5=utils.md5checksum(config.PROTEIN_DATABASE + ".psq",
                                        fail_if_missing=not tarean_mode),
                DD=os.path.basename(config.DNA_DATABASE),
                DDmd5=utils.md5checksum(config.DNA_DATABASE + ".nsq"))

    except subprocess.CalledProcessError:
        version_string = "version of pipeline not available!"
    LOGGER.info(version_string)
    return version_string


def valid_database(database_file):
    with open(database_file, 'r', encoding='ascii') as f:
        for i in f:
            if i[0] == ">":
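                # fasta headers are expected in the form
                # '>repeatname#class/subclass' (the format described in the
                # error message in add_databases below)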
                if not re.match(">.+#.+/*", i):
                    # TODO - automatically correct the formatting of custom databases?
                    return False
    return True


def add_databases(databases, custom_databases_dir, dbtype='nucl'):
    '''Custom databases are copied into the directory tree and a blast
    database is created using makeblastdb.
    '''

    databases_ok = []
    print(databases)
    for db_path, db_name in databases:
        db_destination = "{}/{}".format(custom_databases_dir, db_name)
        shutil.copyfile(db_path, db_destination)
        if not valid_database(db_destination):
            raise ValueError((
                "\n------------------------------------------------------------\n"
                "Custom database is not valid!\n"
                "A custom database of repeats consists of DNA sequences in fasta format.\n"
                "The required format for IDs in a custom library is:\n"
                "   '>repeatname#class/subclass'\n"
                "Reformat the database and try again!\n"
                "-------------------------------------------------------------\n\n"
            ))

        cmd = "makeblastdb -in {0} -out {0} -dbtype {1}".format(db_destination,
                                                                dbtype)
        print(cmd)
        args = shlex.split(cmd)
        print(args)
        # subprocess.call returns the exit status; check_call would raise
        # instead of returning non-zero, leaving the failure branch dead
        if subprocess.call(args, stderr=sys.stdout):
            LOGGER.warning("makeblastdb on %s failed", db_name)
        else:
            databases_ok.append([db_destination, "custom_db_" + db_name])
    if len(databases_ok) == 0:
        return None
    else:
        return databases_ok


def meminfo():
    '''Detect physical memory and memory usage (Linux only: values are
    read from /proc/meminfo and reported in kB).'''
    info = {}
    required_fields = [
        'MemTotal:', 'MemFree:', 'Cached:', 'SwapCached:', 'Buffers:'
    ]
    with open('/proc/meminfo', 'r') as f:
        for i in f:
            a = i.split()
            if a[0] in required_fields:
                info[a[0]] = int(a[1])
    return info


def dict2lists(d):
    ''' convert dict to nested list
    use this function to pass a dictionary to an R function
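
    Illustrative example (dicts preserve insertion order in Python 3.7+;
    on older interpreters the order of the two lists may differ):
    >>> dict2lists({'a': 1, 'b': 2})
    [[1, 2], ['a', 'b']]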
    '''
    values = list(d.values())
    keys = list(d.keys())
    return [values, keys]


def show_object(obj):
    '''
    Helper function for printing all public attributes;
    callable attributes (i.e. methods) are not printed.
    '''

    s = "Configuration--------------->\n"
    for i in dir(obj):
        # do not show private attributes
        if i[:2] != "__":
            value = getattr(obj, i)
            if not callable(value):
                s += "{} : {}\n".format(i, value)
    s += "<---------------configuration\n"
    return s


class DataInfo():
    '''
    Stores information about the state of clustering and the data.
    '''

    def __init__(self, args, paths):
        LOGGER.info("getting information about input sequences")
        self.args = args
        self.working_directory = args.output_dir
        self.input_sequences = args.sequences.name
        self.number_of_input_sequences = seqtools.SequenceSet.fasta_length(
            self.input_sequences)
        self.paired = args.paired
        self.prefix_length = args.prefix_length
        self.physical_memory = meminfo()['MemTotal:']
        self.edges_max = config.EMAX
        # set max memory
        if args.max_memory:
            self.max_memory = args.max_memory
        else:
            self.max_memory = meminfo()["MemTotal:"]
        # modify the initial setup if the number of sequences is low
        if args.automatic_filtering:
            config.NUMBER_OF_SEQUENCES_FOR_PRERUN = config.NUMBER_OF_SEQUENCES_FOR_PRERUN_WITH_FILTERING

        if self.number_of_input_sequences < config.NUMBER_OF_SEQUENCES_FOR_PRERUN:
            config.NUMBER_OF_SEQUENCES_FOR_PRERUN = self.number_of_input_sequences

        # is the number of input sequences sufficient?
        if self.number_of_input_sequences < config.MINIMUM_NUMBER_OF_INPUT_SEQUENCES:
            raise WrongInputDataError(
                "provide more sequences for clustering, minimum {} is required".format(
                    config.MINIMUM_NUMBER_OF_INPUT_SEQUENCES))
        # these attributes will be set later, after clustering is done
        self.max_annotated_clusters = None
        self.max_annotated_superclusters = None
        # these attributes will be set after the prerun is performed
        self.prerun_ecount = None
        self.prerun_ecount_corrected = None
        self.sample_size = None
        self.max_number_reads_for_clustering = None
        self.mincln = None
        self.number_of_omitted_reads = 0
        LOGGER.info("sampling sequences for prerun analysis")
        sample = seqtools.SequenceSet(
            source=self.input_sequences,
            sample_size=config.NUMBER_OF_SEQUENCES_FOR_PRERUN,
            paired=self.paired,
            filename=paths.sample_db,
            fasta=paths.sample_fasta,
            rename=True)
        sample.makeblastdb(legacy=args.options.legacy_database,
                           lastdb=args.options.lastdb)
        # preliminary clustering
        self.prerun_vcount = len(sample)  # line count
        self._prerun(sample, paths)
        # adjust the size of chunks:
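        # if the input would fill fewer than ~30 default-sized chunks,
        # shrink the chunk size so the reads are still split into about
        # 40 chunks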
        if self.number_of_reads_for_clustering < config.CHUNK_SIZE * 30:
            config.CHUNK_SIZE = round(self.number_of_reads_for_clustering / 40)

    def _prerun(self, sample, paths):
        '''Preliminary characterization of sequences using
        clustering on a small dataset - stored as sample '''
        sample.make_chunks(chunk_size=1000)
        sample.create_hitsort(options=self.args.options)
        sample_hitsort = graphtools.Graph(source=sample.hitsort,
                                          paired=self.paired,
                                          seqids=sample.keys())
        sample_hitsort.save_indexed_graph()
        sample_hitsort.louvain_clustering(merge_threshold=0.2)
        sample_hitsort.export_cls(path=paths.prerun_cls_file)
        sample.annotate(
            config.DNA_DATABASE,
            annotation_name="dna_database",
            directory=paths.prerun,
            params=self.args.options.annotation_search_params.blastn)

        selected_tarean_contigs = []
        ecount_corrected = sample_hitsort.ecount
        vcount_corrected = sample_hitsort.vcount
        if self.args.automatic_filtering:
            prerun_cluster_info = sample_hitsort.export_clusters_files_multiple(
                min_size=10,
                directory=paths.prerun_clusters,
                sequences=sample,
                tRNA_database_path=config.TRNA_DATABASE,
                satellite_model_path=config.SATELLITE_MODEL)
            # check whether the prerun contains clusters with a large number
            # of edges - sequences from such clusters can be used for filtering
            for cl in prerun_cluster_info:
                print(cl.ecount, cl.vcount, sample_hitsort.ecount,
                      cl.tandem_rank)

                if (cl.tandem_rank in config.TANDEM_RANKS[0:2] and
                        cl.ecount / sample_hitsort.ecount >
                        config.FILTER_MIN_PROP_THRESHOLD and
                        cl.vcount > config.FILTER_MIN_SIZE_THRESHOLD):
                    selected_tarean_contigs.append(cl.tarean_contig_file)
                    ecount_corrected -= cl.ecount
                    vcount_corrected -= cl.vcount

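        # contigs of the selected clusters are concatenated into a single
        # filter file, used later to omit reads from the main clustering
        # (see remove_sequences_using_filter in main)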
        if selected_tarean_contigs:
            with open(paths.filter_sequences_file, 'w') as out:
                for fname in selected_tarean_contigs:
                    with open(fname, 'r') as f:
                        out.write(f.read())
            self.sequence_filter = paths.filter_sequences_file
        else:
            self.sequence_filter = None

        self.prerun_ecount = sample_hitsort.ecount
        self.prerun_ecount_corrected = ecount_corrected
        self.prerun_vcount_corrected = vcount_corrected
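        # Rough rationale (an interpretation, not documented in the source):
        # edge counts grow ~quadratically with read numbers, so the expected
        # edges for n reads are ~ prerun_ecount_corrected * (n / prerun_vcount)**2;
        # solving that against the memory budget edges_max * max_memory for n,
        # halved as a safety margin, gives the cap below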
        self.max_number_reads_for_clustering = round((
            ((self.edges_max * self.max_memory) /
             self.prerun_ecount_corrected * self.prerun_vcount**2)**(0.5)) / 2)

        if self.max_number_reads_for_clustering >= self.number_of_input_sequences:
            self.sample_size = 0
        else:
            self.sample_size = self.max_number_reads_for_clustering

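        # a sample size of 0 means "no limit"; the number of reads used for
        # clustering is the smaller of the memory-derived cap (n1) and the
        # user-requested sample (n2)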
        n1 = self.sample_size if self.sample_size != 0 else self.number_of_input_sequences
        n2 = self.args.sample if self.args.sample != 0 else self.number_of_input_sequences
        self.number_of_reads_for_clustering = min(n1, n2)
        # mincln is set either based on mincl or on the value specified in
        # config, whichever is higher
        self.mincln = int(self.number_of_reads_for_clustering *
                          self.args.mincl / 100)
        if self.mincln < config.MINIMUM_NUMBER_OF_READS_IN_CLUSTER:
            self.mincln = config.MINIMUM_NUMBER_OF_READS_IN_CLUSTER

    def __str__(self):
        s = "Data info------------------->\n"
        for i in dir(self):
            # do not show private attributes
            if i[:2] != "__":
                value = getattr(self, i)
                if not callable(value):
                    s += "{} : {}\n".format(i, value)
        s += "<----------------------Data info\n"
        return s


class DataFiles(object):
    '''
    Stores locations of data files and creates directories.
    Attributes are:
    - individual directories
    - individual files
    - lists of files or directories

    Directories are created if they do not exist.
    '''

    def __init__(self, working_dir, subdirs, files):
        LOGGER.info("creating directory structure")
        self.working_dir = working_dir
        # add and create directory paths
        for i in subdirs:
            d = os.path.join(self.working_dir, subdirs[i])
            os.makedirs(d, exist_ok=True)
            setattr(self, i, d)
            setattr(self, i + "__relative", subdirs[i])
        # add file paths
        for i in files:
            d = os.path.join(self.working_dir, files[i])
            setattr(self, i, d)
            setattr(self, i + "__relative", files[i])

    def __str__(self):
        s = ""
        for i in dir(self):
            # do not show private attributes
            if i[:2] != "__":
                value = getattr(self, i)
                if not callable(value):
                    s += "{} : {}\n".format(i, value)
        return s

    def as_list(self):
        '''
        Convert attributes and values to a list - suitable for passing
        values to R functions.
        '''
        values = list()
        keys = list()
        for i in dir(self):
            # do not show private attributes
            if i[:2] != "__":
                value = getattr(self, i)
                if not callable(value):
                    values.append(value)
                    keys.append(i)
        return [values, keys]

    def cleanup(self, paths):
        ''' remove unnecessary files from the working directory '''
        for i in paths:
            fn = getattr(self, i)
            if os.path.exists(fn):
                if os.path.isdir(fn):
                    shutil.rmtree(fn, ignore_errors=False)
                else:
                    os.remove(fn)


class WrongInputDataError(Exception):
    '''Custom exception for wrong input
    '''

    def __init__(self, arg):
        super(WrongInputDataError, self).__init__(arg)
        self.msg = arg


class Range():
    '''
    This class is used to check float ranges in argparse.
    '''

    def __init__(self, start, end):
        self.start = start
        self.end = end

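    # argparse validates a value by testing `value in choices`, which
    # compares list items with ==; a Range instance therefore acts as an
    # interval test, e.g. choices=[Range(0.0, 100.0)] accepts any float
    # in [0, 100]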
    def __eq__(self, other):
        return self.start <= other <= self.end

    def __str__(self):
        return "float range {}..{}".format(self.start, self.end)

    def __repr__(self):
        return "float range {}..{}".format(self.start, self.end)


class DirectoryType(object):
    '''
    This class is similar to argparse.FileType.
    For mode 'w' it creates the directory and checks write access;
    for mode 'r' it checks the presence and readability of the directory.
    '''

    def __init__(self, mode='r'):
        self._mode = mode

    def __call__(self, string):
        if self._mode == 'w':
            try:
                os.makedirs(string, exist_ok=True)
            except FileExistsError:
                raise argparse.ArgumentTypeError(
                    "Cannot create directory, '{}' is a file".format(string))
            if os.access(string, os.W_OK):
                return string
            else:
                raise argparse.ArgumentTypeError(
                    "Directory '{}' is not writable".format(string))
        if self._mode == 'r':
            if not os.path.isdir(string):
                raise argparse.ArgumentTypeError(
                    "'{}' is not a directory".format(string))
            if os.access(string, os.R_OK):
                return string
            else:
                raise argparse.ArgumentTypeError(
                    "Directory '{}' is not readable".format(string))


def get_cmdline_args():
    '''seqclust command line parser'''

    description = """RepeatExplorer:
    Repetitive sequence discovery and classification from NGS data

    """

    # argument parsing
    parser = argparse.ArgumentParser(description=description,
                                     formatter_class=RawTextHelpFormatter)
    parser.add_argument('-p', '--paired', action='store_true', default=False)
    parser.add_argument('-A',
                        '--automatic_filtering',
                        action='store_true',
                        default=False)
    parser.add_argument(
        '-t',
        '--tarean_mode',
        action='store_true',
        default=False,
        help="analyze only tandem repeats without additional classification")
    parser.add_argument('sequences', type=argparse.FileType('r'))
    parser.add_argument('-l',
                        '--logfile',
                        type=argparse.FileType('w'),
                        default=None,
                        help='log file; logging goes to stdout if not defined')
    parser.add_argument('-m',
                        '--mincl',
                        type=float,
                        choices=[Range(0.0, 100.0)],
                        default=0.01)
    parser.add_argument(
        '-M',
        '--merge_threshold',
        type=float,
        choices=[0, Range(0.1, 1)],
        default=0,
        help=
        "threshold for mate-pair based cluster merging, default 0 - no merging")
    parser.add_argument(
        '-o',
        '--min_lcov',
        type=float,
        choices=[Range(30.0, 80.0)],
        default=55,
        help=
        "minimal overlap coverage - relative to longer sequence length, default 55")
    parser.add_argument('-c',
                        '--cpu',
                        type=int,
                        default=int(os.environ.get('TAREAN_CPU', 0)),
                        help="number of cpu to use, if 0 use max available")
    parser.add_argument(
        '-s',
        '--sample',
        type=int,
        default=0,
        help="use only a sample of the input data (by default the maximum usable number of reads is used)")
    parser.add_argument(
        '-P',
        '--prefix_length',
        type=int,
        default=0,
        help=("If you wish to keep part of the sequence names,\n"
              "enter the number of characters which should be\n"
              "kept (1-10) instead of zero. Use this setting if\n"
              "you are doing comparative analysis."))
    parser.add_argument('-v',
                        '--output_dir',
                        type=DirectoryType('w'),
                        default="clustering_results")
    parser.add_argument(
        '-r',
        '--max_memory',
        type=int,
        default=int(os.environ.get('TAREAN_MAX_MEM', 0)),
        help=("Maximal amount of available RAM in kB; if not set,\n"
              "clustering tries to use the whole available RAM"))
    parser.add_argument(
        '-d',
        '--database',
        default=None,
        help="fasta file with database for annotation and name of database",
        nargs=2,
        action='append')

    parser.add_argument(
        "-C",
        "--cleanup",
        default=False,
        action="store_true",
        help="remove unnecessary large files from the working directory")

    parser.add_argument(
        "-k",
        "--keep_names",
        default=False,
        action="store_true",
        help="keep sequence names; by default sequences are renamed")

    parser.add_argument(
        '-a', '--assembly_min',
        default=5, type=int,
        choices=[2, 3, 4, 5],
        help=('Assembly is performed on individual clusters; by default,\n'
              'clusters with fewer than 5 reads are not assembled. If you\n'
              'need assembly of smaller clusters, set *assembly_min*\n'
              'accordingly.\n')
    )

    parser.add_argument('-tax',
                        '--taxon',
                        default=config.PROTEIN_DATABASE_DEFAULT,
                        choices=list(config.PROTEIN_DATABASE_OPTIONS.keys()),
                        help="Select taxon and protein database version"
                        )

    parser.add_argument(
        '-opt',
        '--options',
        default="ILLUMINA",
        choices=['ILLUMINA', 'ILLUMINA_DUST_OFF', 'ILLUMINA_SHORT', 'OXFORD_NANOPORE'])

    parser.add_argument(
        '-D',
        '--domain_search',
        default="BLASTX_W3",
        choices=['BLASTX_W2', 'BLASTX_W3', 'DIAMOND'],
        help=
        ('Detection of protein domains can be performed by either the blastx\n'
         'or the diamond program. Options are:\n'
         '  BLASTX_W2 - blastx with word size 2 (slowest, the most sensitive)\n'
         '  BLASTX_W3 - blastx with word size 3 (default)\n'
         '  DIAMOND   - diamond program (significantly faster, less sensitive)\n'
         'To use the DIAMOND option, the diamond program must be installed in your PATH'))

    args = parser.parse_args()

    # convert the option string to a namedtuple of options
    args.options = getattr(config, args.options)
    # set protein database
    args.options = args.options._replace(
        annotation_search_params=
        args.options.annotation_search_params._replace(blastx=getattr(
            config, args.domain_search)))
    return args


def main():
    '''
    Perform graph based clustering.
    '''
    # argument parsing:
    args = get_cmdline_args()
    config.ARGS = args
    logfile = args.logfile.name if args.logfile else None
    logging.basicConfig(
        filename=logfile,
        format='\n%(asctime)s - %(name)s - %(levelname)s -\n%(message)s\n',
        level=logging.INFO)
    config.PROTEIN_DATABASE, config.CLASSIFICATION_HIERARCHY = config.PROTEIN_DATABASE_OPTIONS[
        args.taxon]
    pipeline_version_info = get_version(config.MAIN_DIR, tarean_mode=args.tarean_mode)
    # number of CPUs to use
    config.PROC = args.cpu if args.cpu != 0 else multiprocessing.cpu_count()
    # TODO: add kmer range specification to config - based on the technology
    r2py.create_connection()
    try:
        reporting = r2py.R(config.RSOURCE_reporting, verbose=True)
        create_annotation = r2py.R(config.RSOURCE_create_annotation,
                                   verbose=True)
        LOGGER.info(args)
        paths = DataFiles(working_dir=args.output_dir,
                          subdirs=config.DIRECTORY_TREE,
                          files=config.FILES)
        # files to be included in the output
        for src, dest in config.INCLUDE:
            shutil.copy(src, os.path.join(paths.working_dir, dest))
        # getting information about the data
        run_info = DataInfo(args, paths)
        LOGGER.info(run_info)
        LOGGER.info(show_object(config))
        # load all sequences or a sample
        sequences = seqtools.SequenceSet(
            source=run_info.input_sequences,
            sample_size=run_info.number_of_reads_for_clustering,
            paired=run_info.paired,
            filename=paths.sequences_db,
            fasta=paths.sequences_fasta,
            prefix_length=run_info.prefix_length,
            rename=not run_info.args.keep_names)
        if run_info.sequence_filter:
            n = sequences.remove_sequences_using_filter(
                run_info.sequence_filter,
                keep_proportion=config.FILTER_PROPORTION_OF_KEPT,
                omitted_sequences_file=paths.filter_omitted,
                kept_sequences_file=paths.filter_kept
            )
            run_info.number_of_omitted_reads = n
        # add custom databases if provided
        if args.database:
            config.CUSTOM_DNA_DATABASE = add_databases(
                args.database,
                custom_databases_dir=paths.custom_databases)
        sequences.makeblastdb(legacy=args.options.legacy_database,
                              lastdb=args.options.lastdb)
        LOGGER.info("chunksize: {}".format(config.CHUNK_SIZE))
        sequences.make_chunks(chunk_size=config.CHUNK_SIZE)
        sequences.create_hitsort(output=paths.hitsort, options=args.options)
        hitsort = graphtools.Graph(filename=paths.hitsort_db,
                                   source=paths.hitsort,
                                   paired=run_info.paired,
                                   seqids=sequences.keys())

        LOGGER.info('hitsort with {} reads and {} edges loaded.'.format(
            hitsort.vcount, hitsort.ecount))

        hitsort.save_indexed_graph()
        LOGGER.info('hitsort index created.')

        hitsort.louvain_clustering(merge_threshold=args.merge_threshold,
                                   cleanup=args.cleanup)
        hitsort.export_cls(path=paths.cls_file)
        hitsort.adjust_cluster_size(config.FILTER_PROPORTION_OF_KEPT,
                                    sequences.ids_kept)
        sequences.annotate(config.DNA_DATABASE,
                           annotation_name="dna_database",
                           directory=paths.blastn,
                           params=args.options.annotation_search_params.blastn)

        if config.CUSTOM_DNA_DATABASE:
            LOGGER.info('annotating with custom database')
            for db, db_name in config.CUSTOM_DNA_DATABASE:
                sequences.annotate(
                    db,
                    annotation_name=db_name,
                    directory=paths.blastn,
                    params=args.options.annotation_search_params.blastn)

        if not args.tarean_mode:
            # additional analyses - full RE run
            # this must be finished before creating clusters_info
            sequences.annotate(
                config.PROTEIN_DATABASE,
                annotation_name="protein_database",
                directory=paths.blastx,
                params=args.options.annotation_search_params.blastx)

        LOGGER.info('creating cluster graphs')
        clusters_info = hitsort.export_clusters_files_multiple(
            min_size=run_info.mincln,
            directory=paths.clusters,
            sequences=sequences,
            tRNA_database_path=config.TRNA_DATABASE,
            satellite_model_path=config.SATELLITE_MODEL)
        if not args.tarean_mode:
            LOGGER.info("assembling..")
            assembly_tools.assembly(sequences,
                                    hitsort,
                                    clusters_info,
                                    assembly_dir=paths.assembly,
                                    contigs_file=paths.contigs,
                                    min_size_of_cluster_for_assembly=args.assembly_min)

            LOGGER.info("detecting LTR in assembly..")
            for i in clusters_info:
                i.detect_ltr(config.TRNA_DATABASE)

        run_info.max_annotated_clusters = max([i.index for i in clusters_info])
        run_info.max_annotated_superclusters = max([i.supercluster
                                                    for i in clusters_info])
        # make reports
        cluster_listing = [i.listing() for i in clusters_info]
        # make paths relative to paths.cluster_info
        utils.save_as_table(cluster_listing, paths.clusters_info)
        # create the cluster_info table in the hitsort database
        graphtools.Cluster.add_cluster_table_to_database(cluster_listing,
                                                         paths.hitsort_db)
        # export files with consensus sequences, one for each rank
        consensus_files = []
        for i in config.TANDEM_RANKS:
            consensus_files.append(utils.export_tandem_consensus(
                clusters_info,
                path=paths.TR_consensus_fasta.format(i),
                rank=i))

        if not args.tarean_mode:
            LOGGER.info("Creating report for superclusters")
            create_annotation.create_all_superclusters_report(
                max_supercluster=run_info.max_annotated_superclusters,
                paths=paths.as_list(),
                libdir=paths.libdir,
                superclusters_dir=paths.superclusters,
                seqdb=paths.sequences_db,
                hitsortdb=paths.hitsort_db,
                classification_hierarchy_file=config.CLASSIFICATION_HIERARCHY,
                HTML_LINKS=dict2lists(config.HTML_LINKS))

            LOGGER.info("Creating report for individual clusters")
            for cluster in clusters_info:
                create_annotation.create_cluster_report(
                    cluster.index,
                    seqdb=paths.sequences_db,
                    hitsortdb=paths.hitsort_db,
                    classification_hierarchy_file=
                    config.CLASSIFICATION_HIERARCHY,
                    HTML_LINKS=dict2lists(config.HTML_LINKS))

        LOGGER.info("Creating main html report")
        reporting.create_main_reports(
            paths=paths.as_list(),
            N_clustering=run_info.number_of_reads_for_clustering,
            N_input=run_info.number_of_input_sequences,
            N_omit=run_info.number_of_omitted_reads,
            merge_threshold=args.merge_threshold,
            paired=run_info.paired,
            consensus_files=consensus_files,
            custom_db=bool(config.CUSTOM_DNA_DATABASE),
            tarean_mode=args.tarean_mode,
            HTML_LINKS=dict2lists(config.HTML_LINKS),
            pipeline_version_info=pipeline_version_info,
            max_memory=run_info.max_memory,
            max_number_reads_for_clustering=run_info.max_number_reads_for_clustering,
            mincln=run_info.mincln
        )

        LOGGER.info("HTML reports created")

    except:
        # shut down the Rserve connection before re-raising
        r2py.shutdown(config.RSERVE_PORT)
        raise
    finally:
        if args.cleanup:
            paths.cleanup(config.FILES_TO_DISCARD_AT_CLEANUP)
        else:
            LOGGER.info("copying databases to the working directory")
            shutil.copy(paths.sequences_db, paths.working_dir)
            shutil.copy(paths.hitsort_db, paths.working_dir)
        # copy the log file into the working directory
        if logfile:
            shutil.copyfile(logfile, paths.logfile)


if __name__ == "__main__":
    main()
    # some error handling here:
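
# Example invocation (illustrative; the flags are defined in
# get_cmdline_args above):
#   seqclust --paired --cpu 8 --output_dir clustering_results reads.fasta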