diff seqclust @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seqclust	Thu Dec 19 10:24:45 2019 -0500
@@ -0,0 +1,774 @@
+#!/usr/bin/env python3
+''' TAndem REpeat ANalyzer  '''
+import os
+import sys
+import shutil
+import subprocess
+import argparse
+from argparse import RawTextHelpFormatter
+import logging
+import shlex
+import multiprocessing
+# config must be loaded before seqtools,...
+import config
+import re
+from lib import seqtools, graphtools, utils, assembly_tools
+from lib import r2py
+
+REQUIRED_VERSION = (3, 4)
+if sys.version_info < REQUIRED_VERSION:
+    raise Exception("\n\npython 3.4 or higher is required!\n")
+
+# prepend paths to louvain clustering and other binaries
+os.environ['PATH'] = "{}:{}:{}".format(config.BINARIES, config.LOUVAIN,
+                                       os.environ['PATH'])
+
+LOGGER = logging.getLogger(__name__)
+
+
+def get_version(path, tarean_mode):
+    try:
+        branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
+                                         shell=True,
+                                         cwd=path).decode('ascii').strip()
+        shorthash = subprocess.check_output(
+            "git log --pretty=format:'%h' -n 1  ",
+            shell=True,
+            cwd=path).decode('ascii').strip()
+        revcount = len(subprocess.check_output(
+            "git log --oneline", shell=True,
+            cwd=path).decode('ascii').split())
+        try:
+            tag = subprocess.check_output("git describe --tags --abbrev=0",
+                                          cwd=path,
+                                          shell=True).decode('ascii').strip()
+        except subprocess.CalledProcessError:
+            tag = " "
+
+        version_string = (
+            "-------------------------------------"
+            "-------------------------------------\n"
+            "PIPELINE VERSION         : "
+            "{branch}-{tag}-{revcount}({shorthash})\n\n"
+            "PROTEIN DATABASE VERSION : {PD}\n"
+            "            md5 checksum : {PDmd5}\n\n"
+            "DNA DATABASE VERSION     : {DD}\n"
+            "            md5 checksum : {DDmd5}\n"
+            "-------------------------------------"
+            "-------------------------------------\n").format(
+                branch=branch,
+                shorthash=shorthash,
+                revcount=revcount,
+                tag=tag,
+                PD=os.path.basename(config.PROTEIN_DATABASE),
+                PDmd5=utils.md5checksum(config.PROTEIN_DATABASE + ".psq",
+                                        fail_if_missing=not tarean_mode),
+                DD=os.path.basename(config.DNA_DATABASE),
+                DDmd5=utils.md5checksum(config.DNA_DATABASE + ".nsq"))
+
+    except subprocess.CalledProcessError:
+        version_string = "pipeline version not available!"
+    LOGGER.info(version_string)
+    return version_string
+
+
+def valid_database(database_file):
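+    '''Check that every fasta header in the custom database follows the
+    '>name#classification' convention; an example header might look like
+    (a made-up entry): >ATCOPIA1#Class_I/LTR/Ty1_copia
+    Note: because '*' allows zero occurrences of '/', the pattern below
+    effectively enforces only the '>name#classification' part.
+    '''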
+    with open(database_file, 'r', encoding='ascii') as f:
+        for i in f:
+            if i[0] == ">":
+                if not re.match(">.+#.+/*", i):
+                    # TODO - make edits to correct formatting of custom database???
+                    return False
+    return True
+
+
+def add_databases(databases, custom_databases_dir, dbtype='nucl'):
+    '''custom databases are copied to directory tree and blast
+    database is created using makeblastdb
+    '''
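+    # 'databases' arrives from the '-d/--database' option (nargs=2,
+    # action='append') as a list of [fasta_path, name] pairs, e.g.
+    # (hypothetical): [['/data/my_repeats.fasta', 'my_repeats']]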
+
+    databases_ok = []
+    LOGGER.info(databases)
+    for db_path, db_name in databases:
+        db_destination = "{}/{}".format(custom_databases_dir, db_name)
+        shutil.copyfile(db_path, db_destination)
+        if not valid_database(db_destination):
+            raise ValueError((
+                "\n------------------------------------------------------------\n"
+                "Custom database is not valid!\n"
+                "Custom database of repeats are DNA sequences in fasta format.\n"
+                "The required format for IDs in a custom library is : \n"
+                "   '>reapeatname#class/subclass'\n"
+                "Reformat the database and try again!\n"
+                "-------------------------------------------------------------\n\n"
+            ))
+
+        cmd = "makeblastdb -in {0} -out {0} -dbtype {1}".format(db_destination,
+                                                                dbtype)
+        LOGGER.info(cmd)
+        args = shlex.split(cmd)
+        # a non-zero exit status means makeblastdb failed - warn and skip this db
+        if subprocess.call(args, stderr=sys.stdout):
+            LOGGER.warning("makeblastdb on %s failed", db_name)
+        else:
+            databases_ok.append([db_destination, "custom_db_" + db_name])
+    if len(databases_ok) == 0:
+        return None
+    else:
+        return databases_ok
+
+
+def meminfo():
+    ''' detect physical memory and memory usage'''
+    info = {}
+    required_fields = [
+        'MemTotal:', 'MemFree:', 'Cached:', 'SwapCached:', 'Buffers:'
+    ]
+    with open('/proc/meminfo', 'r') as f:
+        for i in f:
+            a = i.split()
+            if a[0] in required_fields:
+                info[a[0]] = int(a[1])
+    return info
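+# note: /proc/meminfo reports sizes in kB, e.g. a typical line reads
+# "MemTotal:       16310528 kB"; kB is also the unit expected by the
+# --max_memory option below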
+
+
+def dict2lists(d):
+    ''' convert dict to nested list
+    use this function to pass a dictionary to an R function
+    '''
+    values = list(d.values())
+    keys = list(d.keys())
+    return [values, keys]
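+# example (illustrative):
+#   dict2lists({'a': 1, 'b': 2})  ->  [[1, 2], ['a', 'b']]
+# keys() and values() iterate in the same order, so the pairing is preserved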
+
+
+def show_object(obj):
+    '''
+    helper function for printing all public attributes,
+    does not print callable attributes, i.e. methods
+    '''
+
+    s = "Configuration--------------->\n"
+    for i in dir(obj):
+        # do not show private
+        if i[:2] != "__":
+            value = getattr(obj, i)
+            if not callable(value):
+                s += "{} : {}\n".format(i, value)
+    s += "<---------------configuration\n"
+    return s
+
+
+class DataInfo():
+    '''
+    stores information about the state of clustering and the input data
+    '''
+
+    def __init__(self, args, paths):
+        LOGGER.info("getting information about input sequences")
+        self.args = args
+        self.working_directory = args.output_dir
+        self.input_sequences = args.sequences.name
+        self.number_of_input_sequences = seqtools.SequenceSet.fasta_length(
+            self.input_sequences)
+        self.paired = args.paired
+        self.prefix_length = args.prefix_length
+        self.physical_memory = meminfo()['MemTotal:']
+        self.edges_max = config.EMAX
+        # set max memory
+        if args.max_memory:
+            self.max_memory = args.max_memory
+        else:
+            self.max_memory = meminfo()["MemTotal:"]
+        # modify initial setup if number of sequences is low
+        if args.automatic_filtering:
+            config.NUMBER_OF_SEQUENCES_FOR_PRERUN = config.NUMBER_OF_SEQUENCES_FOR_PRERUN_WITH_FILTERING
+
+        if self.number_of_input_sequences < config.NUMBER_OF_SEQUENCES_FOR_PRERUN:
+            config.NUMBER_OF_SEQUENCES_FOR_PRERUN = self.number_of_input_sequences
+
+        # is number of input sequences sufficient
+        if self.number_of_input_sequences < config.MINIMUM_NUMBER_OF_INPUT_SEQUENCES:
+            raise WrongInputDataError(
+                "provide more sequences for clustering, minumum {} is .required".format(
+                    config.MINIMUM_NUMBER_OF_INPUT_SEQUENCES))
+        # these attributes will be set later, after clustering is done
+        self.max_annotated_clusters = None
+        self.max_annotated_superclusters = None
+        # these attributes will be set after the prerun is performed
+        self.prerun_ecount = None
+        self.prerun_ecount_corrected = None
+        self.sample_size = None
+        self.max_number_reads_for_clustering = None
+        self.mincln = None
+        self.number_of_omitted_reads = 0
+        LOGGER.info("sampling sequences for prerun analysis")
+        sample = seqtools.SequenceSet(
+            source=self.input_sequences,
+            sample_size=config.NUMBER_OF_SEQUENCES_FOR_PRERUN,
+            paired=self.paired,
+            filename=paths.sample_db,
+            fasta=paths.sample_fasta,
+            rename=True)
+        sample.makeblastdb(legacy=args.options.legacy_database,
+                           lastdb=args.options.lastdb)
+        # preliminary clustering
+        self.prerun_vcount = len(sample)  # number of sequences in the sample
+        self._prerun(sample, paths)
+        # adjust size of chunks:
+        if self.number_of_reads_for_clustering < config.CHUNK_SIZE * 30:
+            config.CHUNK_SIZE = round(self.number_of_reads_for_clustering / 40)
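+            # (this keeps roughly 40 chunks even for small inputs,
+            # presumably so the all-to-all chunk comparison still
+            # parallelizes well)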
+
+    def _prerun(self, sample, paths):
+        '''Preliminary characterization of sequences using clustering
+        on a small dataset stored as sample '''
+        sample.make_chunks(chunk_size=1000)
+        sample.create_hitsort(options=self.args.options)
+        sample_hitsort = graphtools.Graph(source=sample.hitsort,
+                                          paired=self.paired,
+                                          seqids=sample.keys())
+        sample_hitsort.save_indexed_graph()
+        sample_hitsort.louvain_clustering(merge_threshold=0.2)
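+        # note: the prerun always merges with a fixed threshold of 0.2,
+        # independent of the --merge_threshold command-line option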
+        sample_hitsort.export_cls(path=paths.prerun_cls_file)
+        sample.annotate(
+            config.DNA_DATABASE,
+            annotation_name="dna_database",
+            directory=paths.prerun,
+            params=self.args.options.annotation_search_params.blastn)
+
+        selected_tarean_contigs = []
+        ecount_corrected = sample_hitsort.ecount
+        vcount_corrected = sample_hitsort.vcount
+        if self.args.automatic_filtering:
+            prerun_cluster_info = sample_hitsort.export_clusters_files_multiple(
+                min_size=10,
+                directory=paths.prerun_clusters,
+                sequences=sample,
+                tRNA_database_path=config.TRNA_DATABASE,
+                satellite_model_path=config.SATELLITE_MODEL)
+            # check if the prerun contains clusters with a large number of edges
+            # these sequences can be used for filtering
+            for cl in prerun_cluster_info:
+                LOGGER.info("%s %s %s %s", cl.ecount, cl.vcount,
+                            sample_hitsort.ecount, cl.tandem_rank)
+
+                if (cl.tandem_rank in config.TANDEM_RANKS[0:2] and
+                        cl.ecount / sample_hitsort.ecount >
+                        config.FILTER_MIN_PROP_THRESHOLD and
+                        cl.vcount > config.FILTER_MIN_SIZE_THRESHOLD):
+                    selected_tarean_contigs.append(cl.tarean_contig_file)
+                    ecount_corrected -= cl.ecount
+                    vcount_corrected -= cl.vcount
+
+        if selected_tarean_contigs:
+            with open(paths.filter_sequences_file, 'w') as out:
+                for fname in selected_tarean_contigs:
+                    with open(fname, 'r') as f:
+                        out.write(f.read())
+            self.sequence_filter = paths.filter_sequences_file
+        else:
+            self.sequence_filter = None
+
+        self.prerun_ecount = sample_hitsort.ecount
+        self.prerun_ecount_corrected = ecount_corrected
+        self.prerun_vcount_corrected = vcount_corrected
+        self.max_number_reads_for_clustering = round((
+            ((self.edges_max * self.max_memory) /
+             self.prerun_ecount_corrected * self.prerun_vcount**2)**(0.5)) / 2)
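+        # rationale (a sketch, assuming config.EMAX expresses how many graph
+        # edges fit into one kB of RAM): edge counts grow roughly with the
+        # square of the number of reads, so scaling up the prerun graph gives
+        #   e_full ~ prerun_ecount_corrected * (n / prerun_vcount)**2
+        # solving e_full <= EMAX * max_memory for n and halving the result
+        # as a safety margin yields the estimate above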
+
+        if self.max_number_reads_for_clustering >= self.number_of_input_sequences:
+            self.sample_size = 0
+        else:
+            self.sample_size = self.max_number_reads_for_clustering
+
+        n1 = self.sample_size if self.sample_size != 0 else self.number_of_input_sequences
+        n2 = self.args.sample if self.args.sample != 0 else self.number_of_input_sequences
+        self.number_of_reads_for_clustering = min(n1, n2)
+        # mincln is set either from mincl or from the value specified in
+        # config, whichever is higher
+        self.mincln = int(self.number_of_reads_for_clustering *
+                          self.args.mincl / 100)
+        if self.mincln < config.MINIMUM_NUMBER_OF_READS_IN_CLUSTER:
+            self.mincln = config.MINIMUM_NUMBER_OF_READS_IN_CLUSTER
+
+    def __str__(self):
+        s = "Data info------------------->\n"
+        for i in dir(self):
+            # do not show private
+            if i[:2] != "__":
+                value = getattr(self, i)
+                if not callable(value):
+                    s += "{} : {}\n".format(i, value)
+        s += "<----------------------Data info\n"
+        return s
+
+
+class DataFiles(object):
+    '''
+    stores locations of data files and creates directories ...
+    attributes are:
+    - individual directories
+    - individual files
+    - lists of files or directories
+
+    directories are created if they do not exist
+    '''
+
+    def __init__(self, working_dir, subdirs, files):
+        LOGGER.info("creating directory structure")
+        self.working_dir = working_dir
+        # add and create directories paths
+        for i in subdirs:
+            d = os.path.join(self.working_dir, subdirs[i])
+            os.makedirs(d, exist_ok=True)
+            setattr(self, i, d)
+            setattr(self, i + "__relative", subdirs[i])
+        # add file paths
+        for i in files:
+            d = os.path.join(self.working_dir, files[i])
+            setattr(self, i, d)
+            setattr(self, i + "__relative", files[i])
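+        # illustrative (key names are hypothetical): subdirs={'clusters':
+        # 'seqclust/clusters'} yields self.clusters set to the absolute
+        # '<working_dir>/seqclust/clusters' and self.clusters__relative
+        # set to 'seqclust/clusters'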
+
+    def __str__(self):
+        s = ""
+        for i in dir(self):
+            # do not show private
+            if i[:2] != "__":
+                value = getattr(self, i)
+                if not callable(value):
+                    s += "{} : {}\n".format(i, value)
+        return s
+
+    def as_list(self):
+        '''
+        convert attributes and values to a list - suitable for passing values to R functions
+        '''
+        values = list()
+        keys = list()
+        for i in dir(self):
+            # do not show private
+            if i[:2] != "__":
+                value = getattr(self, i)
+                if not callable(value):
+                    values.append(value)
+                    keys.append(i)
+        return [values, keys]
+
+    def cleanup(self, paths):
+        ''' will remove unnecessary files from working directory '''
+        for i in paths:
+            fn = getattr(self, i)
+            if os.path.exists(fn):
+                if os.path.isdir(fn):
+                    shutil.rmtree(fn, ignore_errors=False)
+                else:
+                    os.remove(fn)
+
+
+class WrongInputDataError(Exception):
+    '''Custom exception for wrong input
+    '''
+
+    def __init__(self, arg):
+        super(WrongInputDataError, self).__init__(arg)
+        self.msg = arg
+
+
+class Range():
+    '''
+    This class is used to check float range in argparse
+    '''
+
+    def __init__(self, start, end):
+        self.start = start
+        self.end = end
+
+    def __eq__(self, other):
+        return self.start <= other <= self.end
+
+    def __str__(self):
+        return "float range {}..{}".format(self.start, self.end)
+
+    def __repr__(self):
+        return "float range {}..{}".format(self.start, self.end)
+
+
+class DirectoryType(object):
+    '''
+    this class is similar to argparse.FileType;
+    for mode 'w' it creates the directory and checks write access,
+    for mode 'r' it checks the presence of the directory and its accessibility
+    '''
+
+    def __init__(self, mode='r'):
+        self._mode = mode
+
+    def __call__(self, string):
+        if self._mode == 'w':
+            try:
+                os.makedirs(string, exist_ok=True)
+            except FileExistsError:
+                raise argparse.ArgumentTypeError(
+                    "Cannot create directory, '{}' is a file".format(string))
+            if os.access(string, os.W_OK):
+                return string
+            else:
+                raise argparse.ArgumentTypeError(
+                    "Directory '{}' is not writable".format(string))
+        if self._mode == 'r':
+            if not os.path.isdir(string):
+                raise argparse.ArgumentTypeError(
+                    "'{}' is not a directory".format(string))
+            if os.access(string, os.R_OK):
+                return string
+            else:
+                raise argparse.ArgumentTypeError(
+                    "Directory '{}' is not readable".format(string))
+
+
+def get_cmdline_args():
+    '''seqclust command line parser'''
+
+    description = """RepeatExplorer:
+    Repetitive sequence discovery and classification from NGS data
+
+    """
+
+    # arguments parsing
+    parser = argparse.ArgumentParser(description=description,
+                                     formatter_class=RawTextHelpFormatter)
+    parser.add_argument('-p', '--paired', action='store_true', default=False)
+    parser.add_argument('-A',
+                        '--automatic_filtering',
+                        action='store_true',
+                        default=False)
+    parser.add_argument(
+        '-t',
+        '--tarean_mode',
+        action='store_true',
+        default=False,
+        help="analyze only tandem reapeats without additional classification")
+    parser.add_argument('sequences', type=argparse.FileType('r'))
+    parser.add_argument('-l',
+                        '--logfile',
+                        type=argparse.FileType('w'),
+                        default=None,
+                        help='log file; logging goes to stdout if not defined')
+    parser.add_argument('-m',
+                        '--mincl',
+                        type=float,
+                        choices=[Range(0.0, 100.0)],
+                        default=0.01)
+    parser.add_argument(
+        '-M',
+        '--merge_threshold',
+        type=float,
+        choices=[0, Range(0.1, 1)],
+        default=0,
+        help=
+        "threshold for mate-pair based cluster merging, default 0 - no merging")
+    parser.add_argument(
+        '-o',
+        '--min_lcov',
+        type=float,
+        choices=[Range(30.0, 80.0)],
+        default=55,
+        help=
+        "minimal overlap coverage - relative to longer sequence length, default 55")
+    parser.add_argument('-c',
+                        '--cpu',
+                        type=int,
+                        default=int(os.environ.get('TAREAN_CPU', 0)),
+                        help="number of cpu to use, if 0 use max available")
+    parser.add_argument(
+        '-s',
+        '--sample',
+        type=int,
+        default=0,
+        help="use only sample of input data[by default max reads is used")
+    parser.add_argument(
+        '-P',
+        '--prefix_length',
+        type=int,
+        default=0,
+        help=("If you wish to keep part of the sequences name,\n"
+              " enter the number of characters which should be \n"
+              "kept (1-10) instead of zero. Use this setting if\n"
+              " you are doing comparative analysis"))
+    parser.add_argument('-v',
+                        '--output_dir',
+                        type=DirectoryType('w'),
+                        default="clustering_results")
+    parser.add_argument(
+        '-r',
+        '--max_memory',
+        type=int,
+        default=int(os.environ.get('TAREAN_MAX_MEM', 0)),
+        help=("Maximal amount of available RAM in kB if not set\n"
+              "clustering tries to use whole available RAM"))
+    parser.add_argument(
+        '-d',
+        '--database',
+        default=None,
+        help="fasta file with database for annotation and name of database",
+        nargs=2,
+        action='append')
+
+    parser.add_argument(
+        "-C",
+        "--cleanup",
+        default=False,
+        action="store_true",
+        help="remove unncessary large files from working directory")
+
+    parser.add_argument(
+        "-k",
+        "--keep_names",
+        default=False,
+        action="store_true",
+        help="keep sequence names, by default sequences are renamed")
+
+    parser.add_argument(
+        '-a', '--assembly_min',
+        default=5, type=int,
+        choices=[2,3,4,5],
+        help=('Assembly is performed on individual clusters; by default\n'
+              'clusters with fewer than 5 reads are not assembled. If you\n'
+              'need assembly of smaller clusters, set *assembly_min*\n'
+              'accordingly\n')
+    )
+
+    parser.add_argument('-tax',
+                        '--taxon',
+                        default=config.PROTEIN_DATABASE_DEFAULT,
+                        choices=list(config.PROTEIN_DATABASE_OPTIONS.keys()),
+                        help="Select taxon and protein database version"
+    )
+
+    parser.add_argument(
+        '-opt',
+        '--options',
+        default="ILLUMINA",
+        choices=['ILLUMINA','ILLUMINA_DUST_OFF', 'ILLUMINA_SHORT', 'OXFORD_NANOPORE'])
+
+    parser.add_argument(
+        '-D',
+        '--domain_search',
+        default="BLASTX_W3",
+        choices=['BLASTX_W2', 'BLASTX_W3', 'DIAMOND'],
+        help=
+        ('Detection of protein domains can be performed by either the blastx\n'
+         'or diamond program. Options are:\n'
+         '  BLASTX_W2 - blastx with word size 2 (slowest, the most sensitive)\n'
+         '  BLASTX_W3 - blastx with word size 3 (default)\n'
+         '  DIAMOND   - diamond program (significantly faster, less sensitive)\n'
+         'To use the DIAMOND option, the diamond program must be installed in your PATH'))
+
+    args = parser.parse_args()
+
+    # convert option string to namedtuple of options
+    args.options = getattr(config, args.options)
+    # set protein database
+    args.options = args.options._replace(
+        annotation_search_params=
+        args.options.annotation_search_params._replace(blastx=getattr(
+            config, args.domain_search)))
+    return args
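+# example invocation (file and directory names are made up):
+#   seqclust --paired --cpu 8 --output_dir re_output reads.fasta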
+
+
+def main():
+    '''
+    Perform graph based clustering
+    '''
+    # argument parsing:
+    args = get_cmdline_args()
+    config.ARGS = args
+    logfile = args.logfile.name if args.logfile else None
+    logging.basicConfig(
+        filename=logfile,
+        format='\n%(asctime)s - %(name)s - %(levelname)s -\n%(message)s\n',
+        level=logging.INFO)
+    config.PROTEIN_DATABASE, config.CLASSIFICATION_HIERARCHY = config.PROTEIN_DATABASE_OPTIONS[
+        args.taxon]
+    pipeline_version_info = get_version(config.MAIN_DIR,
+                                        tarean_mode=args.tarean_mode)
+    # number of CPUs to use
+    config.PROC = args.cpu if args.cpu != 0 else multiprocessing.cpu_count()
+    # TODO add kmer range specification to config - based on the technology
+    r2py.create_connection()
+    try:
+        reporting = r2py.R(config.RSOURCE_reporting, verbose=True)
+        create_annotation = r2py.R(config.RSOURCE_create_annotation,
+                                   verbose=True)
+        LOGGER.info(args)
+        paths = DataFiles(working_dir=args.output_dir,
+                          subdirs=config.DIRECTORY_TREE,
+                          files=config.FILES)
+        # files to be included in output
+        for src, dest in config.INCLUDE:
+            shutil.copy(src, os.path.join(paths.working_dir, dest))
+        # getting information about the data
+        run_info = DataInfo(args, paths)
+        LOGGER.info(run_info)
+        LOGGER.info(show_object(config))
+        # load all sequences or sample
+        sequences = seqtools.SequenceSet(
+            source=run_info.input_sequences,
+            sample_size=run_info.number_of_reads_for_clustering,
+            paired=run_info.paired,
+            filename=paths.sequences_db,
+            fasta=paths.sequences_fasta,
+            prefix_length=run_info.prefix_length,
+            rename=not run_info.args.keep_names)
+        if run_info.sequence_filter:
+            n = sequences.remove_sequences_using_filter(
+                run_info.sequence_filter,
+                keep_proportion=config.FILTER_PROPORTION_OF_KEPT,
+                omitted_sequences_file=paths.filter_omitted,
+                kept_sequences_file=paths.filter_kept
+            )
+            run_info.number_of_omitted_reads = n
+        # add custom databases if provided
+        if args.database:
+            config.CUSTOM_DNA_DATABASE = add_databases(
+                args.database,
+                custom_databases_dir=paths.custom_databases)
+        sequences.makeblastdb(legacy=args.options.legacy_database,
+                              lastdb=args.options.lastdb)
+        LOGGER.info("chunksize: {}".format(config.CHUNK_SIZE))
+        sequences.make_chunks(chunk_size=config.CHUNK_SIZE)
+        sequences.create_hitsort(output=paths.hitsort, options=args.options)
+        hitsort = graphtools.Graph(filename=paths.hitsort_db,
+                                   source=paths.hitsort,
+                                   paired=run_info.paired,
+                                   seqids=sequences.keys())
+
+        LOGGER.info('hitsort with {} reads and {} edges loaded.'.format(
+            hitsort.vcount, hitsort.ecount))
+
+        hitsort.save_indexed_graph()
+        LOGGER.info('hitsort index created.')
+
+        hitsort.louvain_clustering(merge_threshold=args.merge_threshold,
+                                   cleanup=args.cleanup)
+        hitsort.export_cls(path=paths.cls_file)
+        hitsort.adjust_cluster_size(config.FILTER_PROPORTION_OF_KEPT,
+                                    sequences.ids_kept)
+        sequences.annotate(config.DNA_DATABASE,
+                           annotation_name="dna_database",
+                           directory=paths.blastn,
+                           params=args.options.annotation_search_params.blastn)
+
+        if config.CUSTOM_DNA_DATABASE:
+            LOGGER.info('annotating with custom database')
+            for db, db_name in config.CUSTOM_DNA_DATABASE:
+                sequences.annotate(
+                    db,
+                    annotation_name=db_name,
+                    directory=paths.blastn,
+                    params=args.options.annotation_search_params.blastn)
+
+        if not args.tarean_mode:
+            # additional analyses - full RE run
+            # this must be finished before creating clusters_info
+            sequences.annotate(
+                config.PROTEIN_DATABASE,
+                annotation_name="protein_database",
+                directory=paths.blastx,
+                params=args.options.annotation_search_params.blastx)
+
+        LOGGER.info('creating cluster graphs')
+        clusters_info = hitsort.export_clusters_files_multiple(
+            min_size=run_info.mincln,
+            directory=paths.clusters,
+            sequences=sequences,
+            tRNA_database_path=config.TRNA_DATABASE,
+            satellite_model_path=config.SATELLITE_MODEL)
+        if not args.tarean_mode:
+            LOGGER.info("assembling..")
+            assembly_tools.assembly(sequences,
+                                    hitsort,
+                                    clusters_info,
+                                    assembly_dir=paths.assembly,
+                                    contigs_file=paths.contigs,
+                                    min_size_of_cluster_for_assembly=args.assembly_min)
+
+            LOGGER.info("detecting LTR in assembly..")
+            for i in clusters_info:
+                i.detect_ltr(config.TRNA_DATABASE)
+
+        run_info.max_annotated_clusters = max([i.index for i in clusters_info])
+        run_info.max_annotated_superclusters = max([i.supercluster
+                                                    for i in clusters_info])
+        # make reports
+        cluster_listing = [i.listing() for i in clusters_info]
+        # make path relative to paths.cluster_info
+        utils.save_as_table(cluster_listing, paths.clusters_info)
+        # creates table cluster_info in hitsort database
+        graphtools.Cluster.add_cluster_table_to_database(cluster_listing,
+                                                         paths.hitsort_db)
+        # export files for consensus sequences, one for each rank
+        consensus_files = []
+        for i in config.TANDEM_RANKS:
+            consensus_files.append(utils.export_tandem_consensus(
+                clusters_info,
+                path=paths.TR_consensus_fasta.format(i),
+                rank=i))
+
+        if not args.tarean_mode:
+            LOGGER.info("Creating report for superclusters")
+            create_annotation.create_all_superclusters_report(
+                max_supercluster=run_info.max_annotated_superclusters,
+                paths=paths.as_list(),
+                libdir=paths.libdir,
+                superclusters_dir=paths.superclusters,
+                seqdb=paths.sequences_db,
+                hitsortdb=paths.hitsort_db,
+                classification_hierarchy_file=config.CLASSIFICATION_HIERARCHY,
+                HTML_LINKS=dict2lists(config.HTML_LINKS))
+
+            LOGGER.info("Creating report for individual clusters")
+            for cluster in clusters_info:
+                create_annotation.create_cluster_report(
+                    cluster.index,
+                    seqdb=paths.sequences_db,
+                    hitsortdb=paths.hitsort_db,
+                    classification_hierarchy_file=
+                    config.CLASSIFICATION_HIERARCHY,
+                    HTML_LINKS=dict2lists(config.HTML_LINKS))
+
+        LOGGER.info("Creating main html report")
+        reporting.create_main_reports(
+            paths=paths.as_list(),
+            N_clustering=run_info.number_of_reads_for_clustering,
+            N_input=run_info.number_of_input_sequences,
+            N_omit=run_info.number_of_omitted_reads,
+            merge_threshold=args.merge_threshold,
+            paired=run_info.paired,
+            consensus_files=consensus_files,
+            custom_db=bool(config.CUSTOM_DNA_DATABASE),
+            tarean_mode=args.tarean_mode,
+            HTML_LINKS=dict2lists(config.HTML_LINKS),
+            pipeline_version_info=pipeline_version_info,
+            max_memory=run_info.max_memory,
+            max_number_reads_for_clustering=run_info.max_number_reads_for_clustering,
+            mincln=run_info.mincln
+        )
+
+        LOGGER.info("Html report reports created")
+
+    except:
+        r2py.shutdown(config.RSERVE_PORT)
+        raise
+    finally:
+        if args.cleanup:
+            paths.cleanup(config.FILES_TO_DISCARD_AT_CLEANUP)
+        else:
+            LOGGER.info("copy databases to working directory")
+            shutil.copy(paths.sequences_db, paths.working_dir)
+            shutil.copy(paths.hitsort_db, paths.working_dir)
+        # copy log file inside working directory
+        if logfile:
+            shutil.copyfile(logfile, paths.logfile)
+
+
+if __name__ == "__main__":
+    main()