petr-novak / repeatrxplorer (Mercurial repository)
diff seqclust @ 0:1d1b9e1b2e2f draft
Uploaded
| author   | petr-novak                      |
|----------|---------------------------------|
| date     | Thu, 19 Dec 2019 10:24:45 -0500 |
| parents  |                                 |
| children |                                 |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/seqclust Thu Dec 19 10:24:45 2019 -0500 @@ -0,0 +1,774 @@ +#!/usr/bin/env python3 +''' TAndem REpeat ANalyzer ''' +import os +import sys +import shutil +import subprocess +import argparse +from argparse import RawTextHelpFormatter +import logging +import shlex +import multiprocessing +# config must be loaded before seqtools,... +import config +import re +from lib import seqtools, graphtools, utils, assembly_tools +from lib import r2py + +REQUIRED_VERSION = (3, 4) +if sys.version_info < REQUIRED_VERSION: + raise Exception("\n\npython 3.4 or higher is required!\n") + +# append path to louvain clustering and other binaries +os.environ['PATH'] = "{}:{}:{}".format(config.BINARIES, config.LOUVAIN, + os.environ['PATH']) + +LOGGER = logging.getLogger(__name__) + + +def get_version(path, tarean_mode): + try: + branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD", + shell=True, + cwd=path).decode('ascii').strip() + shorthash = subprocess.check_output( + "git log --pretty=format:'%h' -n 1 ", + shell=True, + cwd=path).decode('ascii').strip() + revcount = len(subprocess.check_output( + "git log --oneline", shell=True, + cwd=path).decode('ascii').split()) + try: + tag = subprocess.check_output("git describe --tags --abbrev=0", + cwd=path, + shell=True).decode('ascii').strip() + except subprocess.CalledProcessError: + tag = " " + + version_string = ( + "-------------------------------------" + "-------------------------------------\n" + "PIPELINE VERSION : " + "{branch}-{tag}-{revcount}({shorthash})\n\n" + "PROTEIN DATABASE VERSION : {PD}\n" + " md5 checksum : {PDmd5}\n\n" + "DNA DATABASE VERSION : {DD}\n" + " md5 checksum : {DDmd5}\n" + "-------------------------------------" + "-------------------------------------\n").format( + branch=branch, + shorthash=shorthash, + revcount=revcount, + tag=tag, + PD=os.path.basename(config.PROTEIN_DATABASE), + PDmd5=utils.md5checksum(config.PROTEIN_DATABASE + ".psq", fail_if_missing = not tarean_mode), + DD=os.path.basename(config.DNA_DATABASE), + DDmd5=utils.md5checksum(config.DNA_DATABASE + ".nsq")) + + except subprocess.CalledProcessError: + version_string = "version of pipeline not available!" + LOGGER.info(version_string) + return version_string + + +def valid_database(database_file): + with open(database_file, 'r', encoding='ascii') as f: + for i in f: + if i[0] == ">": + if not re.match(">.+#.+/*", i): + # TODO - make edits to correct fomating of custom database??? 
+ return False + return True + + +def add_databases(databases, custom_databases_dir, dbtype='nucl'): + '''custom databases are copied to directory tree and blast + database is created using makeblastdb + ''' + + databases_ok = [] + print(databases) + for db_path, db_name in databases: + db_destination = "{}/{}".format(custom_databases_dir, db_name) + shutil.copyfile(db_path, db_destination) + if not valid_database(db_destination): + raise ValueError(( + "\n------------------------------------------------------------\n" + "Custom database is not valid!\n" + "Custom database of repeats are DNA sequences in fasta format.\n" + "The required format for IDs in a custom library is : \n" + " '>reapeatname#class/subclass'\n" + "Reformat the database and try again!\n" + "-------------------------------------------------------------\n\n" + )) + + cmd = "makeblastdb -in {0} -out {0} -dbtype {1}".format(db_destination, + dbtype) + print(cmd) + args = shlex.split(cmd) + print(args) + if subprocess.check_call(args, stderr=sys.stdout): + Warning("makeblastdb on {} failed".format(db_name)) + else: + databases_ok.append([db_destination, "custom_db_" + db_name]) + if len(databases_ok) == 0: + return None + else: + return databases_ok + + +def meminfo(): + ''' detect physical memory and memory usage''' + info = {} + required_fields = [ + 'MemTotal:', 'MemFree:', 'Cached:', 'SwapCached:', 'Buffers:' + ] + with open('/proc/meminfo', 'r') as f: + for i in f: + a = i.split() + if a[0] in required_fields: + info[a[0]] = int(a[1]) + return info + + +def dict2lists(d): + ''' convert dict to nested list + use the funsction to pass dictionary to R function + ''' + values = list(d.values()) + keys = list(d.keys()) + return [values, keys] + + +def show_object(obj): + ''' + helper function for printing all public atributes, + does not print callebme atributes e.i. methods.. 
+ ''' + + s = "Configuration--------------->\n" + for i in dir(obj): + # do not show private + if i[:2] != "__": + value = getattr(obj, i) + if not callable(value): + s += "{} : {}\n".format(i, value) + s += "<---------------configuration\n" + return s + + +class DataInfo(): + ''' + stores information state of clustering and data + ''' + + def __init__(self, args, paths): + LOGGER.info("getting information about input sequences") + self.args = args + self.working_directory = args.output_dir + self.input_sequences = args.sequences.name + self.number_of_input_sequences = seqtools.SequenceSet.fasta_length( + self.input_sequences) + self.paired = args.paired + self.prefix_length = args.prefix_length + self.physical_memory = meminfo()['MemTotal:'] + self.edges_max = config.EMAX + # set max memory + if args.max_memory: + self.max_memory = args.max_memory + else: + self.max_memory = meminfo()["MemTotal:"] + # modify initial setup if number of sequences is low + if args.automatic_filtering: + config.NUMBER_OF_SEQUENCES_FOR_PRERUN = config.NUMBER_OF_SEQUENCES_FOR_PRERUN_WITH_FILTERING + + if self.number_of_input_sequences < config.NUMBER_OF_SEQUENCES_FOR_PRERUN: + config.NUMBER_OF_SEQUENCES_FOR_PRERUN = self.number_of_input_sequences + + # is number of input sequences sufficient + if self.number_of_input_sequences < config.MINIMUM_NUMBER_OF_INPUT_SEQUENCES: + raise WrongInputDataError( + "provide more sequences for clustering, minumum {} is .required".format( + config.MINIMUM_NUMBER_OF_INPUT_SEQUENCES)) + # these atribudes will be set later after clustering is done + self.max_annotated_clusters = None + self.max_annotated_superclusters = None + # the atributes will be set after prerun is performed + self.prerun_ecount = None + self.prerun_ecount_corrected = None + self.sample_size = None + self.max_number_reads_for_clustering = None + self.mincln = None + self.number_of_omitted_reads = 0 + LOGGER.info("sampling sequences for prerun analysis") + sample = seqtools.SequenceSet( + source=self.input_sequences, + sample_size=config.NUMBER_OF_SEQUENCES_FOR_PRERUN, + paired=self.paired, + filename=paths.sample_db, + fasta=paths.sample_fasta, + rename=True) + sample.makeblastdb(legacy=args.options.legacy_database, lastdb=args.options.lastdb) + # preliminary clustering + self.prerun_vcount = len(sample) + # line count + self._prerun(sample, paths) + # adjust size of chunks: + if self.number_of_reads_for_clustering < config.CHUNK_SIZE * 30: + config.CHUNK_SIZE = round(self.number_of_reads_for_clustering / 40) + + def _prerun(self, sample, paths): + '''Preliminary characterization sequences using + clustering on small dataset - stored as sample ''' + sample.make_chunks(chunk_size=1000) + sample.create_hitsort(options=self.args.options) + sample_hitsort = graphtools.Graph(source=sample.hitsort, + paired=self.paired, + seqids=sample.keys()) + sample_hitsort.save_indexed_graph() + sample_hitsort.louvain_clustering(merge_threshold=0.2) + sample_hitsort.export_cls(path=paths.prerun_cls_file) + sample.annotate( + config.DNA_DATABASE, + annotation_name="dna_database", + directory=paths.prerun, + params=self.args.options.annotation_search_params.blastn) + + selected_tarean_contigs = [] + ecount_corrected = sample_hitsort.ecount + vcount_corrected = sample_hitsort.vcount + if self.args.automatic_filtering: + prerun_cluster_info = sample_hitsort.export_clusters_files_multiple( + min_size=10, + directory=paths.prerun_clusters, + sequences=sample, + tRNA_database_path=config.TRNA_DATABASE, + 
satellite_model_path=config.SATELLITE_MODEL) + # check of prerun contain clusters with large number of edges + # these sequences can be used for filtering + for cl in prerun_cluster_info: + print(cl.ecount, cl.vcount, sample_hitsort.ecount, + cl.tandem_rank) + + if (cl.tandem_rank in config.TANDEM_RANKS[0:2] and + cl.ecount / sample_hitsort.ecount > + config.FILTER_MIN_PROP_THRESHOLD and + cl.vcount > config.FILTER_MIN_SIZE_THRESHOLD): + selected_tarean_contigs.append(cl.tarean_contig_file) + ecount_corrected -= cl.ecount + vcount_corrected -= cl.vcount + + if selected_tarean_contigs: + with open(paths.filter_sequences_file, 'w') as out: + for fname in selected_tarean_contigs: + with open(fname, 'r') as f: + out.write(f.read()) + self.sequence_fiter = paths.filter_sequences_file + else: + self.sequence_fiter = None + + self.prerun_ecount = sample_hitsort.ecount + self.prerun_ecount_corrected = ecount_corrected + self.prerun_vcount_corrected = vcount_corrected + self.max_number_reads_for_clustering = round(( + ((self.edges_max * self.max_memory) / + self.prerun_ecount_corrected * self.prerun_vcount**2)**(0.5)) / 2) + + if self.max_number_reads_for_clustering >= self.number_of_input_sequences: + self.sample_size = 0 + else: + self.sample_size = self.max_number_reads_for_clustering + + n1 = self.sample_size if self.sample_size != 0 else self.number_of_input_sequences + n2 = self.args.sample if self.args.sample != 0 else self.number_of_input_sequences + self.number_of_reads_for_clustering = min(n1, n2) + # minlcn is set either based on mincl or value specified in config, + # whatever is higher + self.mincln = int(self.number_of_reads_for_clustering * + self.args.mincl / 100) + if self.mincln < config.MINIMUM_NUMBER_OF_READS_IN_CLUSTER: + self.mincln = config.MINIMUM_NUMBER_OF_READS_IN_CLUSTER + + def __str__(self): + s = "Data info------------------->\n" + for i in dir(self): + # do not show private + if i[:2] != "__": + value = getattr(self, i) + if not callable(value): + s += "{} : {}\n".format(i, value) + s += "<----------------------Data info\n" + return s + + +class DataFiles(object): + ''' + stores location of data files and create directories ... 
+ atributes are: + - individual directories + - individual files + - list of files or directories + + directories are created if does not exist + ''' + + def __init__(self, working_dir, subdirs, files): + LOGGER.info("creating directory structure") + self.working_dir = working_dir + # add and create directories paths + for i in subdirs: + d = os.path.join(self.working_dir, subdirs[i]) + os.makedirs(d, exist_ok=True) + setattr(self, i, d) + setattr(self, i + "__relative", subdirs[i]) + # add file paths + for i in files: + d = os.path.join(self.working_dir, files[i]) + setattr(self, i, d) + setattr(self, i + "__relative", files[i]) + + def __str__(self): + s = "" + for i in dir(self): + # do not show private + if i[:2] != "__": + value = getattr(self, i) + if not callable(value): + s += "{} : {}\n".format(i, value) + return s + + def as_list(self): + ''' + convert attr and vaues to list - suitable for passing values to R functions + ''' + values = list() + keys = list() + for i in dir(self): + # do not show private + if i[:2] != "__": + value = getattr(self, i) + if not callable(value): + values.append(value) + keys.append(i) + return [values, keys] + + def cleanup(self, paths): + ''' will remove unnecessary files from working directory ''' + for i in paths: + fn = getattr(self, i) + if os.path.exists(fn): + if os.path.isdir(fn): + shutil.rmtree(fn, ignore_errors=False) + else: + os.remove(fn) + + +class WrongInputDataError(Exception): + '''Custom exception for wrong input + ''' + + def __init__(self, arg): + super(WrongInputDataError, self).__init__(arg) + self.msg = arg + + +class Range(): + ''' + This class is used to check float range in argparse + ''' + + def __init__(self, start, end): + self.start = start + self.end = end + + def __eq__(self, other): + return self.start <= other <= self.end + + def __str__(self): + return "float range {}..{}".format(self.start, self.end) + + def __repr__(self): + return "float range {}..{}".format(self.start, self.end) + + +class DirectoryType(object): + ''' + this class is similar to argparse.FileType + for mode 'w' creates and check the access to the directory + for mode 'r' check the presence of the dictory and accesibility + ''' + + def __init__(self, mode='r'): + self._mode = mode + + def __call__(self, string): + if self._mode == 'w': + try: + os.makedirs(string, exist_ok=True) + except FileExistsError: + raise argparse.ArgumentTypeError( + "Cannot create directory, '{}' is a file".format(string)) + if os.access(string, os.W_OK): + return string + else: + raise argparse.ArgumentTypeError( + "Directory '{}' is not writable".format(string)) + if self._mode == 'r': + if not os.path.isdir(string): + raise argparse.ArgumentTypeError( + "'{}' is not a directory".format(string)) + if os.access(string, os.R_OK): + return string + else: + raise argparse.ArgumentTypeError( + "Directory '{}' is not readable".format(string)) + + +def get_cmdline_args(): + '''seqclust command line parser''' + + description = """RepeatExplorer: + Repetitive sequence discovery and clasification from NGS data + + """ + + # arguments parsing + parser = argparse.ArgumentParser(description=description, + formatter_class=RawTextHelpFormatter) + parser.add_argument('-p', '--paired', action='store_true', default=False) + parser.add_argument('-A', + '--automatic_filtering', + action='store_true', + default=False) + parser.add_argument( + '-t', + '--tarean_mode', + action='store_true', + default=False, + help="analyze only tandem reapeats without additional classification") + 
parser.add_argument('sequences', type=argparse.FileType('r')) + parser.add_argument('-l', + '--logfile', + type=argparse.FileType('w'), + default=None, + help='log file, logging goes to stdout if not defines') + parser.add_argument('-m', + '--mincl', + type=float, + choices=[Range(0.0, 100.0)], + default=0.01) + parser.add_argument( + '-M', + '--merge_threshold', + type=float, + choices=[0, Range(0.1, 1)], + default=0, + help= + "threshold for mate-pair based cluster merging, default 0 - no merging") + parser.add_argument( + '-o', + '--min_lcov', + type=float, + choices=[Range(30.0, 80.0)], + default=55, + help= + "minimal overlap coverage - relative to longer sequence length, default 55") + parser.add_argument('-c', + '--cpu', + type=int, + default=int(os.environ.get('TAREAN_CPU', 0)), + help="number of cpu to use, if 0 use max available") + parser.add_argument( + '-s', + '--sample', + type=int, + default=0, + help="use only sample of input data[by default max reads is used") + parser.add_argument( + '-P', + '--prefix_length', + type=int, + default=0, + help=("If you wish to keep part of the sequences name,\n" + " enter the number of characters which should be \n" + "kept (1-10) instead of zero. Use this setting if\n" + " you are doing comparative analysis")) + parser.add_argument('-v', + '--output_dir', + type=DirectoryType('w'), + default="clustering_results") + parser.add_argument( + '-r', + '--max_memory', + type=int, + default=int(os.environ.get('TAREAN_MAX_MEM', 0)), + help=("Maximal amount of available RAM in kB if not set\n" + "clustering tries to use whole available RAM")) + parser.add_argument( + '-d', + '--database', + default=None, + help="fasta file with database for annotation and name of database", + nargs=2, + action='append') + + parser.add_argument( + "-C", + "--cleanup", + default=False, + action="store_true", + help="remove unncessary large files from working directory") + + parser.add_argument( + "-k", + "--keep_names", + default=False, + action="store_true", + help="keep sequence names, by default sequences are renamed") + + parser.add_argument( + '-a', '--assembly_min', + default=5, type=int, + choices=[2,3,4,5], + help=('Assembly is performed on individual clusters, by default \n' + 'clusters with size less then 5 are not assembled. If you \n' + 'want need assembly of smaller cluster set *assmbly_min* \n' + 'accordingly\n') + ) + + parser.add_argument('-tax', + '--taxon', + default=config.PROTEIN_DATABASE_DEFAULT, + choices=list(config.PROTEIN_DATABASE_OPTIONS.keys()), + help="Select taxon and protein database version" + ) + + parser.add_argument( + '-opt', + '--options', + default="ILLUMINA", + choices=['ILLUMINA','ILLUMINA_DUST_OFF', 'ILLUMINA_SHORT', 'OXFORD_NANOPORE']) + + parser.add_argument( + '-D', + '--domain_search', + default="BLASTX_W3", + choices=['BLASTX_W2', 'BLASTX_W3', 'DIAMOND'], + help= + ('Detection of protein domains can be performed by either blastx or\n' + ' diamond" program. 
options are:\n' + ' BLASTX_W2 - blastx with word size 2 (slowest, the most sesitive)\n' + ' BLASTX_W3 - blastx with word size 3 (default)\n' + ' DIAMOND - diamond program (significantly faster, less sensitive)\n' + 'To use this option diamond program must be installed in your PATH')) + + args = parser.parse_args() + + # covert option string to namedtuple of options + args.options = getattr(config, args.options) + # set protein database + args.options = args.options._replace( + annotation_search_params= + args.options.annotation_search_params._replace(blastx=getattr( + config, args.domain_search))) + return args + + +def main(): + ''' + Perform graph based clustering + ''' + # argument parsing: + args = get_cmdline_args() + config.ARGS = args + logfile = args.logfile.name if args.logfile else None + logging.basicConfig( + filename=logfile, + format='\n%(asctime)s - %(name)s - %(levelname)s -\n%(message)s\n', + level=logging.INFO) + config.PROTEIN_DATABASE, config.CLASSIFICATION_HIERARCHY = config.PROTEIN_DATABASE_OPTIONS[ + args.taxon] + # number of CPU to use + pipeline_version_info = get_version(config.MAIN_DIR, tarean_mode = args.tarean_mode) + config.PROC = args.cpu if args.cpu != 0 else multiprocessing.cpu_count() + # TODO add kmer range specification to config - based on the technology + r2py.create_connection() + try: + reporting = r2py.R(config.RSOURCE_reporting, verbose=True) + create_annotation = r2py.R(config.RSOURCE_create_annotation, + verbose=True) + LOGGER.info(args) + paths = DataFiles(working_dir=args.output_dir, + subdirs=config.DIRECTORY_TREE, + files=config.FILES) + # files to be included in output + for src, dest in config.INCLUDE: + shutil.copy(src, os.path.join(paths.working_dir, dest)) + # geting information about data + run_info = DataInfo(args, paths) + LOGGER.info(run_info) + LOGGER.info(show_object(config)) + # load all sequences or sample + sequences = seqtools.SequenceSet( + source=run_info.input_sequences, + sample_size=run_info.number_of_reads_for_clustering, + paired=run_info.paired, + filename=paths.sequences_db, + fasta=paths.sequences_fasta, + prefix_length=run_info.prefix_length, + rename=not run_info.args.keep_names) + if run_info.sequence_fiter: + n = sequences.remove_sequences_using_filter( + run_info.sequence_fiter, + keep_proportion=config.FILTER_PROPORTION_OF_KEPT, + omitted_sequences_file=paths.filter_omitted, + kept_sequences_file=paths.filter_kept + ) + run_info.number_of_omitted_reads = n + # add custom databases if provided + if args.database: + config.CUSTOM_DNA_DATABASE = add_databases( + args.database, + custom_databases_dir=paths.custom_databases) + sequences.makeblastdb(legacy=args.options.legacy_database, lastdb=args.options.lastdb) + LOGGER.info("chunksize: {}".format(config.CHUNK_SIZE)) + sequences.make_chunks(chunk_size=config.CHUNK_SIZE) + sequences.create_hitsort(output=paths.hitsort, options=args.options) + hitsort = graphtools.Graph(filename=paths.hitsort_db, + source=paths.hitsort, + paired=run_info.paired, + seqids=sequences.keys()) + + LOGGER.info('hitsort with {} reads and {} edges loaded.'.format( + hitsort.vcount, hitsort.ecount)) + + hitsort.save_indexed_graph() + LOGGER.info('hitsort index created.') + + hitsort.louvain_clustering(merge_threshold=args.merge_threshold, + cleanup=args.cleanup) + hitsort.export_cls(path=paths.cls_file) + hitsort.adjust_cluster_size(config.FILTER_PROPORTION_OF_KEPT, + sequences.ids_kept) + sequences.annotate(config.DNA_DATABASE, + annotation_name="dna_database", + directory=paths.blastn, + 
params=args.options.annotation_search_params.blastn) + + if config.CUSTOM_DNA_DATABASE: + LOGGER.info('annotating with custom database') + for db, db_name in config.CUSTOM_DNA_DATABASE: + sequences.annotate( + db, + annotation_name=db_name, + directory=paths.blastn, + params=args.options.annotation_search_params.blastn) + + if not args.tarean_mode: + # additional analyses - full RE run + # this must be finished befor creating clusters_info + sequences.annotate( + config.PROTEIN_DATABASE, + annotation_name="protein_database", + directory=paths.blastx, + params=args.options.annotation_search_params.blastx) + + ## annotating using customa databasesreplace + LOGGER.info('creating cluster graphs') + clusters_info = hitsort.export_clusters_files_multiple( + min_size=run_info.mincln, + directory=paths.clusters, + sequences=sequences, + tRNA_database_path=config.TRNA_DATABASE, + satellite_model_path=config.SATELLITE_MODEL) + if not args.tarean_mode: + LOGGER.info("assembling..") + assembly_tools.assembly(sequences, + hitsort, + clusters_info, + assembly_dir=paths.assembly, + contigs_file=paths.contigs, + min_size_of_cluster_for_assembly=args.assembly_min) + + LOGGER.info("detecting LTR in assembly..") + for i in clusters_info: + i.detect_ltr(config.TRNA_DATABASE) + + run_info.max_annotated_clusters = max([i.index for i in clusters_info]) + run_info.max_annotated_superclusters = max([i.supercluster + for i in clusters_info]) + # make reports + cluster_listing = [i.listing() for i in clusters_info] + # make path relative to paths.cluster_info + utils.save_as_table(cluster_listing, paths.clusters_info) + # creates table cluster_info in hitsort database + graphtools.Cluster.add_cluster_table_to_database(cluster_listing, + paths.hitsort_db) + # export files for consensus sequences, one for each ranks + consensus_files = [] + for i in config.TANDEM_RANKS: + consensus_files.append(utils.export_tandem_consensus( + clusters_info, + path=paths.TR_consensus_fasta.format(i), + rank=i)) + + if not args.tarean_mode: + LOGGER.info("Creating report for superclusters") + create_annotation.create_all_superclusters_report( + max_supercluster=run_info.max_annotated_superclusters, + paths=paths.as_list(), + libdir=paths.libdir, + superclusters_dir=paths.superclusters, + seqdb=paths.sequences_db, + hitsortdb=paths.hitsort_db, + classification_hierarchy_file=config.CLASSIFICATION_HIERARCHY, + HTML_LINKS=dict2lists(config.HTML_LINKS)) + + LOGGER.info("Creating report for individual clusters") + for cluster in clusters_info: + create_annotation.create_cluster_report( + cluster.index, + seqdb=paths.sequences_db, + hitsortdb=paths.hitsort_db, + classification_hierarchy_file= + config.CLASSIFICATION_HIERARCHY, + HTML_LINKS=dict2lists(config.HTML_LINKS)) + + LOGGER.info("Creating main html report") + reporting.create_main_reports( + paths=paths.as_list(), + N_clustering=run_info.number_of_reads_for_clustering, + N_input=run_info.number_of_input_sequences, + N_omit=run_info.number_of_omitted_reads, + merge_threshold=args.merge_threshold, + paired=run_info.paired, + consensus_files=consensus_files, + custom_db=bool(config.CUSTOM_DNA_DATABASE), + tarean_mode=args.tarean_mode, + HTML_LINKS=dict2lists(config.HTML_LINKS), + pipeline_version_info=pipeline_version_info, + max_memory=run_info.max_memory, + max_number_reads_for_clustering=run_info.max_number_reads_for_clustering, + mincln=run_info.mincln + ) + + LOGGER.info("Html report reports created") + + except: + r2py.shutdown(config.RSERVE_PORT) + raise + finally: + if 
args.cleanup: + paths.cleanup(config.FILES_TO_DISCARD_AT_CLEANUP) + else: + LOGGER.info("copy databases to working directory") + shutil.copy(paths.sequences_db, paths.working_dir) + shutil.copy(paths.hitsort_db, paths.working_dir) + # copy log file inside working directory + if logfile: + shutil.copyfile(logfile, paths.logfile) + + +if __name__ == "__main__": + main() + # some error handling here:
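
For reference, a minimal way to invoke the seqclust script added in this changeset, put together from the argument parser it defines (`--paired`, `--cpu`, `--output_dir`, `--database`, and the positional reads file). The read file and custom database names below are illustrative placeholders, not files shipped with the repository; the required header format for a custom database is the one checked by `valid_database` and reported by `add_databases`.

```
# interlaced paired-end reads, 8 CPUs, default output directory
seqclust --paired --cpu 8 --output_dir clustering_results reads.fasta

# optional custom repeat database: FASTA path followed by a database name;
# sequence IDs in the custom FASTA must follow '>repeatname#class/subclass'
seqclust --paired --database custom_repeats.fasta my_db reads.fasta
```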