Mercurial > repos > petr-novak > repeatrxplorer
diff lib/seqtools.py @ 0:1d1b9e1b2e2f draft
Uploaded
author | petr-novak |
---|---|
date | Thu, 19 Dec 2019 10:24:45 -0500 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python3
import logging
logger = logging.getLogger(__name__)
import itertools
import os
import sys
import random
import sqlite3
import subprocess
import shlex  # for command line arguments split
from collections import namedtuple
from lib.utils import format_query
from lib.parallel.parallel import parallel2 as parallel
from lib.parallel.parallel import get_max_proc

REQUIRED_VERSION = (3, 4)
MAX_PRINT = 10

if sys.version_info < REQUIRED_VERSION:
    raise Exception("\n\npython 3.4 or higher is required!\n")


def _hitsort_worker(query, blastdb, output, options):
    '''
    Run one all-to-all similarity search of `query` against `blastdb`
    and write filtered hits to `output`.

    query: path to a fasta chunk
    blastdb: path/prefix of the database to search against
    output: path of the tab separated result file
    options: object providing all2all_search_params (command template),
             filtering_threshold (min_lcov, min_pid, min_ovl, min_scov,
             evalue_max) and filter_self_hits

    The search command writes tabular hits to stdout; expected columns are
    qid, qlen, qstart, qend, sid, slen, sstart, send, pid, score, evalue,
    strand. Only the first HSP of each query/subject pair is kept.
    '''
    cmd = options.all2all_search_params.format(query=query, blastdb=blastdb)
    # was print(cmd): debug output from a worker belongs in the logger,
    # not on stdout
    logger.info(cmd)
    min_lcov, min_pid, min_ovl, min_scov, evalue_max = options.filtering_threshold
    pair2insert = ''
    signs = {'plus': '+', 'minus': '-'}
    with open(output, 'w') as f:
        with subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE) as p:
            for i in p.stdout:
                items = i.decode('utf-8').split()
                if options.filter_self_hits:
                    if items[4] == items[0]:
                        continue
                evalue = float(items[10])
                # overlap lengths on query and hit (coordinates are inclusive)
                ovl_q = abs(float(items[2]) - float(items[3])) + 1
                ovl_h = abs(float(items[6]) - float(items[7])) + 1
                if (ovl_q >= min_ovl or ovl_h >= min_ovl) and float(items[8]) >= min_pid:
                    # lcov is coverage of the longer sequence, scov of the shorter
                    if float(items[1]) >= float(items[5]):
                        lcov = ovl_q * 100.0 / float(items[1])
                        scov = ovl_h * 100.0 / float(items[5])
                    else:
                        lcov = ovl_q * 100.0 / float(items[5])
                        scov = ovl_h * 100.0 / float(items[1])
                    # positive line:
                    if lcov >= min_lcov and scov >= min_scov and evalue_max > evalue:
                        pr = [items[0], items[4]]
                        # keep only the first (best) HSP of each pair
                        if pair2insert != "{0}\t{1}".format(pr[0], pr[1]):
                            pair2insert = "{0}\t{1}".format(pr[0], pr[1])
                            if items[11] in ['plus', 'minus']:
                                items[11] = signs[items[11]]
                            f.write("{0}\t{1}\t{2}\n".format(
                                pair2insert, items[9],
                                "\t".join(items[k] for k in [1, 2, 3, 5, 6, 7, 8, 10, 11])))
f.write("{0}\t{1}\t{2}\n".format(pair2insert, items[9], "\t".join( + items[k] for k in [1, 2, 3, 5, 6, 7, 8, 10, 11]))) + +def blast_with_filter(fasta_file_filter, blastdb): + ''' + Return list of sequences query id which has similarity to filter + It uses mgblast for search + ''' + params = '-p 85 -W18 -UT -X40 -KT -JF -F "m D" -v100000000 -b100000000 -D4 -C 30 -H 30' + positive_hits = set() + min_pid = 90 + min_ovl_perc = 90 + cmd = " ".join(['mgblast', + '-i', fasta_file_filter, + '-d', blastdb, + params + ]) + with subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE) as p: + for i in p.stdout: + items = i.decode('utf-8').split() + ovl_perc = (abs(float(items[6]) - float(items[7])) + 1) / float(items[5]) * 100 + pid = float(items[8]) + if pid > min_pid and ovl_perc > min_ovl_perc: + positive_hits.add(items[4]) + return(positive_hits) + + + +# TODO test task +# predefine blast params +def blast_worker(query, blastdb, output, params): + if params['program'] in ['blastx', 'blastn']: + default_columns = ' -outfmt "6 {}"'.format(params['output_columns']) + cmd = "{} -query {} -db {} {} {}".format( + params['program'], query, blastdb, params['args'], default_columns) + elif params['program']=='diamond blastx': + # diomand have slightly different format than blastx + default_columns = ' --outfmt 6 {}'.format(params['output_columns']) + cmd = "{} -q {} -d {} {} {}".format( + params['program'], query, blastdb, params['args'], default_columns) + print(cmd) + preceeding_pair = ['', ''] + BlastValues = namedtuple("BlastValues", params['output_columns']) + with open(output, 'wb') as f: + with subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE) as p: + for i in p.stdout: + items = i.decode('utf-8').split() + blast_values = BlastValues(*[f(i) for i, f in zip(items, params['column_types'])]) + #qseqid, sseqid, qlen, slen, length, ppos, bitscore = [ + # f(i) for i, f in zip(items, params['column_types'])] + if params['filter_function'](blast_values): + if 
class Sequence:
    '''A named sequence; the sequence string is stored as ascii bytes.'''

    def __init__(self, seq, name, paired=False):
        # the mode of seq storage can be changed later to make it more
        # memory efficient
        self._seq = bytes(str(seq), "ascii")
        self.name = str(name)

    @property
    def seq(self):
        '''Sequence as a str (decoded from the internal bytes storage).'''
        return self._seq.decode("utf-8")

    @seq.setter
    def seq(self, value):
        self._seq = bytes(str(value), "ascii")

    def __str__(self):
        return "{0} : {1}".format(self.name, self.seq)

    @staticmethod
    def read_fasta(fasta_file_name):
        '''
        Generator - reads sequences from a fasta file and yields them
        one by one as Sequence objects. Multi-line sequences are joined;
        empty lines are skipped.
        '''
        with open(fasta_file_name, 'r') as f:
            header = None
            seqstr = None
            for rawline in f:
                line = rawline.strip()
                if line == "":
                    continue
                if line[0] == ">":
                    if header and seqstr:
                        yield Sequence(seqstr, header)
                        # reset for the next record
                        seqstr = None
                        header = line[1:]
                    elif seqstr:
                        # was: bare Warning(...) which builds an exception
                        # object but never raises or reports it - a no-op
                        logger.warning("sequence was not preceeded by header")
                    else:
                        header = line[1:]
                else:
                    seqstr = line if not seqstr else seqstr + line
        # emit the last record
        if header and seqstr:
            yield Sequence(seqstr, header)
        return

    def write2fasta(self, file_object):
        '''Append this sequence in fasta format to an open file object.'''
        file_object.write(">{0}\n{1}\n".format(self.name, self.seq))
class SequenceSet:
    '''Collection of named sequences backed by an sqlite database.'''

    def __init__(self, filename=None, source=None, sample_size=0, new=False,
                 paired=False, prefix_length=0, rename=False, seed=123,
                 fasta=None):
        '''
        filename: path of the sqlite database; if None it is kept in memory
        source: path to a fasta file; if None an empty set is created
        sample_size: load only a random sample of this many sequences
                     (0 = load all)
        new: delete an existing database file of the same name
        paired: reads are paired (pairs stored as consecutive records)
        prefix_length: length of the name prefix used for prefix statistics
        rename: replace sequence names with sequential integers
        seed: seed for reproducible sampling
        fasta: if given, loaded sequences are also written to this fasta file

        If source is given a new database is created - an existing file
        `filename` will be deleted!
        '''
        random.seed(seed)

        self.source = source
        self.sample_size = sample_size
        self.paired = paired
        self.filename = filename
        self.prefix_length = prefix_length
        self.prefix_codes = {}
        self.rename = rename
        self.fasta = fasta
        self._length = None
        self._original_length = None
        self.blastdb = None
        self.blastdb_legacy = None
        self.chunks = None
        self.ids_kept = None
        self.annotations = []
        self.hitsort = None
        # was only set inside the `if source:` branch below, so an empty
        # SequenceSet broke on the first __setitem__ call
        self._commit_immediately = True
        if filename:
            logger.debug("Creating database in file")
            if os.path.isfile(filename) and (new or source):
                # remove old database if exists
                os.remove(filename)
            # sqlite database in file
            self.conn = sqlite3.connect(filename)
        else:
            # sqlite database in memory
            logger.debug("Creating database in memory")
            self.conn = sqlite3.connect(":memory:")
        c = self.conn.cursor()
        c.execute("create table sequences (name text, sequence text)")

        if source:
            self._read_from_fasta()
            c.execute("CREATE TABLE prefix_codes (prefix TEXT, N INTEGER)")
            self.conn.executemany(
                "INSERT INTO prefix_codes (prefix, N) values (?,?)",
                self.prefix_codes.items())
            self.conn.commit()
            self._commit_immediately = True
            self.makeblastdb(legacy=True)
            # store some attributes in the database
            self._update_database()
            logger.info("sequences loaded")
            logger.info(self)

    def _update_database(self):
        '''Persist scalar attributes (counts, pairing) into table seqinfo.'''
        c = self.conn.cursor()
        stored_attributes = [
            ("sample_size", "integer"),
            ("_length", "integer"),
            ("_original_length", "integer"),
            ("paired", "integer"),
        ]
        columns = ", ".join(i[0] + " " + i[1] for i in stored_attributes)
        try:
            c.execute(
                "CREATE TABLE seqinfo ( {} )".format(columns)
            )
        except sqlite3.OperationalError:
            pass  # table already exists
        # get current values and replace the single info row
        values = tuple(getattr(self, i[0]) for i in stored_attributes)
        placeholder = ", ".join(["?"] * len(values))
        c.execute("DELETE FROM seqinfo")
        c.execute("INSERT INTO seqinfo VALUES ({})".format(placeholder), values)
stored_attributes) + try: + c.execute(( + "CREATE TABLE seqinfo ( {} )".format(columns) + )) + except sqlite3.OperationalError: + pass # table already exists + # get current values + values = tuple(getattr(self, i[0]) for i in stored_attributes) + placeholder = ", ".join(["?"] * len(values)) + c.execute("DELETE FROM seqinfo") + c.execute("INSERT INTO seqinfo VALUES ({})".format(placeholder), values) + + + def _read_from_fasta(self): + # changed will be commited after whole file is loaded - this is faster + self._commit_immediately = False + if self.fasta: + f = open(self.fasta, mode='w') + # define sampling + if self.sample_size: + logger.debug( + 'sampling sequences: {} sample size'.format(self.sample_size)) + N = self.fasta_length(self.source) + if self.paired: + s = itertools.chain( + [[i, i + 1] for i in sorted(random.sample(range(1, N, 2), int(self.sample_size / 2)))]) + + sample = itertools.chain( + [item for sublist in s for item in sublist]) + + else: + sample = itertools.chain( + sorted(random.sample(range(1, N + 1), self.sample_size))) + logger.debug("sampling unpaired reads") + else: + # no sampling - use all + sample = itertools.count(1) + + # define counter: + if self.rename: + if self.paired: + counter = itertools.count(1, 0.5) + else: + counter = itertools.count(1, 1) + else: + counter = itertools.cycle([""]) + # define pairs: + if self.paired: + pair = itertools.cycle(['f', 'r']) + else: + pair = itertools.cycle(['']) + position = 0 + + seq2write = next(sample) + + for i in Sequence.read_fasta(self.source): + position += 1 + if position == seq2write: + + # create name: + prefix = i.name[0:self.prefix_length] # could be empty "" + if self.rename: + i.name = "{0}{1}{2}".format( + prefix, + int(next(counter)), + next(pair) + ) + if self.prefix_length: + if prefix in self.prefix_codes: + self.prefix_codes[prefix] += 1 + else: + self.prefix_codes[prefix] = 1 + self[i.name] = i.seq + if self.fasta: + i.write2fasta(f) + try: + seq2write = next(sample) + 
except StopIteration: + # no more sequences to sample + break + + + self.conn.commit() # save changes + if self.fasta: + f.close() + self._commit_immediately = True + + def __getitem__(self, item): + # item cannot be empty!!! + c = self.conn.cursor() + c.execute("select * from sequences where name= '{}'".format(item)) + # try: + name, sequence = c.fetchone() # should be only one + # except TypeError: + # return None + return sequence + + def __setitem__(self, item, value): + c = self.conn.cursor() + c.execute( + "insert into sequences values ( '{0}', '{1}')".format(item, value)) + if self._commit_immediately: + self.conn.commit() # save changes + +# def __iter__(self): +# self.c.execute("select name from sequences") +# for i in self.c: +# yield i[0] + + def __iter__(self): + c = self.conn.cursor() + c.execute("select name from sequences") + for i in c: + yield i[0] + + +# def __iter__(self): +# for i in range(1,len(self)): +# yield i + def items(self): + c = self.conn.cursor() + c.execute("select name, sequence from sequences") + for name, seq in c: + yield Sequence(seq, name) + + def toDict(self): + c = self.conn.cursor() + c.execute("select name, sequence from sequences") + d = {} + for name, seq in c: + d[name]=seq + return(d) + + def __len__(self): + c = self.conn.cursor() + c.execute("select count(*) from sequences") + length = c.fetchone()[0] + return(length) + + def __str__(self): + out = "" + c = self.conn.cursor() + c.execute("select * from sequences limit {0}".format(MAX_PRINT)) + for n, s in c: + out = out + "{0} : {1}\n".format(n, s) + out = out + "...\n" + out = out + str(len(self)) + " sequence total\n" + return out + + def insert(self, sequence, commit=True): + self._commit_immediately = commit + self[sequence.name] = sequence.seq + self._commit_immediately = True + + def keys(self): + ''' + this will get names of sequences + ''' + c = self.conn.cursor() + c.execute('select name from sequences order by rowid') + names = [] + for i in c.fetchall(): + 
names.append(i[0]) + return names + + def sample(self, size, seed=123): + ''' + generator - reproducible sampling sequences + ''' + max_index = len(self) + # sample = seqtools.SequenceSet(source=info.input_sequences, ) + if self.paired: + size = int(size / 2) + step = 2 + else: + step = 1 + random.seed(seed) + c = self.conn.cursor() + for i in sorted(random.sample(range(1, max_index, step), size)): + c.execute( + "select name, sequence from sequences where rowid={}".format(i)) + name, sequence = c.fetchone() + yield Sequence(sequence, name) + if self.paired: + c.execute( + "select name, sequence from sequences where rowid={}".format(i + 1)) + name, sequence = c.fetchone() + yield Sequence(sequence, name) + + def sample2db(self, db_file=None, fasta_file_name=None, size=100, seed=123): + if (not db_file) and (not fasta_file_name): + raise IOError("no db_file nor fasta_file_name were defined") + # open files + if db_file: + db = SequenceSet(filename=db_file, source=None, new=True) + if fasta_file_name: + f = open(fasta_file_name, 'w') + # write in files + for i in self.sample(size, seed): + if db_file: + db.insert(i, commit=False) + if fasta_file_name: + i.write2fasta(f) + # close files + if fasta_file_name: + f.close() + if db_file: + db.conn.commit() + return db + + def save2fasta(self, fasta_file_name, keep=True, subset=None): + if subset: + with open(fasta_file_name, 'w') as f: + c = self.conn.cursor() + c.execute("select name, sequence from sequences where name in ('{}')".format( + "','".join(subset))) + for name, sequence in c: + s = Sequence(sequence, name) + s.write2fasta(f) + else: + with open(fasta_file_name, 'w') as f: + for i in self.items(): + i.write2fasta(f) + if keep: + self.fasta = fasta_file_name + + def save_annotation(self, annotation_name, subset, dir): + annotation_file = "{}/{}_annotation.csv".format(dir, annotation_name) + with open(annotation_file, "w") as f: + c = self.conn.cursor() + c.execute( + "select * from {} where name in 
('{}')".format( + annotation_name, "','".join(subset) + ) + ) + header = [description[0] for description in c.description] + f.write("\t".join(str(j) for j in header) + "\n") + for i in c: + f.write("\t".join(str(j) for j in i) + "\n") + return annotation_file + + def make_chunks(self, file_name=None, chunk_size=5000): + ''' + split SequneceSet to chunks and save to files, + return list of files names + ''' + logger.debug("creating chunks from fasta file: ") + if not file_name and self.fasta: + file_name = self.fasta + else: + raise Exception('file name for chunks is not defined!') + + seqs = iter(self.items()) + fn_chunk_names = [] + for i in itertools.count(): + try: + fn = "{0}.{1}".format(file_name, i) + logger.info("saving chunk {}".format(fn)) + for n in range(chunk_size): + s = next(seqs) + if not n: + f = open(fn, 'w') + s.write2fasta(f) + f.close() + fn_chunk_names.append(fn) + except StopIteration: + if not f.closed: + f.close() + fn_chunk_names.append(fn) + break + self.chunks = fn_chunk_names + logger.debug(self.chunks) + return(fn_chunk_names) + + def makeblastdb(self, blastdb=None, dbtype='nucl', legacy=False, lastdb = False): + ''' + dbtype eithe 'nucl' of 'prot' + ''' + logger.info("creating blast database") + # check if fasta file on disk is present + if not self.fasta: + try: + self.save2fasta(blastdb) + except TypeError: + print("\nEither blastdb or fasta must be ", + "defined when making blast database!!!\n") + raise + else: + blastdb = blastdb if blastdb else self.fasta + + cmd = "makeblastdb -in {0} -out {1} -dbtype {2}".format(self.fasta, + blastdb, + dbtype) + args = shlex.split(cmd) + if 0 == subprocess.check_call(args, stderr=sys.stdout): + self.blastdb = blastdb + else: + Warning("makeblastdb failed") + + if legacy: + logger.info("creating legacy blast database") + prot = {'nucl': 'F', 'prot': 'T'}[dbtype] + cmd = "formatdb -i {0} -p {1} -n {2}.legacy".format(self.fasta, + prot, + blastdb + ) + args = shlex.split(cmd) + if 
subprocess.check_call(args) == 0: + self.blastdb_legacy = "{}.legacy".format(blastdb) + else: + Warning("formatdb failed") + if lastdb: + logger.info("creating lastdb database") + cmd = "lastdb -u NEAR {fasta} {fasta}".format(fasta=self.fasta) + args = shlex.split(cmd) + if subprocess.check_call(args) == 0: + self.lastdb = self.fasta + else: + Warning("formatdb failed") + + + def create_hitsort(self, options, output=None): + ''' + query is name of files to search against blastdb of SequenceSet object + query could be multiple files + this is inteded to be used for tabular output only + output file must be specified, + if more than one query is used, multiple files are created + worker - function to execute blast search + ''' + + logger.info('running all to all blast') + query = self.chunks + if not output: + output = "{}.blast".format(self.fasta) + database = getattr(self, options.database) + if len(query) > 1: + output_parts = [output + "." + str(i) for i in range(len(query))] + else: + output_parts = [output] + query = [query] + + parallel(_hitsort_worker, query, [database], + output_parts, [options]) + logger.info('all to all blast finished') + logger.info('removing duplicates from all to all blast results') + self._clean_hitsort( + hitsort_files=output_parts, filtered_hitsort=output) + # remove temp files: + for f in output_parts: + os.unlink(f) + self.hitsort = output + return output + + def _clean_hitsort(self, hitsort_files, filtered_hitsort): + ''' alterantive hitsort duplicate removal ''' + + ids = self.keys() + Nfiles = 5 + int(sum([os.path.getsize(i) + for i in hitsort_files]) / 5e9) + hitsort_parts = [ + "{0}.{1}.parts".format(filtered_hitsort, i) for i in range(Nfiles)] + # open files + fs = [] + for i in hitsort_parts: + fs.append(open(i, 'w')) + + ids_destination = {} + fs_iter = itertools.cycle(fs) + for i in ids: + #ids_destination[i] = fs_iter.next() + ids_destination[i] = next(fs_iter) + lines_out = lines_total = 0 + temp_open = False + + for i 
in hitsort_files: + f = open(i, 'r') + while True: + line = f.readline() + if not line: + break + lines_total += 1 + line_items = line.split() +# ids_destination[line_items[0]].write(line) + # do not assume that it is sorted! + ids_destination[min(line_items[0:2])].write(line) + # close all files + for i in fs: + i.close() + + # load one by one and exclude duplicates: + hit_out = open(filtered_hitsort, 'w') + for i in hitsort_parts: + f = open(i, 'r') + hitsort_shelve = {} + while True: + line = f.readline() + if not line: + break + line_items = line.split() + key = "\t".join(sorted(line_items[0:2])) + value = line_items + bitscore = float(line_items[2]) + if key in hitsort_shelve: + if float(hitsort_shelve[key][2]) > bitscore: + hitsort_shelve[key] = value + hit_out.write("\t".join(hitsort_shelve[key]) + "\n") + del hitsort_shelve[key] + lines_out += 1 + else: + hitsort_shelve[key] = value + f.close() + for key in hitsort_shelve: + hit_out.write("\t".join(hitsort_shelve[key]) + "\n") + lines_out += 1 + # clean up + for i in hitsort_parts: + os.unlink(i) + hit_out.close() + return lines_total, lines_out + + def _check_database(self, database): + ''' + database is path to fastafile, database with same prefix should exist + it creates database if does not exist as temporal file! + ''' + pass + # TODO + + def remove_sequences_using_filter(self, fasta_file_filter, keep_proportion, + omitted_sequences_file, + kept_sequences_file): + ''' + Run blast with fasta_file_filter against legaci blastdb of SequenceSet to detect sequences + which are to be removed from SequencesSet. Complete pairsare removed if paired sequences + are used. 
+ ''' + ids = blast_with_filter(fasta_file_filter, self.blastdb_legacy) + # check against database and remove sequences + + c = self.conn.cursor() + if self.paired: + c.execute("SELECT rowid FROM sequences WHERE name IN {}".format( + format_query(ids) + )) + odd_rowids = set() + for i in c.fetchall(): + if i[0] % 2 == 0: + odd_rowids.add(i[0] - 1) + else: + odd_rowids.add(i[0]) + odd_rowids_keep = random.sample(odd_rowids, round(keep_proportion * len(odd_rowids))) + c.execute("SELECT name FROM sequences WHERE rowid IN {} ORDER BY rowid".format( + format_query(odd_rowids_keep + [i+1 for i in odd_rowids_keep]) + )) + self.ids_kept = [i[0] for i in c.fetchall()] + odd_rowids_omit = list(odd_rowids.difference(odd_rowids_keep)) + c.execute("SELECT name FROM sequences WHERE rowid IN {} ORDER BY rowid".format( + format_query(odd_rowids_omit + [i+1 for i in odd_rowids_omit]) + )) + ids_omit = [i[0] for i in c.fetchall()] + + else: + self.ids_kept = random.sample(ids, round(keep_proportion * len(ids))) + ids_omit = list(ids.difference(self.ids_kept)) + self.save2fasta(omitted_sequences_file, keep=False, subset=ids_omit) + self.save2fasta(kept_sequences_file, keep=False, subset=self.ids_kept) + self.drop_sequences(ids_omit) + # save info about omited sequences + c.execute("CREATE TABLE ids_kept (name TEXT)") + c.executemany("INSERT INTO ids_kept (name) values (?)", [(i,) for i in self.ids_kept]) + return len(ids_omit) + + def drop_sequences(self, ids_to_drop): + ''' + remove sequences from sequence set based on the ID + ''' + self._original_length = len(self) + c = self.conn.cursor() + idslist = format_query(ids_to_drop) + c.execute("CREATE TABLE omited_sequences AS SELECT * FROM sequences WHERE name IN {}".format(idslist)) + c.execute("DELETE FROM sequences WHERE name IN {}".format( + idslist + )) + # new databases must be created - with ommited sequences! + self.save2fasta(fasta_file_name=self.fasta) ## this will replace origninal file! 
+ self._length = len(self) + self.makeblastdb(legacy=True) + self._update_database() + + def annotate(self, database, annotation_name, directory="", params=""): + ''' + database : path to blast formated database + method: type of search to use for annotation. it must be + 'blastn' of 'blastx' + Annotation is save in directory also into the table with name stored in + annotation_name variable + ''' + logger.info("annotating reads with {} ".format(annotation_name)) + self._check_database(database) + if "parallelize" in params: + parallelize = params['parallelize'] + else: + parallelize = True + if parallelize: + if not self.chunks: + self.make_chunks() + output = [ + "{}/{}_{}".format(directory, annotation_name,os.path.basename(i)) for i in self.chunks] + parallel(blast_worker, self.chunks, [database], output, [params]) + else: + # no explicit parallelization using parallel + single_output = "{}/{}_hits".format(directory, annotation_name) + params['args'] = params['args'].format(max_proc=get_max_proc()) + blast_worker(self.fasta, database, single_output,params) + output = [single_output] + + # put into as new attribute and sqlite table + c = self.conn.cursor() + c.execute("create table {} (name text, db_id text, length real, bitscore real , pid real)".format( + annotation_name)) + for blast_results in output: + with open(blast_results, 'r') as f: + for i in f: + items = i.split() + types = [str, str, float, float, float, float, float] + qseqid, sseqid, qlen, slen, length, pident, bitscore = [ + f(i) for i, f in zip(items, types)] + c.execute('insert into {} values ("{}", "{}", "{}", "{}", "{}")'.format( + annotation_name, qseqid, sseqid, length, bitscore, pident)) + self.conn.commit() + self.annotations.append(annotation_name) + + @staticmethod + def fasta_length(fasta_file_name): + ''' + count number of sequences, + ''' + with open(fasta_file_name, 'r') as f: + N = 0 + for i in f: + if i.strip()[0] == ">": + N += 1 + return N + + +# test +if __name__ == "__main__": 
if __name__ == "__main__":
    # manual smoke test of the module - requires test data and the external
    # blast binaries, therefore it is not part of any automated test suite

    # get root directory of the project
    MAIN_DIR = os.path.realpath(
        os.path.dirname(os.path.realpath(__file__)) + "/..")
    print("MAIN_DIR:")
    print(MAIN_DIR)
    TMP_DIR = "{}/tmp".format(MAIN_DIR)
    TEST_DATA_DIR = "{}/test_data".format(MAIN_DIR)

    # some data:
    fn = "{}/pair_reads.fasta".format(TEST_DATA_DIR)
    os.chdir(TMP_DIR)
    print("number of sequences in fasta file:")
    print(SequenceSet.fasta_length(fn))

    print("loading sequences...")
    s = SequenceSet(source=fn, paired=True, rename=False,
                    filename='sqlite.db', prefix_length=4, fasta='sample2.fasta')
    print(s)

    print("creating blast database")
    s.makeblastdb()

    print("creating chunks from fasta file: ")
    file_list = s.make_chunks("fasta_chunks", chunk_size=500)
    print(file_list)

    print('creating hitsort')
    # NOTE(review): create_hitsort expects (options, output=None) where
    # options carries all2all_search_params / filtering_threshold / database;
    # passing the chunk file list here cannot work - confirm intended API
    s.create_hitsort(file_list, "hitsort")

    print("saving to fasta file")
    s.save2fasta('sampleX.fasta', keep=False)