Mercurial > repos > artbio > cherry_pick_fasta
diff cherry_pick_fasta.py @ 3:c282a8a47dd9 draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cherry_pick_fasta commit d637de6c1090314bd34bdffc2fdf979cb55b870b"
author | artbio |
---|---|
date | Fri, 21 May 2021 09:34:14 +0000 |
parents | 321cad0eb507 |
children | ba6c4aeb22ea |
line wrap: on
line diff
--- a/cherry_pick_fasta.py Tue Mar 16 23:25:57 2021 +0000
+++ b/cherry_pick_fasta.py Fri May 21 09:34:14 2021 +0000
@@ -1,51 +1,63 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-"""
-Chery pick of fasta sequences satisfying a query string in their header/name
-"""
+# Chery pick of fasta sequences satisfying a query string in their header/name
+import argparse
 
-import argparse
+from Bio import SeqIO
 
 
 def Parser():
     the_parser = argparse.ArgumentParser(
-        description="Cherry pick fasta sequences")
-    the_parser.add_argument('--input', action="store", type=str,
-                            help="input fasta file")
-    the_parser.add_argument('--searchfor', action="store", type=str,
-                            help="with, without, or withlist, withoutlist")
-    the_parser.add_argument('--query-string', dest="query_string",
-                            action="store", type=str,
-                            help="headers containing the string will be \
+        description='Cherry pick fasta sequences')
+    the_parser.add_argument('--input', action='store', type=str,
+                            help='input fasta file')
+    the_parser.add_argument('--searchfor', action='store', type=str,
+                            help='with, without, or withlist, withoutlist')
+    the_parser.add_argument('--mode', action='store', type=str,
+                            default='includes', help='exact or includes')
+    the_parser.add_argument('--query-string', dest='query_string',
+                            action='store', type=str,
+                            help='headers containing the string will be \
                                   extracted or excluded as well as the \
-                                  corresponding sequence")
-    the_parser.add_argument('--query-file', dest="query_file",
-                            action="store", type=str,
-                            help="headers containing any of the strings provided in the \
-                                  text file (1 string per line) will be \
-                                  extracted or excluded as well as the \
-                                  corresponding sequence")
-
-    the_parser.add_argument(
-        '--output', action="store", type=str, help="output fasta file")
+                                  corresponding sequence')
+    the_parser.add_argument('--query-file', dest='query_file',
+                            action='store', type=str,
+                            help='headers containing any of the strings \
+                                  provided in the text file (1 string per \
+                                  line) will be extracted or excluded as well \
+                                  as the corresponding sequence')
+    the_parser.add_argument('--output', action='store', type=str,
+                            help='output fasta file')
     args = the_parser.parse_args()
     return args
 
 
-def parse_fasta_with(query, FastaListe):
+def parse_fasta_dict(query, fasta_dict, mode):
     if not isinstance(query, list):
         query = [query]
     accumulator = []
-    for sequence in FastaListe:
-        for string in query:
-            if string in sequence:
-                accumulator.append(sequence)
-                continue
-    return accumulator
+    if mode == 'includes':
+        for seq_id in fasta_dict:
+            for string in query:
+                if string in seq_id:
+                    accumulator.append(seq_id)
+                    continue
+    elif mode == 'exact':
+        for seq_id in fasta_dict:
+            for string in query:
+                if string == seq_id:
+                    accumulator.append(seq_id)
+                    continue
+    res_dict = {k: fasta_dict[k] for k in fasta_dict if k in accumulator}
+    return res_dict
 
 
-def complement_fasta(fullfasta, subfasta):
-    return sorted(list(set(fullfasta) - set(subfasta)))
+def complement_fasta_dict(fasta_dict, subfasta_dict):
+    fasta_ids = list(fasta_dict.keys())
+    subfasta_ids = list(subfasta_dict.keys())
+    complement_ids = list(set(fasta_ids) - set(subfasta_ids))
+    sub_dict = {k: fasta_dict[k] for k in fasta_dict if k in complement_ids}
+    return sub_dict
 
 
 def getquerylist(file):
@@ -55,37 +67,37 @@
     return querylist
 
 
-def __main__():
-    """ main function """
-    args = Parser()
-    searchterm = args.query_string
-    CrudeFasta = open(args.input, "r").read()
-    Output = open(args.output, "w")
-    FastaListe = CrudeFasta.split(">")[1:]
-    if args.query_string:
-        if args.searchfor == 'with':
-            contList = parse_fasta_with(searchterm, FastaListe)
-            contFasta = ">%s" % ">".join(contList)
-            Output.write(contFasta)
-        elif args.searchfor == 'without':
-            notcontList = complement_fasta(FastaListe,
-                                           parse_fasta_with(searchterm,
-                                                            FastaListe))
-            notcontFasta = ">%s" % ">".join(notcontList)
-            Output.write(notcontFasta)
-    if args.query_file:
-        searchlist = getquerylist(args.query_file)
-        if args.searchfor == 'with':
-            contList = parse_fasta_with(searchlist, FastaListe)
-            contFasta = ">%s" % ">".join(contList)
-            Output.write(contFasta)
-        elif args.searchfor == 'without':
-            notcontList = complement_fasta(FastaListe, parse_fasta_with(
-                searchlist, FastaListe))
-            notcontFasta = ">%s" % ">".join(notcontList)
-            Output.write(notcontFasta)
-    Output.close()
+def buid_fasta_dict(fasta):
+    seq_dict = {rec.id: rec.seq for rec in SeqIO.parse(fasta, "fasta")}
+    return seq_dict
+
+
+def write_fasta_result(fasta_dict, file):
+    line_length = 60
+    with open(file, 'w') as f:
+        for header in sorted(fasta_dict):
+            f.write('>%s\n' % header)
+            for i in range(line_length, len(fasta_dict[header]), line_length):
+                f.write('%s\n' % fasta_dict[header][i-line_length:i])
+            f.write('%s\n' % fasta_dict[header][i:])
 
 
-if __name__ == "__main__":
+def __main__():
+    ''' main function '''
+    args = Parser()
+    fasta_dict = buid_fasta_dict(args.input)
+    if args.query_string:
+        query = args.query_string
+    elif args.query_file:
+        query = getquerylist(args.query_file)
+    if args.searchfor == 'with':
+        fasta_result_dict = parse_fasta_dict(query, fasta_dict, args.mode)
+    elif args.searchfor == 'without':
+        fasta_result_dict = complement_fasta_dict(fasta_dict, parse_fasta_dict(
+                                                  query, fasta_dict,
+                                                  args.mode))
+    write_fasta_result(fasta_result_dict, args.output)
+
+
+if __name__ == '__main__':
     __main__()