Mercurial > repos > artbio > cherry_pick_fasta
comparison cherry_pick_fasta.py @ 3:c282a8a47dd9 draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cherry_pick_fasta commit d637de6c1090314bd34bdffc2fdf979cb55b870b"
author | artbio |
---|---|
date | Fri, 21 May 2021 09:34:14 +0000 |
parents | 321cad0eb507 |
children | ba6c4aeb22ea |
comparison
equal
deleted
inserted
replaced
2:321cad0eb507 | 3:c282a8a47dd9 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # -*- coding: utf-8 -*- | 2 # -*- coding: utf-8 -*- |
3 """ | 3 # Cherry pick of fasta sequences satisfying a query string in their header/name |
4 Cherry pick of fasta sequences satisfying a query string in their header/name | 4 import argparse |
5 """ | |
6 | 5 |
7 import argparse | 6 from Bio import SeqIO |
8 | 7 |
9 | 8 |
10 def Parser(): | 9 def Parser(): |
11 the_parser = argparse.ArgumentParser( | 10 the_parser = argparse.ArgumentParser( |
12 description="Cherry pick fasta sequences") | 11 description='Cherry pick fasta sequences') |
13 the_parser.add_argument('--input', action="store", type=str, | 12 the_parser.add_argument('--input', action='store', type=str, |
14 help="input fasta file") | 13 help='input fasta file') |
15 the_parser.add_argument('--searchfor', action="store", type=str, | 14 the_parser.add_argument('--searchfor', action='store', type=str, |
16 help="with, without, or withlist, withoutlist") | 15 help='with, without, or withlist, withoutlist') |
17 the_parser.add_argument('--query-string', dest="query_string", | 16 the_parser.add_argument('--mode', action='store', type=str, |
18 action="store", type=str, | 17 default='includes', help='exact or includes') |
19 help="headers containing the string will be \ | 18 the_parser.add_argument('--query-string', dest='query_string', |
19 action='store', type=str, | |
20 help='headers containing the string will be \ | |
20 extracted or excluded as well as the \ | 21 extracted or excluded as well as the \ |
21 corresponding sequence") | 22 corresponding sequence') |
22 the_parser.add_argument('--query-file', dest="query_file", | 23 the_parser.add_argument('--query-file', dest='query_file', |
23 action="store", type=str, | 24 action='store', type=str, |
24 help="headers containing any of the strings provided in the \ | 25 help='headers containing any of the strings \ |
25 text file (1 string per line) will be \ | 26 provided in the text file (1 string per \ |
26 extracted or excluded as well as the \ | 27 line) will be extracted or excluded as well \ |
27 corresponding sequence") | 28 as the corresponding sequence') |
28 | 29 the_parser.add_argument('--output', action='store', type=str, |
29 the_parser.add_argument( | 30 help='output fasta file') |
30 '--output', action="store", type=str, help="output fasta file") | |
31 args = the_parser.parse_args() | 31 args = the_parser.parse_args() |
32 return args | 32 return args |
33 | 33 |
34 | 34 |
35 def parse_fasta_with(query, FastaListe): | 35 def parse_fasta_dict(query, fasta_dict, mode): |
36 if not isinstance(query, list): | 36 if not isinstance(query, list): |
37 query = [query] | 37 query = [query] |
38 accumulator = [] | 38 accumulator = [] |
39 for sequence in FastaListe: | 39 if mode == 'includes': |
40 for string in query: | 40 for seq_id in fasta_dict: |
41 if string in sequence: | 41 for string in query: |
42 accumulator.append(sequence) | 42 if string in seq_id: |
43 continue | 43 accumulator.append(seq_id) |
44 return accumulator | 44 continue |
45 elif mode == 'exact': | |
46 for seq_id in fasta_dict: | |
47 for string in query: | |
48 if string == seq_id: | |
49 accumulator.append(seq_id) | |
50 continue | |
51 res_dict = {k: fasta_dict[k] for k in fasta_dict if k in accumulator} | |
52 return res_dict | |
45 | 53 |
46 | 54 |
47 def complement_fasta(fullfasta, subfasta): | 55 def complement_fasta_dict(fasta_dict, subfasta_dict): |
48 return sorted(list(set(fullfasta) - set(subfasta))) | 56 fasta_ids = list(fasta_dict.keys()) |
57 subfasta_ids = list(subfasta_dict.keys()) | |
58 complement_ids = list(set(fasta_ids) - set(subfasta_ids)) | |
59 sub_dict = {k: fasta_dict[k] for k in fasta_dict if k in complement_ids} | |
60 return sub_dict | |
49 | 61 |
50 | 62 |
51 def getquerylist(file): | 63 def getquerylist(file): |
52 querylist = [] | 64 querylist = [] |
53 for line in open(file, 'r'): | 65 for line in open(file, 'r'): |
54 querylist.append(line.rstrip()) | 66 querylist.append(line.rstrip()) |
55 return querylist | 67 return querylist |
56 | 68 |
57 | 69 |
58 def __main__(): | 70 def buid_fasta_dict(fasta): |
59 """ main function """ | 71 seq_dict = {rec.id: rec.seq for rec in SeqIO.parse(fasta, "fasta")} |
60 args = Parser() | 72 return seq_dict |
61 searchterm = args.query_string | |
62 CrudeFasta = open(args.input, "r").read() | |
63 Output = open(args.output, "w") | |
64 FastaListe = CrudeFasta.split(">")[1:] | |
65 if args.query_string: | |
66 if args.searchfor == 'with': | |
67 contList = parse_fasta_with(searchterm, FastaListe) | |
68 contFasta = ">%s" % ">".join(contList) | |
69 Output.write(contFasta) | |
70 elif args.searchfor == 'without': | |
71 notcontList = complement_fasta(FastaListe, | |
72 parse_fasta_with(searchterm, | |
73 FastaListe)) | |
74 notcontFasta = ">%s" % ">".join(notcontList) | |
75 Output.write(notcontFasta) | |
76 if args.query_file: | |
77 searchlist = getquerylist(args.query_file) | |
78 if args.searchfor == 'with': | |
79 contList = parse_fasta_with(searchlist, FastaListe) | |
80 contFasta = ">%s" % ">".join(contList) | |
81 Output.write(contFasta) | |
82 elif args.searchfor == 'without': | |
83 notcontList = complement_fasta(FastaListe, parse_fasta_with( | |
84 searchlist, FastaListe)) | |
85 notcontFasta = ">%s" % ">".join(notcontList) | |
86 Output.write(notcontFasta) | |
87 Output.close() | |
88 | 73 |
89 | 74 |
90 if __name__ == "__main__": | 75 def write_fasta_result(fasta_dict, file): |
76 line_length = 60 | |
77 with open(file, 'w') as f: | |
78 for header in sorted(fasta_dict): | |
79 f.write('>%s\n' % header) | |
80 for i in range(line_length, len(fasta_dict[header]), line_length): | |
81 f.write('%s\n' % fasta_dict[header][i-line_length:i]) | |
82 f.write('%s\n' % fasta_dict[header][i:]) | |
83 | |
84 | |
85 def __main__(): | |
86 ''' main function ''' | |
87 args = Parser() | |
88 fasta_dict = buid_fasta_dict(args.input) | |
89 if args.query_string: | |
90 query = args.query_string | |
91 elif args.query_file: | |
92 query = getquerylist(args.query_file) | |
93 if args.searchfor == 'with': | |
94 fasta_result_dict = parse_fasta_dict(query, fasta_dict, args.mode) | |
95 elif args.searchfor == 'without': | |
96 fasta_result_dict = complement_fasta_dict(fasta_dict, parse_fasta_dict( | |
97 query, fasta_dict, | |
98 args.mode)) | |
99 write_fasta_result(fasta_result_dict, args.output) | |
100 | |
101 | |
102 if __name__ == '__main__': | |
91 __main__() | 103 __main__() |