comparison filter_by_fasta_ids.py @ 4:cd22452edec2 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 5e7097242e584763d3a6d86a824ee933500667af
author galaxyp
date Thu, 18 Apr 2019 02:45:18 -0400
parents 3c623e81be77
children dff7df6fcab5
comparison
equal deleted inserted replaced
3:3c623e81be77 4:cd22452edec2
39 sequence_parts.append(line.rstrip()) 39 sequence_parts.append(line.rstrip())
40 line = fasta_file.readline() 40 line = fasta_file.readline()
41 yield Sequence(header, sequence_parts) 41 yield Sequence(header, sequence_parts)
42 42
43 43
44 def target_match(targets, search_entry, pattern='>([^| ]+)'): 44 def target_match(targets, search_entry, pattern):
45 ''' Matches ''' 45 ''' Matches '''
46 search_entry = search_entry.upper() 46 search_entry = search_entry.upper()
47 m = re.search(pattern,search_entry) 47 m = pattern.search(search_entry)
48 if m: 48 if m:
49 target = m.group(len(m.groups())) 49 target = m.group(len(m.groups()))
50 if target in targets: 50 if target in targets:
51 return target 51 return target
52 else: 52 else:
53 print( 'No ID match: %s' % search_entry, file=sys.stdout) 53 print('No ID match: %s' % search_entry, file=sys.stdout)
54 return None 54 return None
55 55
56 56
57 def main(): 57 def main():
58 ''' the main function'''
59
60 parser = argparse.ArgumentParser() 58 parser = argparse.ArgumentParser()
61 parser.add_argument('-i', required=True, help='Path to input FASTA file') 59 parser.add_argument('-i', required=True, help='Path to input FASTA file')
62 parser.add_argument('-o', required=True, help='Path to output FASTA file') 60 parser.add_argument('-o', required=True, help='Path to output FASTA file')
63 parser.add_argument('-d', help='Path to discarded entries file') 61 parser.add_argument('-d', help='Path to discarded entries file')
64 header_criteria = parser.add_mutually_exclusive_group() 62 header_criteria = parser.add_mutually_exclusive_group()
65 header_criteria.add_argument('--id_list', help='Path to the ID list file') 63 header_criteria.add_argument('--id_list', help='Path to the ID list file')
66 parser.add_argument('--pattern', help='regex earch attern for ID in Fasta entry') 64 parser.add_argument('--pattern', help='regex earch attern for ID in FASTA entry')
67 header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match') 65 header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match')
68 sequence_criteria = parser.add_mutually_exclusive_group() 66 sequence_criteria = parser.add_mutually_exclusive_group()
69 sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length') 67 sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length')
70 sequence_criteria.add_argument('--sequence_regexp', help='Regular expression pattern the header should match') 68 sequence_criteria.add_argument('--sequence_regexp', help='Regular expression pattern the header should match')
71 parser.add_argument('--max_length', type=int, help='Maximum sequence length') 69 parser.add_argument('--max_length', type=int, help='Maximum sequence length')
72 parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences') 70 parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
73 options = parser.parse_args() 71 options = parser.parse_args()
74 72
75
76 if options.pattern: 73 if options.pattern:
77 pattern = options.pattern 74 if not re.match('^.*[(](?![?]:).*[)].*$', options.pattern):
78 if not re.match('^.*[(](?![?]:).*[)].*$',pattern): 75 sys.exit('pattern: "%s" did not include capture group "()" in regex ' % options.pattern)
79 print('pattern: "%s" did not include capture group "()" in regex ' % pattern) 76 pattern = re.compile(options.pattern)
80 exit(1) 77
81
82 if options.min_length is not None and options.max_length is None: 78 if options.min_length is not None and options.max_length is None:
83 options.max_length = sys.maxsize 79 options.max_length = sys.maxsize
84 if options.header_regexp: 80 if options.header_regexp:
85 regexp = re.compile(options.header_regexp) 81 header_regexp = re.compile(options.header_regexp)
86 if options.sequence_regexp: 82 if options.sequence_regexp:
87 regexp = re.compile(options.sequence_regexp) 83 sequence_regexp = re.compile(options.sequence_regexp)
88 84
89 work_summary = {'found': 0} 85 work_summary = {'found': 0, 'discarded': 0}
90 86
91 if options.dedup: 87 if options.dedup:
92 used_sequences = set() 88 used_sequences = set()
93 work_summary['duplicates'] = 0 89 work_summary['duplicates'] = 0
94 90
95 if options.id_list: 91 if options.id_list:
96 targets = [] 92 targets = []
97 with open(options.id_list) as f_target: 93 with open(options.id_list) as f_target:
98 for line in f_target.readlines(): 94 for line in f_target:
99 targets.append(line.strip().upper()) 95 targets.append(line.strip().upper())
100 work_summary['wanted'] = len(targets) 96 work_summary['wanted'] = len(targets)
101 97
102 homd_db = FASTAReader_gen(options.i) 98 homd_db = FASTAReader_gen(options.i)
103 if options.d: 99 if options.d:
105 101
106 with open(options.o, "w") as output: 102 with open(options.o, "w") as output:
107 for entry in homd_db: 103 for entry in homd_db:
108 print_entry = True 104 print_entry = True
109 if options.id_list: 105 if options.id_list:
110 target_matched_results = target_match(targets, entry.header, pattern=pattern) 106 target_matched_results = target_match(targets, entry.header, pattern)
111 if target_matched_results: 107 if target_matched_results:
112 work_summary['found'] += 1
113 targets.remove(target_matched_results) 108 targets.remove(target_matched_results)
114 else: 109 else:
115 print_entry = False 110 print_entry = False
116
117 elif options.header_regexp: 111 elif options.header_regexp:
118 if regexp.search(entry.header) is None: 112 if header_regexp.search(entry.header) is None:
119 print_entry = False 113 print_entry = False
120 if options.min_length is not None: 114 if options.min_length is not None:
121 sequence_length = len(entry.sequence) 115 sequence_length = len(entry.sequence)
122 if not(options.min_length <= sequence_length <= options.max_length): 116 if not(options.min_length <= sequence_length <= options.max_length):
123 print_entry = False 117 print_entry = False
124 elif options.sequence_regexp: 118 elif options.sequence_regexp:
125 if regexp.search(entry.sequence) is None: 119 if sequence_regexp.search(entry.sequence) is None:
126 print_entry = False 120 print_entry = False
127 if print_entry: 121 if print_entry:
128 if options.dedup: 122 if options.dedup:
129 if entry.sequence in used_sequences: 123 if entry.sequence in used_sequences:
130 work_summary['duplicates'] += 1 124 work_summary['duplicates'] += 1
131 continue 125 continue
132 else: 126 else:
133 used_sequences.add(entry.sequence) 127 used_sequences.add(entry.sequence)
128 work_summary['found'] += 1
134 entry.print(output) 129 entry.print(output)
135 elif options.d: 130 else:
136 entry.print(discarded) 131 work_summary['discarded'] += 1
132 if options.d:
133 entry.print(discarded)
134
135 if options.d:
136 discarded.close()
137 137
138 for parm, count in work_summary.items(): 138 for parm, count in work_summary.items():
139 print('%s ==> %d' % (parm, count)) 139 print('%s ==> %d' % (parm, count))
140 140
141 141