comparison filter_by_fasta_ids.py @ 3:3c623e81be77 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 0556e0fe5aa17c84033a75a45baeb3a4c2b5ff76
author galaxyp
date Fri, 15 Feb 2019 16:38:31 -0500
parents 1bd985f14938
children cd22452edec2
comparison
legend: equal | deleted | inserted | replaced
left column: 2:1bd985f14938 (parent) | right column: 3:3c623e81be77 (this changeset)
39 sequence_parts.append(line.rstrip()) 39 sequence_parts.append(line.rstrip())
40 line = fasta_file.readline() 40 line = fasta_file.readline()
41 yield Sequence(header, sequence_parts) 41 yield Sequence(header, sequence_parts)
42 42
43 43
44 def target_match(targets, header): 44 def target_match(targets, search_entry, pattern='>([^| ]+)'):
45 ''' Matches ''' 45 ''' Matches '''
46 # Remove '>' and initial spaces from the header 46 search_entry = search_entry.upper()
47 header = header[1:].lstrip().upper() 47 m = re.search(pattern,search_entry)
48 # Search for an exact match among the targets 48 if m:
49 if header in targets: 49 target = m.group(len(m.groups()))
50 return header 50 if target in targets:
51 # Try to find an exact match for the first "word" in the header 51 return target
52 header = header.split()[0] 52 else:
53 if header in targets: 53 print( 'No ID match: %s' % search_entry, file=sys.stdout)
54 return header
55 return None 54 return None
56 55
57 56
58 def main(): 57 def main():
59 ''' the main function''' 58 ''' the main function'''
62 parser.add_argument('-i', required=True, help='Path to input FASTA file') 61 parser.add_argument('-i', required=True, help='Path to input FASTA file')
63 parser.add_argument('-o', required=True, help='Path to output FASTA file') 62 parser.add_argument('-o', required=True, help='Path to output FASTA file')
64 parser.add_argument('-d', help='Path to discarded entries file') 63 parser.add_argument('-d', help='Path to discarded entries file')
65 header_criteria = parser.add_mutually_exclusive_group() 64 header_criteria = parser.add_mutually_exclusive_group()
66 header_criteria.add_argument('--id_list', help='Path to the ID list file') 65 header_criteria.add_argument('--id_list', help='Path to the ID list file')
66 parser.add_argument('--pattern', help='regex earch attern for ID in Fasta entry')
67 header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match') 67 header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match')
68 sequence_criteria = parser.add_mutually_exclusive_group() 68 sequence_criteria = parser.add_mutually_exclusive_group()
69 sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length') 69 sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length')
70 sequence_criteria.add_argument('--sequence_regexp', help='Regular expression pattern the header should match') 70 sequence_criteria.add_argument('--sequence_regexp', help='Regular expression pattern the header should match')
71 parser.add_argument('--max_length', type=int, help='Maximum sequence length') 71 parser.add_argument('--max_length', type=int, help='Maximum sequence length')
72 parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences') 72 parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
73 options = parser.parse_args() 73 options = parser.parse_args()
74 74
75
76 if options.pattern:
77 pattern = options.pattern
78 if not re.match('^.*[(](?![?]:).*[)].*$',pattern):
79 print('pattern: "%s" did not include capture group "()" in regex ' % pattern)
80 exit(1)
81
75 if options.min_length is not None and options.max_length is None: 82 if options.min_length is not None and options.max_length is None:
76 options.max_length = sys.maxsize 83 options.max_length = sys.maxsize
77 if options.header_regexp: 84 if options.header_regexp:
78 regexp = re.compile(options.header_regexp) 85 regexp = re.compile(options.header_regexp)
79 if options.sequence_regexp: 86 if options.sequence_regexp:
98 105
99 with open(options.o, "w") as output: 106 with open(options.o, "w") as output:
100 for entry in homd_db: 107 for entry in homd_db:
101 print_entry = True 108 print_entry = True
102 if options.id_list: 109 if options.id_list:
103 target_matched_results = target_match(targets, entry.header) 110 target_matched_results = target_match(targets, entry.header, pattern=pattern)
104 if target_matched_results: 111 if target_matched_results:
105 work_summary['found'] += 1 112 work_summary['found'] += 1
106 targets.remove(target_matched_results) 113 targets.remove(target_matched_results)
107 else: 114 else:
108 print_entry = False 115 print_entry = False
116
109 elif options.header_regexp: 117 elif options.header_regexp:
110 if regexp.search(entry.header) is None: 118 if regexp.search(entry.header) is None:
111 print_entry = False 119 print_entry = False
112 if options.min_length is not None: 120 if options.min_length is not None:
113 sequence_length = len(entry.sequence) 121 sequence_length = len(entry.sequence)