Mercurial > repos > galaxyp > filter_by_fasta_ids
comparison filter_by_fasta_ids.py @ 3:3c623e81be77 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 0556e0fe5aa17c84033a75a45baeb3a4c2b5ff76
author | galaxyp |
---|---|
date | Fri, 15 Feb 2019 16:38:31 -0500 |
parents | 1bd985f14938 |
children | cd22452edec2 |
comparison
equal
deleted
inserted
replaced
2:1bd985f14938 | 3:3c623e81be77 |
---|---|
39 sequence_parts.append(line.rstrip()) | 39 sequence_parts.append(line.rstrip()) |
40 line = fasta_file.readline() | 40 line = fasta_file.readline() |
41 yield Sequence(header, sequence_parts) | 41 yield Sequence(header, sequence_parts) |
42 | 42 |
43 | 43 |
44 def target_match(targets, header): | 44 def target_match(targets, search_entry, pattern='>([^| ]+)'): |
45 ''' Matches ''' | 45 ''' Matches ''' |
46 # Remove '>' and initial spaces from the header | 46 search_entry = search_entry.upper() |
47 header = header[1:].lstrip().upper() | 47 m = re.search(pattern,search_entry) |
48 # Search for an exact match among the targets | 48 if m: |
49 if header in targets: | 49 target = m.group(len(m.groups())) |
50 return header | 50 if target in targets: |
51 # Try to find an exact match for the first "word" in the header | 51 return target |
52 header = header.split()[0] | 52 else: |
53 if header in targets: | 53 print( 'No ID match: %s' % search_entry, file=sys.stdout) |
54 return header | |
55 return None | 54 return None |
56 | 55 |
57 | 56 |
58 def main(): | 57 def main(): |
59 ''' the main function''' | 58 ''' the main function''' |
62 parser.add_argument('-i', required=True, help='Path to input FASTA file') | 61 parser.add_argument('-i', required=True, help='Path to input FASTA file') |
63 parser.add_argument('-o', required=True, help='Path to output FASTA file') | 62 parser.add_argument('-o', required=True, help='Path to output FASTA file') |
64 parser.add_argument('-d', help='Path to discarded entries file') | 63 parser.add_argument('-d', help='Path to discarded entries file') |
65 header_criteria = parser.add_mutually_exclusive_group() | 64 header_criteria = parser.add_mutually_exclusive_group() |
66 header_criteria.add_argument('--id_list', help='Path to the ID list file') | 65 header_criteria.add_argument('--id_list', help='Path to the ID list file') |
| | 66 parser.add_argument('--pattern', help='regex earch attern for ID in Fasta entry') |
67 header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match') | 67 header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match') |
68 sequence_criteria = parser.add_mutually_exclusive_group() | 68 sequence_criteria = parser.add_mutually_exclusive_group() |
69 sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length') | 69 sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length') |
70 sequence_criteria.add_argument('--sequence_regexp', help='Regular expression pattern the header should match') | 70 sequence_criteria.add_argument('--sequence_regexp', help='Regular expression pattern the header should match') |
71 parser.add_argument('--max_length', type=int, help='Maximum sequence length') | 71 parser.add_argument('--max_length', type=int, help='Maximum sequence length') |
72 parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences') | 72 parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences') |
73 options = parser.parse_args() | 73 options = parser.parse_args() |
74 | 74 |
| | 75 |
| | 76 if options.pattern: |
| | 77 pattern = options.pattern |
| | 78 if not re.match('^.*[(](?![?]:).*[)].*$',pattern): |
| | 79 print('pattern: "%s" did not include capture group "()" in regex ' % pattern) |
| | 80 exit(1) |
| | 81 |
75 if options.min_length is not None and options.max_length is None: | 82 if options.min_length is not None and options.max_length is None: |
76 options.max_length = sys.maxsize | 83 options.max_length = sys.maxsize |
77 if options.header_regexp: | 84 if options.header_regexp: |
78 regexp = re.compile(options.header_regexp) | 85 regexp = re.compile(options.header_regexp) |
79 if options.sequence_regexp: | 86 if options.sequence_regexp: |
98 | 105 |
99 with open(options.o, "w") as output: | 106 with open(options.o, "w") as output: |
100 for entry in homd_db: | 107 for entry in homd_db: |
101 print_entry = True | 108 print_entry = True |
102 if options.id_list: | 109 if options.id_list: |
103 target_matched_results = target_match(targets, entry.header) | 110 target_matched_results = target_match(targets, entry.header, pattern=pattern) |
104 if target_matched_results: | 111 if target_matched_results: |
105 work_summary['found'] += 1 | 112 work_summary['found'] += 1 |
106 targets.remove(target_matched_results) | 113 targets.remove(target_matched_results) |
107 else: | 114 else: |
108 print_entry = False | 115 print_entry = False |
| | 116 |
109 elif options.header_regexp: | 117 elif options.header_regexp: |
110 if regexp.search(entry.header) is None: | 118 if regexp.search(entry.header) is None: |
111 print_entry = False | 119 print_entry = False |
112 if options.min_length is not None: | 120 if options.min_length is not None: |
113 sequence_length = len(entry.sequence) | 121 sequence_length = len(entry.sequence) |