annotate filter_by_fasta_ids.py @ 5:dff7df6fcab5 draft default tip

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit f608f41d45664d04d3124c6ebc791bf8a566b3c5
author galaxyp
date Wed, 15 May 2019 03:18:11 -0400
parents cd22452edec2
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
1 #!/usr/bin/env python
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
2 """ A script to build specific fasta databases """
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
3 from __future__ import print_function
2
1bd985f14938 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents: 1
diff changeset
4
1bd985f14938 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents: 1
diff changeset
5 import argparse
1bd985f14938 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents: 1
diff changeset
6 import re
1bd985f14938 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents: 1
diff changeset
7 import sys
1
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
8
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
9
2
1bd985f14938 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents: 1
diff changeset
class Sequence(object):
    """One FASTA record: a header plus the lines making up its sequence.

    The sequence is stored as the list of parts it was built from so that
    ``print`` can write it back one part per line; the joined string is
    only built (and then cached) the first time ``sequence`` is read.
    """

    def __init__(self, header, sequence_parts):
        """Store the record.

        header -- header text, written first by ``print``
        sequence_parts -- list of strings; joined to form ``sequence`` and
                          written one per line by ``print``
        """
        self.header = header
        self.sequence_parts = sequence_parts
        # Cache for the joined sequence; filled lazily by ``sequence``.
        self._sequence = None

    @property
    def sequence(self):
        """Return all sequence parts joined into a single string (cached)."""
        if self._sequence is None:
            self._sequence = ''.join(self.sequence_parts)
        return self._sequence

    def print(self, fh=None):
        """Write the header, then each sequence part on its own line.

        fh -- writable file object. When omitted, the *current*
              ``sys.stdout`` is used; it is looked up at call time so that
              a stdout redirected after import is honoured.
        """
        if fh is None:
            fh = sys.stdout
        print(self.header, file=fh)
        for line in self.sequence_parts:
            print(line, file=fh)
1
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
26
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
27
2
1bd985f14938 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents: 1
diff changeset
def FASTAReader_gen(fasta_filename):
    """Yield a ``Sequence`` for each record of the FASTA file.

    fasta_filename -- path of the FASTA file to read

    A record starts at a line beginning with '>' and runs up to the next
    such line (or end of file). Header and sequence lines are stored with
    trailing whitespace removed. Blank lines before the first header are
    skipped; blank lines inside a record are kept as empty sequence parts.

    Raises ValueError (on first iteration) when the first non-blank line
    is not a header. An explicit raise is used instead of ``assert`` so
    the check is still performed when Python runs with -O.
    """
    with open(fasta_filename) as fasta_file:
        line = fasta_file.readline()
        # Tolerate empty lines ahead of the first record.
        while line and not line.strip():
            line = fasta_file.readline()
        if line and not line.startswith('>'):
            raise ValueError("FASTA headers must start with >")
        # From here on ``line`` is either '' (EOF) or a header line: the
        # inner loop below only stops at EOF or at the next '>' line.
        while line:
            header = line.rstrip()
            sequence_parts = []
            line = fasta_file.readline()
            while line and line[0] != '>':
                sequence_parts.append(line.rstrip())
                line = fasta_file.readline()
            yield Sequence(header, sequence_parts)
1
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
42
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
43
4
cd22452edec2 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 5e7097242e584763d3a6d86a824ee933500667af
galaxyp
parents: 3
diff changeset
def target_match(targets, search_entry, pattern):
    """Extract an ID from a header and return it if it is a wanted target.

    targets -- container of wanted IDs (tested with ``in``)
    search_entry -- header text; upper-cased before it is searched
    pattern -- compiled regular expression; the value of its last group
               (or the whole match when it has no groups) is the ID

    Returns the extracted ID when it is in ``targets``, otherwise None.
    When the pattern does not match at all, a 'No ID match' line with the
    upper-cased header is printed to stdout.
    """
    header = search_entry.upper()
    found = pattern.search(header)
    if found is None:
        print('No ID match: %s' % header, file=sys.stdout)
        return None
    # The last capture group carries the ID.
    candidate = found.group(len(found.groups()))
    return candidate if candidate in targets else None
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
55
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
56
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
def main():
    """Command-line entry point: filter a FASTA file and print a summary.

    Entries read from ``-i`` that pass every requested filter are written
    to ``-o``; rejected entries are written to ``-d`` when it is given.

    Header filters (mutually exclusive):
      --id_list        keep entries whose ID, extracted with --pattern, is
                       listed in the file (IDs compared upper-cased; each
                       listed ID is consumed by its first match)
      --header_regexp  keep entries whose header matches the expression
    Sequence filters:
      --min_length / --max_length  inclusive length bounds; either one
                       alone is enough to enable the length filter
      --sequence_regexp  keep entries whose sequence matches
    --dedup drops entries whose sequence was already written; these are
    counted as duplicates and are not written to the discarded file.

    Counts ('found', 'discarded', and where applicable 'duplicates' and
    'wanted') are printed to stdout at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', required=True, help='Path to input FASTA file')
    parser.add_argument('-o', required=True, help='Path to output FASTA file')
    parser.add_argument('-d', help='Path to discarded entries file')
    header_criteria = parser.add_mutually_exclusive_group()
    header_criteria.add_argument('--id_list', help='Path to the ID list file')
    parser.add_argument('--pattern', help='regex search pattern for ID in FASTA entry')
    header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match')
    sequence_criteria = parser.add_mutually_exclusive_group()
    sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length')
    sequence_criteria.add_argument('--sequence_regexp', help='Regular expression pattern the sequence should match')
    parser.add_argument('--max_length', type=int, help='Maximum sequence length')
    parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
    options = parser.parse_args()

    # ID-list filtering extracts the ID with the compiled pattern; without
    # one the run would fail on the first entry, so reject it up front.
    if options.id_list and not options.pattern:
        parser.error('--pattern is required when --id_list is given')

    pattern = None
    if options.pattern:
        # Require at least one capturing group: the ID is taken from it.
        if not re.match('^.*[(](?![?]:).*[)].*$', options.pattern):
            sys.exit('pattern: "%s" did not include capture group "()" in regex ' % options.pattern)
        pattern = re.compile(options.pattern)

    # Either bound on its own enables the length filter; the missing one
    # defaults to "no limit".
    check_length = options.min_length is not None or options.max_length is not None
    min_length = options.min_length if options.min_length is not None else 0
    max_length = options.max_length if options.max_length is not None else sys.maxsize

    header_regexp = re.compile(options.header_regexp) if options.header_regexp else None
    sequence_regexp = re.compile(options.sequence_regexp) if options.sequence_regexp else None

    work_summary = {'found': 0, 'discarded': 0}

    if options.dedup:
        used_sequences = set()
        work_summary['duplicates'] = 0

    if options.id_list:
        targets = set()
        with open(options.id_list) as f_target:
            for line in f_target:
                wanted_id = line.strip().upper()
                # Blank lines are not IDs and must not inflate 'wanted'.
                if wanted_id:
                    targets.add(wanted_id)
        work_summary['wanted'] = len(targets)

    discarded = open(options.d, 'w') if options.d else None
    try:
        with open(options.o, "w") as output:
            for entry in FASTAReader_gen(options.i):
                print_entry = True

                if options.id_list:
                    matched_id = target_match(targets, entry.header, pattern)
                    if matched_id:
                        # Each wanted ID selects only its first entry.
                        targets.remove(matched_id)
                    else:
                        print_entry = False
                elif header_regexp is not None:
                    if header_regexp.search(entry.header) is None:
                        print_entry = False

                if check_length:
                    if not (min_length <= len(entry.sequence) <= max_length):
                        print_entry = False
                if sequence_regexp is not None:
                    if sequence_regexp.search(entry.sequence) is None:
                        print_entry = False

                if not print_entry:
                    work_summary['discarded'] += 1
                    if discarded is not None:
                        entry.print(discarded)
                    continue

                if options.dedup:
                    if entry.sequence in used_sequences:
                        work_summary['duplicates'] += 1
                        continue
                    used_sequences.add(entry.sequence)
                work_summary['found'] += 1
                entry.print(output)
    finally:
        # Close the discarded file even when filtering fails part-way.
        if discarded is not None:
            discarded.close()

    for parm, count in work_summary.items():
        print('%s ==> %d' % (parm, count))
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
140
2
1bd985f14938 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents: 1
diff changeset
141
1
8d15aebf55fd planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff changeset
# Run the filter only when executed as a script, not when imported.
if __name__ == "__main__":
    sys.exit(main())