Mercurial > repos > galaxyp > filter_by_fasta_ids
annotate filter_by_fasta_ids.py @ 3:3c623e81be77 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 0556e0fe5aa17c84033a75a45baeb3a4c2b5ff76
author | galaxyp |
---|---|
date | Fri, 15 Feb 2019 16:38:31 -0500 |
parents | 1bd985f14938 |
children | cd22452edec2 |
rev | line source |
---|---|
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
1 #!/usr/bin/env python |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
2 """ A script to build specific fasta databases """ |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
3 from __future__ import print_function |
2
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
4 |
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
5 import argparse |
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
6 import re |
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
7 import sys |
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
8 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
9 |
2
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
class Sequence(object):
    """A single FASTA record: its header line plus its sequence lines.

    The sequence is kept as the list of lines it was given, so the record
    can be written back out line by line; the joined string is built lazily
    on first access and cached.
    """

    def __init__(self, header, sequence_parts):
        """Store the header string and the list of sequence line strings."""
        self.header = header
        self.sequence_parts = sequence_parts
        # Cache for the joined sequence; filled on first read of `sequence`.
        self._sequence = None

    @property
    def sequence(self):
        """Return all sequence parts concatenated into one string (cached)."""
        if self._sequence is None:
            self._sequence = ''.join(self.sequence_parts)
        return self._sequence

    def print(self, fh=None):
        """Write the header, then each sequence part on its own line, to fh.

        fh: a writable text stream. When omitted, the current ``sys.stdout``
        is used. It is looked up at call time rather than bound as a default
        argument, so redirecting ``sys.stdout`` after this class was defined
        is honoured.
        """
        if fh is None:
            fh = sys.stdout
        print(self.header, file=fh)
        for line in self.sequence_parts:
            print(line, file=fh)
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
26 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
27 |
2
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
def FASTAReader_gen(fasta_filename):
    """Yield one Sequence per record of the FASTA file at fasta_filename.

    A record is a line starting with '>' followed by every line up to (but
    not including) the next line starting with '>'. Header and sequence
    lines are right-stripped; sequence lines are kept as separate parts.
    An empty file yields nothing.

    Raises:
        AssertionError: if a record does not begin with a '>' line (in
            practice: the first line of the file is not a header).
    """
    with open(fasta_filename) as fasta_file:
        line = fasta_file.readline()
        while line:
            # Raised explicitly instead of using an `assert` statement so the
            # validation still runs when Python is started with -O; the
            # exception type is kept so existing callers see no difference.
            if not line.startswith('>'):
                raise AssertionError("FASTA headers must start with >")
            header = line.rstrip()
            sequence_parts = []
            line = fasta_file.readline()
            # readline() returns '' only at EOF, so line[0] is always safe.
            while line and line[0] != '>':
                sequence_parts.append(line.rstrip())
                line = fasta_file.readline()
            yield Sequence(header, sequence_parts)
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
42 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
43 |
3
3c623e81be77
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 0556e0fe5aa17c84033a75a45baeb3a4c2b5ff76
galaxyp
parents:
2
diff
changeset
|
def target_match(targets, search_entry, pattern='>([^| ]+)'):
    """Extract the ID from a FASTA header and look it up in targets.

    targets: container of upper-cased IDs (anything supporting ``in``).
    search_entry: the FASTA header line.
    pattern: regular expression locating the ID; the LAST capture group is
        taken as the ID (the whole match if the pattern has no group).

    The pattern is tried against the header as written first, so patterns
    containing lower-case literals (e.g. ``>sp\\|([^|]+)``) can match. If
    that fails it is tried against the upper-cased header, which is the only
    form earlier versions searched. The captured ID is upper-cased before
    the lookup, making the comparison with targets case-insensitive.

    Returns the upper-cased ID when it is in targets, otherwise None. When
    the pattern does not match at all, a 'No ID match' line is printed to
    stdout.
    """
    upper_entry = search_entry.upper()
    m = re.search(pattern, search_entry) or re.search(pattern, upper_entry)
    if m is None:
        print('No ID match: %s' % upper_entry, file=sys.stdout)
        return None
    target = m.group(len(m.groups()))
    # An optional last group that took no part in the match yields None.
    if target is None:
        return None
    target = target.upper()
    if target in targets:
        return target
    return None
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
55 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
56 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
def _build_parser():
    """Build the argparse parser describing the script's command line."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', required=True, help='Path to input FASTA file')
    parser.add_argument('-o', required=True, help='Path to output FASTA file')
    parser.add_argument('-d', help='Path to discarded entries file')
    header_criteria = parser.add_mutually_exclusive_group()
    header_criteria.add_argument('--id_list', help='Path to the ID list file')
    parser.add_argument('--pattern', help='regex search pattern for ID in Fasta entry')
    header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match')
    sequence_criteria = parser.add_mutually_exclusive_group()
    sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length')
    sequence_criteria.add_argument('--sequence_regexp', help='Regular expression pattern the sequence should match')
    parser.add_argument('--max_length', type=int, help='Maximum sequence length')
    parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
    return parser


def main():
    """Filter the input FASTA file according to the command-line options.

    Entries that pass every active filter are written to the -o file;
    entries rejected by a filter are written to the -d file when one is
    given. Supported filters: an ID list (--id_list, with the ID located by
    --pattern), a header regex, a sequence length range, a sequence regex
    and sequence de-duplication. A "name ==> count" summary is printed to
    stdout at the end.

    Exits with status 1 if --pattern contains no capture group.
    """
    from collections import Counter

    options = _build_parser().parse_args()

    # Default ID pattern, used when --pattern is not supplied; without it the
    # --id_list branch below would reference an unbound name.
    pattern = '>([^| ]+)'
    if options.pattern:
        pattern = options.pattern
        if not re.match('^.*[(](?![?]:).*[)].*$', pattern):
            print('pattern: "%s" did not include capture group "()" in regex ' % pattern)
            sys.exit(1)

    # Either bound alone activates the length filter; the missing bound is
    # treated as open-ended.
    filter_by_length = options.min_length is not None or options.max_length is not None
    min_length = 0 if options.min_length is None else options.min_length
    max_length = sys.maxsize if options.max_length is None else options.max_length

    # Header and sequence regexes are compiled into separate names so that
    # supplying both does not make one overwrite the other.
    header_rx = re.compile(options.header_regexp) if options.header_regexp else None
    sequence_rx = re.compile(options.sequence_regexp) if options.sequence_regexp else None

    work_summary = {'found': 0}

    used_sequences = set()
    if options.dedup:
        work_summary['duplicates'] = 0

    # Multiset of wanted IDs: each line of the ID file may be consumed by one
    # matching entry, as with the list this replaces, but lookup and removal
    # are O(1).
    targets = Counter()
    if options.id_list:
        with open(options.id_list) as f_target:
            targets.update(line.strip().upper() for line in f_target)
        work_summary['wanted'] = sum(targets.values())

    discarded = open(options.d, 'w') if options.d else None
    try:
        with open(options.o, "w") as output:
            for entry in FASTAReader_gen(options.i):
                print_entry = True

                if options.id_list:
                    matched = target_match(targets, entry.header, pattern=pattern)
                    if matched:
                        work_summary['found'] += 1
                        targets[matched] -= 1
                        if targets[matched] <= 0:
                            # Drop exhausted IDs so `in targets` turns False.
                            del targets[matched]
                    else:
                        print_entry = False
                elif header_rx is not None:
                    if header_rx.search(entry.header) is None:
                        print_entry = False

                if filter_by_length:
                    if not (min_length <= len(entry.sequence) <= max_length):
                        print_entry = False
                if sequence_rx is not None:
                    if sequence_rx.search(entry.sequence) is None:
                        print_entry = False

                if print_entry:
                    if options.dedup:
                        if entry.sequence in used_sequences:
                            # Duplicates are counted and dropped; they are
                            # not written to the discarded file.
                            work_summary['duplicates'] += 1
                            continue
                        used_sequences.add(entry.sequence)
                    entry.print(output)
                elif discarded is not None:
                    entry.print(discarded)
    finally:
        if discarded is not None:
            discarded.close()

    for parm, count in work_summary.items():
        print('%s ==> %d' % (parm, count))
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
140 |
2
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
141 |
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
# Run the filter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()