Mercurial > repos > galaxyp > filter_by_fasta_ids
annotate filter_by_fasta_ids.py @ 5:dff7df6fcab5 draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit f608f41d45664d04d3124c6ebc791bf8a566b3c5
author | galaxyp |
---|---|
date | Wed, 15 May 2019 03:18:11 -0400 |
parents | cd22452edec2 |
children |
rev | line source |
---|---|
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
1 #!/usr/bin/env python |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
2 """ A script to build specific fasta databases """ |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
3 from __future__ import print_function |
2
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
4 |
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
5 import argparse |
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
6 import re |
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
7 import sys |
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
8 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
9 |
2
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
class Sequence(object):
    """One FASTA record: a header line plus its sequence lines.

    Attributes:
        header: the header line of the record.
        sequence_parts: list of the record's sequence lines, in order.
    """

    def __init__(self, header, sequence_parts):
        self.header = header
        self.sequence_parts = sequence_parts
        # The joined sequence is built on first access and cached here.
        self._sequence = None

    @property
    def sequence(self):
        """Return all sequence parts concatenated into a single string."""
        if self._sequence is None:
            self._sequence = ''.join(self.sequence_parts)
        return self._sequence

    def print(self, fh=None):
        """Write the header and then each sequence part to fh, one per line.

        fh defaults to sys.stdout. The default is resolved at call time
        rather than at definition time so that a redirected or replaced
        sys.stdout is honoured.
        """
        if fh is None:
            fh = sys.stdout
        print(self.header, file=fh)
        for line in self.sequence_parts:
            print(line, file=fh)
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
26 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
27 |
2
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
def FASTAReader_gen(fasta_filename):
    """Yield one Sequence per record of the FASTA file at fasta_filename.

    A record starts at a line beginning with '>' and runs until the next
    such line or the end of the file. Trailing whitespace is stripped from
    the header and from every sequence line; a blank line inside a record
    is kept as an empty sequence part. An empty file yields nothing.

    Raises:
        ValueError: if the first line of the file does not start with '>'.
            An explicit exception is used instead of an assert so the check
            is still performed when Python runs with -O.
    """
    with open(fasta_filename) as fasta_file:
        header = None
        sequence_parts = []
        for line in fasta_file:
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if header is not None:
                    yield Sequence(header, sequence_parts)
                header = line.rstrip()
                sequence_parts = []
            elif header is None:
                # Only reachable on the very first line of the file.
                raise ValueError("FASTA headers must start with >")
            else:
                sequence_parts.append(line.rstrip())
        # Emit the final record, which has no following header to close it.
        if header is not None:
            yield Sequence(header, sequence_parts)
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
42 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
43 |
4
cd22452edec2
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 5e7097242e584763d3a6d86a824ee933500667af
galaxyp
parents:
3
diff
changeset
|
def target_match(targets, search_entry, pattern):
    """Return the ID found in search_entry if it is one of targets.

    search_entry is upper-cased and then searched with pattern (a compiled
    regular expression). The candidate ID is the text of the pattern's last
    group, or the whole match when the pattern defines no groups.

    Returns the candidate when it is contained in targets, otherwise None.
    When the pattern does not match at all, a 'No ID match' notice for the
    upper-cased entry is printed to stdout before returning None.
    """
    upper_entry = search_entry.upper()
    found = pattern.search(upper_entry)
    if not found:
        print('No ID match: %s' % upper_entry, file=sys.stdout)
        return None
    captured = found.groups()
    candidate = captured[-1] if captured else found.group(0)
    return candidate if candidate in targets else None
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
55 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
56 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
def main():
    """Filter a FASTA file by header and/or sequence criteria.

    Command-line options:
        -i / -o: input and output FASTA paths (required).
        -d: optional path receiving the entries that fail the filters.
        --id_list + --pattern: keep entries whose ID, extracted from the
            header by the regex, is listed in the ID file. IDs are compared
            upper-cased, and each listed ID matches at most one entry
            because it is removed from the wanted set once found.
        --header_regexp: keep entries whose header matches the regex
            (mutually exclusive with --id_list).
        --min_length / --max_length: keep entries whose sequence length is
            within the bounds; either bound may be given on its own.
        --sequence_regexp: keep entries whose sequence matches the regex
            (mutually exclusive with --min_length).
        --dedup: drop entries whose sequence was already written.

    A summary of counters is printed to stdout at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', required=True, help='Path to input FASTA file')
    parser.add_argument('-o', required=True, help='Path to output FASTA file')
    parser.add_argument('-d', help='Path to discarded entries file')
    header_criteria = parser.add_mutually_exclusive_group()
    header_criteria.add_argument('--id_list', help='Path to the ID list file')
    parser.add_argument('--pattern', help='regex search pattern for ID in FASTA entry')
    header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match')
    sequence_criteria = parser.add_mutually_exclusive_group()
    sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length')
    sequence_criteria.add_argument('--sequence_regexp', help='Regular expression pattern the sequence should match')
    parser.add_argument('--max_length', type=int, help='Maximum sequence length')
    parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
    options = parser.parse_args()

    pattern = None
    if options.pattern:
        # The ID is taken from a capture group, so the pattern must have one.
        if not re.match('^.*[(](?![?]:).*[)].*$', options.pattern):
            sys.exit('pattern: "%s" did not include capture group "()" in regex ' % options.pattern)
        pattern = re.compile(options.pattern)
    if options.id_list and pattern is None:
        # Fail before any output file is touched instead of crashing on the
        # first FASTA entry with an unbound pattern.
        parser.error('--pattern is required when --id_list is given')

    # The length filter is active when either bound is given; the missing
    # bound is left open.
    check_length = options.min_length is not None or options.max_length is not None
    min_length = 0 if options.min_length is None else options.min_length
    max_length = sys.maxsize if options.max_length is None else options.max_length

    header_regexp = re.compile(options.header_regexp) if options.header_regexp else None
    sequence_regexp = re.compile(options.sequence_regexp) if options.sequence_regexp else None

    work_summary = {'found': 0, 'discarded': 0}

    if options.dedup:
        used_sequences = set()
        work_summary['duplicates'] = 0

    if options.id_list:
        targets = set()
        with open(options.id_list) as f_target:
            for line in f_target:
                targets.add(line.strip().upper())
        work_summary['wanted'] = len(targets)

    homd_db = FASTAReader_gen(options.i)
    discarded = open(options.d, 'w') if options.d else None
    try:
        with open(options.o, "w") as output:
            for entry in homd_db:
                print_entry = True

                # Header criteria: ID list takes precedence over header regex
                # (argparse already prevents both from being given).
                if options.id_list:
                    target_matched_results = target_match(targets, entry.header, pattern)
                    if target_matched_results:
                        targets.remove(target_matched_results)
                    else:
                        print_entry = False
                elif header_regexp is not None:
                    if header_regexp.search(entry.header) is None:
                        print_entry = False

                # Sequence criteria.
                if check_length:
                    if not (min_length <= len(entry.sequence) <= max_length):
                        print_entry = False
                if sequence_regexp is not None:
                    if sequence_regexp.search(entry.sequence) is None:
                        print_entry = False

                if print_entry:
                    if options.dedup:
                        if entry.sequence in used_sequences:
                            # Duplicates are counted but written nowhere.
                            work_summary['duplicates'] += 1
                            continue
                        used_sequences.add(entry.sequence)
                    work_summary['found'] += 1
                    entry.print(output)
                else:
                    work_summary['discarded'] += 1
                    if discarded is not None:
                        entry.print(discarded)
    finally:
        # Close the discarded-entries file even if filtering fails midway.
        if discarded is not None:
            discarded.close()

    for parm, count in work_summary.items():
        print('%s ==> %d' % (parm, count))
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
140 |
2
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
141 |
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
# Run the command-line tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()