Mercurial > repos > galaxyp > filter_by_fasta_ids
annotate filter_by_fasta_ids.py @ 2:1bd985f14938 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
author | galaxyp |
---|---|
date | Sat, 28 Apr 2018 03:49:28 -0400 |
parents | 8d15aebf55fd |
children | 3c623e81be77 |
rev | line source |
---|---|
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
1 #!/usr/bin/env python |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
2 """ A script to build specific fasta databases """ |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
3 from __future__ import print_function |
2
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
4 |
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
5 import argparse |
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
6 import re |
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
7 import sys |
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
8 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
9 |
2
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
class Sequence(object):
    """A single FASTA record: one header line plus its sequence lines."""

    def __init__(self, header, sequence_parts):
        # Header line exactly as supplied by the caller.
        self.header = header
        # Sequence lines in the order supplied by the caller.
        self.sequence_parts = sequence_parts
        # Cache for the joined sequence; filled on first access of
        # the ``sequence`` property.
        self._sequence = None

    @property
    def sequence(self):
        """Return all sequence lines joined into one string.

        The joined string is built on first access and cached, so later
        changes to ``sequence_parts`` are not reflected.
        """
        if self._sequence is None:
            self._sequence = ''.join(self.sequence_parts)
        return self._sequence

    def print(self, fh=None):
        """Write the record to *fh*: the header, then one line per part.

        :param fh: writable text stream; when None the record goes to
            whatever ``sys.stdout`` is at call time.
        """
        # Resolve the default at call time rather than in the signature,
        # so a redirected sys.stdout is honoured.
        if fh is None:
            fh = sys.stdout
        print(self.header, file=fh)
        for line in self.sequence_parts:
            print(line, file=fh)
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
26 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
27 |
2
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
def FASTAReader_gen(fasta_filename):
    """Yield one Sequence per record of the FASTA file *fasta_filename*.

    A record starts at a line beginning with '>' and runs until the next
    such line or the end of the file. Header and sequence lines are
    passed to Sequence with trailing whitespace removed. An empty file
    yields nothing.

    :raises AssertionError: if the file's first line does not start
        with '>'.
    """
    with open(fasta_filename) as fasta_file:
        header = None
        sequence_parts = []
        for line in fasta_file:
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if header is not None:
                    yield Sequence(header, sequence_parts)
                header = line.rstrip()
                sequence_parts = []
            elif header is None:
                # Raised explicitly instead of via ``assert`` so the check
                # still runs under ``python -O``; the exception type is
                # kept for existing callers.
                raise AssertionError("FASTA headers must start with >")
            else:
                sequence_parts.append(line.rstrip())
        # Emit the final record, which has no following header line.
        if header is not None:
            yield Sequence(header, sequence_parts)
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
42 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
43 |
2
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
def target_match(targets, header):
    """Return the entry of *targets* that matches *header*, else None.

    The header is compared case-insensitively by upper-casing it, so
    *targets* is expected to hold upper-case strings. The whole header
    (without the leading '>' and initial spaces) is tried first, then
    only its first whitespace-delimited word.

    :param targets: container of upper-cased IDs supporting ``in``.
    :param header: FASTA header line, including the leading '>'.
    :returns: the matching upper-cased string, or None when there is no
        match or the header is empty after the '>'.
    """
    # Remove '>' and initial spaces from the header
    normalized = header[1:].lstrip().upper()
    if not normalized:
        # Nothing to match on; also avoids indexing an empty split below.
        return None
    # Search for an exact match among the targets
    if normalized in targets:
        return normalized
    # Try to find an exact match for the first "word" in the header
    first_word = normalized.split()[0]
    if first_word in targets:
        return first_word
    return None
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
56 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
57 |
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
def main():
    """Filter a FASTA file from the command line.

    Reads the FASTA file given with -i and writes the entries that pass
    every requested filter to the file given with -o. Entries that fail
    a filter are written to the -d file when one is given. A summary of
    the counters collected along the way is printed to stdout.

    Header filters (mutually exclusive): --id_list, --header_regexp.
    Sequence filters (mutually exclusive): --min_length, --sequence_regexp.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', required=True, help='Path to input FASTA file')
    parser.add_argument('-o', required=True, help='Path to output FASTA file')
    parser.add_argument('-d', help='Path to discarded entries file')
    header_criteria = parser.add_mutually_exclusive_group()
    header_criteria.add_argument('--id_list', help='Path to the ID list file')
    header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match')
    sequence_criteria = parser.add_mutually_exclusive_group()
    sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length')
    # Help text corrected: this pattern is applied to the sequence.
    sequence_criteria.add_argument('--sequence_regexp', help='Regular expression pattern the sequence should match')
    parser.add_argument('--max_length', type=int, help='Maximum sequence length')
    parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
    options = parser.parse_args()

    # NOTE: --max_length is only consulted when --min_length is given.
    if options.min_length is not None and options.max_length is None:
        options.max_length = sys.maxsize

    # The two patterns live in different exclusive groups and may both be
    # given, so each needs its own compiled object.
    header_regexp = None
    if options.header_regexp:
        header_regexp = re.compile(options.header_regexp)
    sequence_regexp = None
    if options.sequence_regexp:
        sequence_regexp = re.compile(options.sequence_regexp)

    work_summary = {'found': 0}

    if options.dedup:
        used_sequences = set()
        work_summary['duplicates'] = 0

    if options.id_list:
        # Kept as a list: each listed ID is removed once it has matched,
        # so an ID listed once matches at most one entry.
        with open(options.id_list) as f_target:
            targets = [line.strip().upper() for line in f_target]
        work_summary['wanted'] = len(targets)

    homd_db = FASTAReader_gen(options.i)
    discarded = open(options.d, 'w') if options.d else None

    try:
        with open(options.o, "w") as output:
            for entry in homd_db:
                print_entry = True

                # Header criteria.
                if options.id_list:
                    target_matched_results = target_match(targets, entry.header)
                    if target_matched_results:
                        work_summary['found'] += 1
                        targets.remove(target_matched_results)
                    else:
                        print_entry = False
                elif header_regexp is not None:
                    if header_regexp.search(entry.header) is None:
                        print_entry = False

                # Sequence criteria.
                if options.min_length is not None:
                    sequence_length = len(entry.sequence)
                    if not (options.min_length <= sequence_length <= options.max_length):
                        print_entry = False
                elif sequence_regexp is not None:
                    if sequence_regexp.search(entry.sequence) is None:
                        print_entry = False

                if print_entry:
                    if options.dedup:
                        if entry.sequence in used_sequences:
                            # Duplicates are counted only; they go to
                            # neither the output nor the discard file.
                            work_summary['duplicates'] += 1
                            continue
                        used_sequences.add(entry.sequence)
                    entry.print(output)
                elif discarded is not None:
                    entry.print(discarded)
    finally:
        # Close the discard file even if reading or writing fails.
        if discarded is not None:
            discarded.close()

    for parm, count in work_summary.items():
        print('%s ==> %d' % (parm, count))
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
132 |
2
1bd985f14938
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 2bc87e917c91a3b7a43996a0f3752b8992c0c749
galaxyp
parents:
1
diff
changeset
|
133 |
1
8d15aebf55fd
planemo upload commit 88309fbfadbafe82f2d8fb7b96468799f2421e30
galaxyp
parents:
diff
changeset
|
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()