Mercurial > repos > galaxyp > translate_bed
comparison digest.py @ 0:038ecf54cbec draft default tip
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/translate_bed commit 383bb485120a193bcc14f88364e51356d6ede219
| author | galaxyp |
|---|---|
| date | Mon, 22 Jan 2018 13:59:27 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:038ecf54cbec |
|---|---|
| 1 # Copyright 2012 Anton Goloborodko, Lev Levitsky | |
| 2 # | |
| 3 # Licensed under the Apache License, Version 2.0 (the "License"); | |
| 4 # you may not use this file except in compliance with the License. | |
| 5 # You may obtain a copy of the License at | |
| 6 # | |
| 7 # http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 # | |
| 9 # Unless required by applicable law or agreed to in writing, software | |
| 10 # distributed under the License is distributed on an "AS IS" BASIS, | |
| 11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 12 # See the License for the specific language governing permissions and | |
| 13 # limitations under the License. | |
| 14 | |
| 15 import itertools as it | |
| 16 import re | |
| 17 from collections import deque | |
| 18 | |
| 19 | |
def cleave(sequence, rule, missed_cleavages=0, min_length=None):
    """Return the unique peptides obtained by cleaving a polypeptide.

    This is a thin wrapper that collects the list produced by
    :py:func:`_cleave` into a set, so repeated peptides are reported once.

    Parameters
    ----------
    sequence : str
        The polypeptide sequence to cleave.

        .. note::
            One-letter uppercase notation is expected. With any other
            notation some of the rules in :py:data:`expasy_rules` will not
            work as expected.

    rule : str or compiled regex
        Regular expression describing the cleavage site. Preferably it
        matches only the residue whose C-terminal bond is cleaved, with
        every additional requirement expressed through `lookaround
        assertions <http://www.regular-expressions.info/lookaround.html>`_.
        Ready-made rules for popular cleavage agents are available in
        :py:data:`expasy_rules`.
    missed_cleavages : int, optional
        Maximum number of allowed missed cleavages. Defaults to 0.
    min_length : int or None, optional
        Minimum peptide length. Defaults to :py:const:`None` (no limit).

        .. note::
            The check uses plain string length, which is only correct for
            one-letter notation and not for full *modX* sequences.

    Returns
    -------
    out : set
        A set of unique (!) peptides.

    Examples
    --------
    >>> cleave('AKAKBK', expasy_rules['trypsin'], 0) == {'AK', 'BK'}
    True
    >>> expected = {'CK', 'GKYK', 'YKCK', 'GKGK', 'GKYKCK', 'GK', 'GKGKYK',
    ...             'YK'}
    >>> cleave('GKGKYKCK', expasy_rules['trypsin'], 2) == expected
    True

    """
    fragments = _cleave(sequence, rule,
                        missed_cleavages=missed_cleavages,
                        min_length=min_length)
    return set(fragments)
| 67 | |
| 68 | |
| 69 def _cleave(sequence, rule, missed_cleavages=0, min_length=None): | |
| 70 """Like :py:func:`cleave`, but the result is a list. Refer to | |
| 71 :py:func:`cleave` for explanation of parameters. | |
| 72 """ | |
| 73 peptides = [] | |
| 74 ml = missed_cleavages+2 | |
| 75 trange = range(ml) | |
| 76 cleavage_sites = deque([0], maxlen=ml) | |
| 77 cl = 1 | |
| 78 for i in it.chain([x.end() for x in re.finditer(rule, sequence)], | |
| 79 [None]): | |
| 80 cleavage_sites.append(i) | |
| 81 if cl < ml: | |
| 82 cl += 1 | |
| 83 for j in trange[:cl-1]: | |
| 84 seq = sequence[cleavage_sites[j]:cleavage_sites[-1]] | |
| 85 if seq: | |
| 86 if min_length is None or len(seq) >= min_length: | |
| 87 peptides.append(seq) | |
| 88 return peptides | |
| 89 | |
| 90 | |
def num_sites(sequence, rule, **kwargs):
    """Count the number of sites where `sequence` can be cleaved using
    the given `rule` (e.g. number of miscleavages for a peptide).

    The count is the number of peptides returned by :py:func:`_cleave`
    minus one. A match at the very end of the sequence does not split
    anything and is therefore not counted.

    Parameters
    ----------
    sequence : str
        The sequence of a polypeptide.
    rule : str or compiled regex
        A regular expression describing the site of cleavage. It is
        recommended to design the regex so that it matches only the residue
        whose C-terminal bond is to be cleaved. All additional requirements
        should be specified using `lookaround assertions
        <http://www.regular-expressions.info/lookaround.html>`_.
    **kwargs
        Passed unchanged to :py:func:`_cleave`. Leave them out to get the
        plain number of sites: the extra arguments change which peptides
        are listed and therefore the value returned here.

    Returns
    -------
    out : int
        Number of cleavage sites; never negative.
    """
    peptides = _cleave(sequence, rule, **kwargs)
    # No peptides at all (e.g. an empty sequence) means zero sites, not -1.
    return max(len(peptides) - 1, 0)
| 114 | |
| 115 | |
# Cleavage rules keyed by lowercase agent name. Each value is a regular
# expression source string meant to be passed as the `rule` argument of
# `cleave` / `num_sites`; the sequence is cut right after the end of each
# match, and context requirements are expressed with lookbehind/lookahead
# groups so that they are not consumed by the match.
expasy_rules = {
    'arg-c': r'R',
    'asp-n': r'\w(?=D)',
    'bnps-skatole': r'W',
    'caspase 1': r'(?<=[FWYL]\w[HAT])D(?=[^PEDQKR])',
    'caspase 2': r'(?<=DVA)D(?=[^PEDQKR])',
    'caspase 3': r'(?<=DMQ)D(?=[^PEDQKR])',
    'caspase 4': r'(?<=LEV)D(?=[^PEDQKR])',
    'caspase 5': r'(?<=[LW]EH)D',
    'caspase 6': r'(?<=VE[HI])D(?=[^PEDQKR])',
    'caspase 7': r'(?<=DEV)D(?=[^PEDQKR])',
    'caspase 8': r'(?<=[IL]ET)D(?=[^PEDQKR])',
    'caspase 9': r'(?<=LEH)D',
    'caspase 10': r'(?<=IEA)D',
    'chymotrypsin high specificity': r'([FY](?=[^P]))|(W(?=[^MP]))',
    'chymotrypsin low specificity':
        r'([FLY](?=[^P]))|(W(?=[^MP]))|(M(?=[^PY]))|(H(?=[^DMPW]))',
    'clostripain': r'R',
    'cnbr': r'M',
    'enterokinase': r'(?<=[DE]{3})K',
    'factor xa': r'(?<=[AFGILTVM][DE]G)R',
    'formic acid': r'D',
    'glutamyl endopeptidase': r'E',
    'granzyme b': r'(?<=IEP)D',
    'hydroxylamine': r'N(?=G)',
    'iodosobenzoic acid': r'W',
    'lysc': r'K',
    'ntcb': r'\w(?=C)',
    # The two pepsin entries differ only in the residue sets they accept
    # ([FLWY] at pH 1.3 versus [FL] at pH 2.0).
    'pepsin ph1.3': r'((?<=[^HKR][^P])[^R](?=[FLWY][^P]))|'
                    r'((?<=[^HKR][^P])[FLWY](?=\w[^P]))',
    'pepsin ph2.0': r'((?<=[^HKR][^P])[^R](?=[FL][^P]))|'
                    r'((?<=[^HKR][^P])[FL](?=\w[^P]))',
    'proline endopeptidase': r'(?<=[HKR])P(?=[^P])',
    'proteinase k': r'[AEFILTVWY]',
    'staphylococcal peptidase i': r'(?<=[^E])E',
    'thermolysin': r'[^DE](?=[AFILMV])',
    # NOTE(review): the second character class lists 'A' twice; this is
    # redundant but does not change what the pattern matches.
    'thrombin': r'((?<=G)R(?=G))|'
                r'((?<=[AFGILTVM][AFGILTVWA]P)R(?=[^DE][^DE]))',
    # `(?=[^P])` needs a following residue, so a K/R that is the last
    # character of the sequence is not matched by the first alternative.
    'trypsin': r'([KR](?=[^P]))|((?<=W)K(?=P))|((?<=M)R(?=P))'
}
"""
This dict contains regular expressions for cleavage rules of the most
popular proteolytic enzymes. The rules were taken from the
`PeptideCutter tool
<http://ca.expasy.org/tools/peptidecutter/peptidecutter_enzymes.html>`_
at Expasy.
"""
