Mercurial > repos > yufei-luo > s_mart
comparison commons/core/seq/BioseqUtils.py @ 6:769e306b7933
Change the repository level.
| author | yufei-luo |
|---|---|
| date | Fri, 18 Jan 2013 04:54:14 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 5:ea3082881bf8 | 6:769e306b7933 |
|---|---|
| 1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
| 2 # http://www.inra.fr | |
| 3 # http://urgi.versailles.inra.fr | |
| 4 # | |
| 5 # This software is governed by the CeCILL license under French law and | |
| 6 # abiding by the rules of distribution of free software. You can use, | |
| 7 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 9 # "http://www.cecill.info". | |
| 10 # | |
| 11 # As a counterpart to the access to the source code and rights to copy, | |
| 12 # modify and redistribute granted by the license, users are provided only | |
| 13 # with a limited warranty and the software's author, the holder of the | |
| 14 # economic rights, and the successive licensors have only limited | |
| 15 # liability. | |
| 16 # | |
| 17 # In this respect, the user's attention is drawn to the risks associated | |
| 18 # with loading, using, modifying and/or developing or reproducing the | |
| 19 # software by the user in light of its specific status of free software, | |
| 20 # that may mean that it is complicated to manipulate, and that also | |
| 21 # therefore means that it is reserved for developers and experienced | |
| 22 # professionals having in-depth computer knowledge. Users are therefore | |
| 23 # encouraged to load and test the software's suitability as regards their | |
| 24 # requirements in conditions enabling the security of their systems and/or | |
| 25 # data to be ensured and, more generally, to use and operate it in the | |
| 26 # same conditions as regards security. | |
| 27 # | |
| 28 # The fact that you are presently reading this means that you have had | |
| 29 # knowledge of the CeCILL license and that you accept its terms. | |
| 30 | |
| 31 | |
| 32 import math | |
| 33 import re | |
| 34 from commons.core.seq.Bioseq import Bioseq | |
| 35 | |
| 36 ## Static methods for sequences manipulation | |
| 37 # | |
| 38 class BioseqUtils(object): | |
| 39 | |
| 40 ## Translate a nucleotide sequence | |
| 41 # | |
| 42 # @param bioSeqInstanceToTranslate a bioseq instance to translate | |
| 43 # @param phase a integer : 1 (default), 2 or 3 | |
| 44 # | |
| 45 def translateSequence(bioSeqInstanceToTranslate, phase=1): | |
| 46 pep = "" | |
| 47 #length = math.floor((len(self.sequence)-phase-1)/3)*3 | |
| 48 length = int( math.floor( ( len(bioSeqInstanceToTranslate.sequence )-( phase-1 ) )/3 )*3 ) | |
| 49 #We need capital letters ! | |
| 50 bioSeqInstanceToTranslate.upCase() | |
| 51 sequence = bioSeqInstanceToTranslate.sequence | |
| 52 for i in xrange(phase-1,length,3): | |
| 53 if (sequence[i:i+3] == "TTT" or sequence[i:i+3] == "TTC"): | |
| 54 pep = pep + "F" | |
| 55 elif ( sequence[i:i+3] == "TTA" or sequence[i:i+3] == "TTG" ): | |
| 56 pep = pep + "L" | |
| 57 elif ( sequence[i:i+2] == "CT" ): | |
| 58 pep = pep + "L" | |
| 59 elif ( sequence[i:i+3] == "ATT" or sequence[i:i+3] == "ATC" or sequence[i:i+3] == "ATA" ): | |
| 60 pep = pep + "I" | |
| 61 elif ( sequence[i:i+3] == "ATG" ): | |
| 62 pep = pep + "M" | |
| 63 elif ( sequence[i:i+2] == "GT" ): | |
| 64 pep = pep + "V" | |
| 65 elif ( sequence[i:i+2] == "TC" ) : | |
| 66 pep = pep + "S" | |
| 67 elif ( sequence[i:i+2] == "CC" ) : | |
| 68 pep = pep + "P" | |
| 69 elif ( sequence[i:i+2] == "AC" ) : | |
| 70 pep = pep + "T" | |
| 71 elif ( sequence[i:i+2] == "GC" ) : | |
| 72 pep = pep + "A" | |
| 73 elif ( sequence[i:i+3] == "TAT" or sequence[i:i+3] == "TAC" ) : | |
| 74 pep = pep + "Y" | |
| 75 elif ( sequence[i:i+3] == "TAA" or sequence[i:i+3] == "TAG" ) : | |
| 76 pep = pep + "*" | |
| 77 elif ( sequence[i:i+3] == "CAT" or sequence[i:i+3] == "CAC" ) : | |
| 78 pep = pep + "H" | |
| 79 elif ( sequence[i:i+3] == "CAA" or sequence[i:i+3] == "CAG" ) : | |
| 80 pep = pep + "Q" | |
| 81 elif ( sequence[i:i+3] == "AAT" or sequence[i:i+3] == "AAC" ) : | |
| 82 pep = pep + "N" | |
| 83 elif ( sequence[i:i+3] == "AAA" or sequence[i:i+3] == "AAG" ) : | |
| 84 pep = pep + "K" | |
| 85 elif ( sequence[i:i+3] == "GAT" or sequence[i:i+3] == "GAC" ) : | |
| 86 pep = pep + "D" | |
| 87 elif ( sequence[i:i+3] == "GAA" or sequence[i:i+3] == "GAG" ) : | |
| 88 pep = pep + "E" | |
| 89 elif ( sequence[i:i+3] == "TGT" or sequence[i:i+3] == "TGC" ) : | |
| 90 pep = pep + "C" | |
| 91 elif ( sequence[i:i+3] == "TGA" ) : | |
| 92 pep = pep + "*" | |
| 93 elif ( sequence[i:i+3] == "TGG" ) : | |
| 94 pep = pep + "W" | |
| 95 elif ( sequence[i:i+2] == "CG" ) : | |
| 96 pep = pep + "R" | |
| 97 elif ( sequence[i:i+3] == "AGT" or sequence[i:i+3] == "AGC" ) : | |
| 98 pep = pep + "S" | |
| 99 elif ( sequence[i:i+3] == "AGA" or sequence[i:i+3] == "AGG" ) : | |
| 100 pep = pep + "R" | |
| 101 elif ( sequence[i:i+2] == "GG" ): | |
| 102 pep = pep + "G" | |
| 103 #We don't know the amino acid because we don't have the nucleotide | |
| 104 #R Purine (A or G) | |
| 105 #Y Pyrimidine (C, T, or U) | |
| 106 #M C or A | |
| 107 #K T, U, or G | |
| 108 #W T, U, or A | |
| 109 #S C or G | |
| 110 #B C, T, U, or G (not A) | |
| 111 #D A, T, U, or G (not C) | |
| 112 #H A, T, U, or C (not G) | |
| 113 #V A, C, or G (not T, not U) | |
| 114 #N Unknown nucleotide | |
| 115 elif ( re.search("N|R|Y|M|K|W|S|B|D|H|V", sequence[i:i+3])): | |
| 116 pep = pep + "X" | |
| 117 bioSeqInstanceToTranslate.sequence = pep | |
| 118 | |
| 119 translateSequence = staticmethod(translateSequence) | |
| 120 | |
| 121 ## Add the frame info in header | |
| 122 # | |
| 123 # @param bioSeqInstance a bioseq instance to translate | |
| 124 # @param phase a integer : 1 , 2 or 3 | |
| 125 # | |
| 126 def setFrameInfoOnHeader(bioSeqInstance, phase): | |
| 127 if " " in bioSeqInstance.header: | |
| 128 name, desc = bioSeqInstance.header.split(" ", 1) | |
| 129 name = name + "_" + str(phase) | |
| 130 bioSeqInstance.header = name + " " + desc | |
| 131 else: | |
| 132 bioSeqInstance.header = bioSeqInstance.header + "_" + str(phase) | |
| 133 | |
| 134 setFrameInfoOnHeader = staticmethod(setFrameInfoOnHeader) | |
| 135 | |
| 136 ## Translate a nucleotide sequence for all frames (positives and negatives) | |
| 137 # | |
| 138 # @param bioSeqInstanceToTranslate a bioseq instance to translate | |
| 139 # | |
| 140 def translateInAllFrame( bioSeqInstanceToTranslate ): | |
| 141 positives = BioseqUtils._translateInPositiveFrames( bioSeqInstanceToTranslate ) | |
| 142 negatives = BioseqUtils._translateInNegativeFrames( bioSeqInstanceToTranslate ) | |
| 143 listAll6Frames = [] | |
| 144 listAll6Frames.extend(positives) | |
| 145 listAll6Frames.extend(negatives) | |
| 146 return listAll6Frames | |
| 147 | |
| 148 translateInAllFrame = staticmethod(translateInAllFrame) | |
| 149 | |
| 150 ## Replace the stop codons by X in sequence | |
| 151 # | |
| 152 # @param bioSeqInstance a bioseq instance | |
| 153 # | |
| 154 def replaceStopCodonsByX( bioSeqInstance ): | |
| 155 bioSeqInstance.sequence = bioSeqInstance.sequence.replace ("*", "X") | |
| 156 | |
| 157 replaceStopCodonsByX = staticmethod(replaceStopCodonsByX) | |
| 158 | |
| 159 ## Translate in a list all the frames of all the bioseq of bioseq list | |
| 160 # | |
| 161 # @param bioseqList a list of bioseq instances | |
| 162 # @return a list of translated bioseq instances | |
| 163 # | |
| 164 def translateBioseqListInAllFrames( bioseqList ): | |
| 165 bioseqListInAllFrames = [] | |
| 166 for bioseq in bioseqList : | |
| 167 bioseqListInAllFrames.extend(BioseqUtils.translateInAllFrame(bioseq)) | |
| 168 return bioseqListInAllFrames | |
| 169 | |
| 170 translateBioseqListInAllFrames = staticmethod( translateBioseqListInAllFrames ) | |
| 171 | |
| 172 ## Replace the stop codons by X for each sequence of a bioseq list | |
| 173 # | |
| 174 # @param lBioseqWithStops a list of bioseq instances | |
| 175 # @return a list of bioseq instances | |
| 176 # | |
| 177 def replaceStopCodonsByXInBioseqList ( lBioseqWithStops ): | |
| 178 bioseqListWithStopsreplaced = [] | |
| 179 for bioseq in lBioseqWithStops: | |
| 180 BioseqUtils.replaceStopCodonsByX(bioseq) | |
| 181 bioseqListWithStopsreplaced.append(bioseq) | |
| 182 return bioseqListWithStopsreplaced | |
| 183 | |
| 184 replaceStopCodonsByXInBioseqList = staticmethod( replaceStopCodonsByXInBioseqList ) | |
| 185 | |
| 186 ## Write a list of bioseq instances in a fasta file (60 characters per line) | |
| 187 # | |
| 188 # @param lBioseq a list of bioseq instances | |
| 189 # @param fileName string | |
| 190 # | |
| 191 def writeBioseqListIntoFastaFile( lBioseq, fileName ): | |
| 192 fout = open(fileName, "w") | |
| 193 for bioseq in lBioseq: | |
| 194 bioseq.write(fout) | |
| 195 fout.close() | |
| 196 | |
| 197 writeBioseqListIntoFastaFile = staticmethod( writeBioseqListIntoFastaFile ) | |
| 198 | |
| 199 ## read in a fasta file and create a list of bioseq instances | |
| 200 # | |
| 201 # @param fileName string | |
| 202 # @return a list of bioseq | |
| 203 # | |
| 204 def extractBioseqListFromFastaFile( fileName ): | |
| 205 file = open( fileName ) | |
| 206 lBioseq = [] | |
| 207 currentHeader = "" | |
| 208 while currentHeader != None: | |
| 209 bioseq = Bioseq() | |
| 210 bioseq.read(file) | |
| 211 currentHeader = bioseq.header | |
| 212 if currentHeader != None: | |
| 213 lBioseq.append(bioseq) | |
| 214 return lBioseq | |
| 215 | |
| 216 extractBioseqListFromFastaFile = staticmethod( extractBioseqListFromFastaFile ) | |
| 217 | |
| 218 ## Give the length of a sequence search by name | |
| 219 # | |
| 220 # @param lBioseq a list of bioseq instances | |
| 221 # @param seqName string | |
| 222 # @return an integer | |
| 223 # | |
| 224 def getSeqLengthWithSeqName( lBioseq, seqName ): | |
| 225 length = 0 | |
| 226 for bioseq in lBioseq: | |
| 227 if bioseq.header == seqName: | |
| 228 length = bioseq.getLength() | |
| 229 break | |
| 230 return length | |
| 231 | |
| 232 getSeqLengthWithSeqName = staticmethod( getSeqLengthWithSeqName ) | |
| 233 | |
| 234 def _translateInPositiveFrames( bioSeqInstanceToTranslate ): | |
| 235 seq1 = bioSeqInstanceToTranslate.copyBioseqInstance() | |
| 236 BioseqUtils.setFrameInfoOnHeader(seq1, 1) | |
| 237 BioseqUtils.translateSequence(seq1, 1) | |
| 238 seq2 = bioSeqInstanceToTranslate.copyBioseqInstance() | |
| 239 BioseqUtils.setFrameInfoOnHeader(seq2, 2) | |
| 240 BioseqUtils.translateSequence(seq2, 2) | |
| 241 seq3 = bioSeqInstanceToTranslate.copyBioseqInstance() | |
| 242 BioseqUtils.setFrameInfoOnHeader(seq3, 3) | |
| 243 BioseqUtils.translateSequence(seq3, 3) | |
| 244 return [seq1, seq2, seq3] | |
| 245 | |
| 246 _translateInPositiveFrames = staticmethod( _translateInPositiveFrames ) | |
| 247 | |
| 248 def _translateInNegativeFrames(bioSeqInstanceToTranslate): | |
| 249 seq4 = bioSeqInstanceToTranslate.copyBioseqInstance() | |
| 250 seq4.reverseComplement() | |
| 251 BioseqUtils.setFrameInfoOnHeader(seq4, 4) | |
| 252 BioseqUtils.translateSequence(seq4, 1) | |
| 253 seq5 = bioSeqInstanceToTranslate.copyBioseqInstance() | |
| 254 seq5.reverseComplement() | |
| 255 BioseqUtils.setFrameInfoOnHeader(seq5, 5) | |
| 256 BioseqUtils.translateSequence(seq5, 2) | |
| 257 seq6 = bioSeqInstanceToTranslate.copyBioseqInstance() | |
| 258 seq6.reverseComplement() | |
| 259 BioseqUtils.setFrameInfoOnHeader(seq6, 6) | |
| 260 BioseqUtils.translateSequence(seq6, 3) | |
| 261 return [seq4, seq5, seq6] | |
| 262 | |
| 263 _translateInNegativeFrames = staticmethod( _translateInNegativeFrames ) | |
| 264 | |
| 265 | |
| 266 ## Return a dictionary which keys are sequence headers and values sequence lengths. | |
| 267 # | |
| 268 def getLengthPerSeqFromFile( inFile ): | |
| 269 dHeader2Length = {} | |
| 270 inFileHandler = open( inFile, "r" ) | |
| 271 while True: | |
| 272 iBs = Bioseq() | |
| 273 iBs.read( inFileHandler ) | |
| 274 if iBs.sequence == None: | |
| 275 break | |
| 276 dHeader2Length[ iBs.header ] = iBs.getLength() | |
| 277 inFileHandler.close() | |
| 278 return dHeader2Length | |
| 279 | |
| 280 getLengthPerSeqFromFile = staticmethod( getLengthPerSeqFromFile ) | |
| 281 | |
| 282 | |
| 283 ## Return the list of Bioseq instances, these being sorted in decreasing length | |
| 284 # | |
| 285 def getBioseqListSortedByDecreasingLength( lBioseqs ): | |
| 286 return sorted( lBioseqs, key=lambda iBs: ( iBs.getLength() ), reverse=True ) | |
| 287 | |
| 288 getBioseqListSortedByDecreasingLength = staticmethod( getBioseqListSortedByDecreasingLength ) | |
| 289 | |
| 290 | |
| 291 ## Return the list of Bioseq instances, these being sorted in decreasing length (without gaps) | |
| 292 # | |
| 293 def getBioseqListSortedByDecreasingLengthWithoutGaps( lBioseqs ): | |
| 294 return sorted( lBioseqs, key=lambda iBs: ( len(iBs.sequence.replace("-","")) ), reverse=True ) | |
| 295 | |
| 296 getBioseqListSortedByDecreasingLengthWithoutGaps = staticmethod( getBioseqListSortedByDecreasingLengthWithoutGaps ) |
