Mercurial > repos > cpt > cpt_gbk_to_5col
comparison BIO_FIX_TOPO.py @ 1:1bdd481d5c25 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt |
|---|---|
| date | Mon, 05 Jun 2023 02:42:57 +0000 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 0:66143811fe8a | 1:1bdd481d5c25 |
|---|---|
import Bio.GenBank


def record_end(self, content):
    """Finish a parsed record: settle its id and attach its sequence.

    This function is installed over ``Bio.GenBank._FeatureConsumer.record_end``
    by the assignment at the bottom of this module, so ``self`` is the
    consumer instance whose record is being completed.

    Compared with the code it replaces, the sequence-type inspection
    (alphabet selection, the ``ValueError`` for unrecognised sequence types,
    and the ``topology`` annotation) is left out on purpose; see the comment
    in the body.

    Args:
        self: Consumer instance. Reads ``self.data``, ``self._seq_data`` and
            ``self._expected_size``; writes ``self.data.id`` and
            ``self.data.seq``.
        content: Unused here; kept so the signature matches the method being
            replaced.
    """
    from Bio.Seq import Seq

    # UnknownSeq is imported lazily and optionally so that this function
    # keeps working for records that do have sequence data even when the
    # installed Biopython no longer provides the class.
    # NOTE(review): confirm against the Biopython version pinned by the tool.
    try:
        from Bio.Seq import UnknownSeq
    except ImportError:
        UnknownSeq = None

    # Try and append the version number to the accession for the full id.
    if not self.data.id:
        assert "accessions" not in self.data.annotations, self.data.annotations[
            "accessions"
        ]
        self.data.id = self.data.name  # Good fall back?
    elif self.data.id.count(".") == 0:
        try:
            self.data.id += ".%i" % self.data.annotations["sequence_version"]
        except KeyError:
            # No version recorded for this record; keep the bare accession.
            pass

    # Assemble the sequence from the chunks collected while parsing.
    sequence = "".join(self._seq_data)

    # Warn, but do not fail, when the declared length disagrees with the
    # data actually read. An empty sequence is handled separately below.
    if (
        self._expected_size is not None
        and len(sequence) != 0
        and self._expected_size != len(sequence)
    ):
        import warnings
        from Bio import BiopythonParserWarning

        warnings.warn(
            "Expected sequence length %i, found %i (%s)."
            % (self._expected_size, len(sequence), self.data.id),
            BiopythonParserWarning,
        )

    # The original version of this method inspected ``self._seq_type`` here
    # to pick an alphabet, raised ValueError ("Could not determine alphabet
    # for seq_type ...") for unrecognised types, and stored a "topology"
    # annotation ("circular" / "linear"). That whole step is intentionally
    # disabled in this replacement, so unusual sequence types no longer abort
    # parsing and no topology annotation is written by this method.

    # NOTE: this must be ``_expected_size`` (single underscore). Because the
    # function is defined outside a class body, a double-underscore name is
    # not mangled and would look up a non-existent attribute.
    if not sequence and self._expected_size:
        # Length is known but no residues were given: store a placeholder
        # sequence of the declared length.
        if UnknownSeq is not None:
            self.data.seq = UnknownSeq(self._expected_size)
        else:
            self.data.seq = Seq(None, length=self._expected_size)
    else:
        self.data.seq = Seq(sequence)


# Monkey-patch Biopython's GenBank feature consumer so that parsing uses the
# replacement defined above.
Bio.GenBank._FeatureConsumer.record_end = record_end
