Mercurial > repos > cpt > cpt_gbk_to_5col
comparison BIO_FIX_TOPO.py @ 1:1bdd481d5c25 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:42:57 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:66143811fe8a | 1:1bdd481d5c25 |
---|---|
1 import Bio.GenBank | |
2 | |
3 | |
def record_end(self, content):
    """Finish the current record: settle its id and attach its sequence.

    This function is assigned over ``Bio.GenBank._FeatureConsumer.record_end``
    at the bottom of this file, so ``self`` is the consumer instance whose
    record is being completed.

    This version never inspects ``self._seq_type``: no alphabet is chosen,
    no ``ValueError`` is raised for an unrecognised sequence type, and
    ``annotations["topology"]`` is not written here.

    Args:
        self: consumer exposing ``data`` (the record being built),
            ``_seq_data`` (iterable of sequence string chunks) and
            ``_expected_size`` (the expected sequence length, or ``None``).
        content: not used by this implementation.

    Raises:
        AssertionError: if the record has no id although its annotations
            already contain an ``"accessions"`` entry.
    """
    from Bio.Seq import Seq

    try:
        # UnknownSeq no longer exists in newer Biopython releases; importing
        # it unconditionally would make every call to this function fail.
        from Bio.Seq import UnknownSeq
    except ImportError:
        UnknownSeq = None

    record = self.data

    # Try and append the version number to the accession for the full id.
    if not record.id:
        assert "accessions" not in record.annotations, record.annotations[
            "accessions"
        ]
        record.id = record.name  # fall back to the record name
    elif "." not in record.id:
        try:
            record.id += ".%i" % record.annotations["sequence_version"]
        except KeyError:
            # No version annotation available: keep the bare accession.
            pass

    # Assemble the sequence from the chunks collected while parsing.
    sequence = "".join(self._seq_data)
    expected_size = self._expected_size

    # Only warn when there is sequence data that disagrees with the declared
    # length; an empty sequence is handled separately below.
    if expected_size is not None and sequence and expected_size != len(sequence):
        import warnings
        from Bio import BiopythonParserWarning

        warnings.warn(
            "Expected sequence length %i, found %i (%s)."
            % (expected_size, len(sequence), record.id),
            BiopythonParserWarning,
        )

    if not sequence and expected_size:
        # The length is known but no residues were given: store a sequence of
        # undefined content with that length.
        # NOTE: this must read ``_expected_size`` (single underscore). The
        # function is defined outside any class body, so a double-underscore
        # name is not mangled and would raise AttributeError here.
        if UnknownSeq is not None:
            record.seq = UnknownSeq(expected_size)
        else:
            record.seq = Seq(None, length=expected_size)
    else:
        record.seq = Seq(sequence)
83 | |
# Monkey-patch Biopython: install the module-level record_end function as the
# record_end method of Bio.GenBank._FeatureConsumer. This runs as a side
# effect of importing this module.
Bio.GenBank._FeatureConsumer.record_end = record_end