annotate cpt_gbk_to_5col/BIO_FIX_TOPO.py @ 0:66143811fe8a draft

Uploaded
author cpt
date Fri, 17 Jun 2022 12:45:08 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
66143811fe8a Uploaded
cpt
parents:
diff changeset
1 import Bio.GenBank
66143811fe8a Uploaded
cpt
parents:
diff changeset
2
66143811fe8a Uploaded
cpt
parents:
diff changeset
3
66143811fe8a Uploaded
cpt
parents:
diff changeset
def record_end(self, content):
    """Finish the current record: settle its id and attach its sequence.

    Written to be installed as ``Bio.GenBank._FeatureConsumer.record_end``
    (see the assignment below this function).  Unlike the code it was derived
    from, it does no alphabet detection and does not set
    ``annotations["topology"]``; the disabled logic raised ``ValueError`` for
    sequence types it did not recognise.
    NOTE(review): presumably that ValueError is why it was disabled - confirm.

    Args:
        self: The consumer instance.  Reads ``self.data``, ``self._seq_data``
            and ``self._expected_size``; writes ``self.data.id`` and
            ``self.data.seq``.
        content: Unused; accepted only to keep the method signature.

    Raises:
        AssertionError: If the record has no id but its annotations contain
            an ``"accessions"`` entry.
    """
    from Bio.Seq import Seq

    try:
        # Newer Biopython releases no longer provide UnknownSeq; importing it
        # unconditionally would break this patch for every record.
        from Bio.Seq import UnknownSeq
    except ImportError:
        UnknownSeq = None

    # Try and append the version number to the accession for the full id.
    if not self.data.id:
        assert "accessions" not in self.data.annotations, self.data.annotations[
            "accessions"
        ]
        self.data.id = self.data.name  # Good fall back?
    elif self.data.id.count(".") == 0:
        try:
            self.data.id += ".%i" % self.data.annotations["sequence_version"]
        except KeyError:
            # No version recorded: deliberately keep the bare accession.
            pass

    # Assemble the sequence from the collected chunks.
    sequence = "".join(self._seq_data)

    # Warn (do not fail) when the declared length disagrees with the data.
    # An empty sequence is not a mismatch; it is handled further down.
    if (
        self._expected_size is not None
        and len(sequence) != 0
        and self._expected_size != len(sequence)
    ):
        import warnings

        from Bio import BiopythonParserWarning

        warnings.warn(
            "Expected sequence length %i, found %i (%s)."
            % (self._expected_size, len(sequence), self.data.id),
            BiopythonParserWarning,
        )

    # Alphabet and topology detection intentionally omitted (see docstring).

    # The attribute is ``_expected_size`` (single underscore), as everywhere
    # else in this function.  ``self.__expected_size`` is not name-mangled in
    # a module-level function, so it looked up a differently named attribute.
    if not sequence and self._expected_size:
        # No residues but a declared length: store a sequence of known
        # length and undefined content.
        if UnknownSeq is not None:
            self.data.seq = UnknownSeq(self._expected_size)
        else:
            self.data.seq = Seq(None, length=self._expected_size)
    else:
        self.data.seq = Seq(sequence)


# Install the replacement on the Biopython consumer class at import time.
Bio.GenBank._FeatureConsumer.record_end = record_end