comparison select_product.py @ 10:82186605ba75 draft default tip

Uploaded
author brigidar
date Fri, 09 Oct 2015 17:30:50 -0400
parents 53f1f83e80a1
children
comparison
equal deleted inserted replaced
9:0c769b205d18 10:82186605ba75
36 sys.exit(error) 36 sys.exit(error)
37 37
38 return FH 38 return FH
39 39
40 40
41 def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, product, keys_1, n):
42 """
43 Write the feature information
44 """
45
46 lines = []
47
48 for gname, ginfo in genes.items():
49 line = [str(chr_id),
50 ginfo[3],
51 str(ginfo[0]-n),
52 str(ginfo[1]+n),
53 ginfo[2],
54 str(product)]
55
56 # to key on specfic products only
57 for k in keys_1.split(","):
58 if k in str(product):
59
60 lines.append('\t'.join(line)+"\n")
61 break
62
63 return lines
64
65
66
67
68 def gbk_parse(fname,keys_1,n,outfile=""): 41 def gbk_parse(fname,keys_1,n,outfile=""):
69 """ 42 """
70 Extract genome annotation recods from genbank format 43 Extract genome annotation recods from genbank format
71 44
72 @args fname: gbk file name 45 @args fname: gbk file name
75 fhand = open_file(gbkfname) 48 fhand = open_file(gbkfname)
76 lines = [] 49 lines = []
77 50
78 for record in SeqIO.parse(fhand, "genbank"): 51 for record in SeqIO.parse(fhand, "genbank"):
79 gene_tags = dict() 52 gene_tags = dict()
80 tx_tags = collections.defaultdict(list) 53
81 exon = collections.defaultdict(list)
82 cds = collections.defaultdict(list)
83 mol_type, chr_id = None, None 54 mol_type, chr_id = None, None
84 55
85 for rec in record.features: 56 for rec in record.features:
86 57
87 58
88 product = "NA" 59 product = "NA"
89 60
90 if rec.qualifiers.has_key('product'): 61 if rec.qualifiers.has_key('product'):
91 62
92 product = rec.qualifiers['product'][0] 63 product = rec.qualifiers['product'][0]
93 64
94 65
95 if rec.type == 'source': 66 if rec.type == 'source':
96 try: 67 try:
97 mol_type = rec.qualifiers['mol_type'][0] 68 mol_type = rec.qualifiers['mol_type'][0]
98 except: 69 except:
101 try: 72 try:
102 chr_id = rec.qualifiers['chromosome'][0] 73 chr_id = rec.qualifiers['chromosome'][0]
103 except: 74 except:
104 chr_id = record.name 75 chr_id = record.name
105 continue 76 continue
77
106 78
107 strand='-' 79 start = int(rec.location.start)+1-n
108 strand='+' if rec.strand>0 else strand 80 stop = int(rec.location.end)+n
109
110 fid = None
111 try:
112 fid = rec.qualifiers['gene'][0]
113 except:
114 pass
115 81
116 transcript_id = None 82 if start < 0:
117 try: 83 start = 1
118 transcript_id = rec.qualifiers['transcript_id'][0] 84
119 except: 85 for k in keys_1.split(","):
120 pass 86
87 if str(k) in str(product):
88 lines.append('\t'.join([str(chr_id),str(start), str(stop), product])+"\n")
121 89
122 if re.search(r'gene', rec.type): 90
123 gene_tags[fid] = (rec.location._start.position+1,
124 rec.location._end.position,
125 strand,
126 rec.type
127 )
128 elif rec.type == 'exon':
129 exon[fid].append((rec.location._start.position+1,
130 rec.location._end.position))
131 elif rec.type=='CDS':
132 cds[fid].append((rec.location._start.position+1,
133 rec.location._end.position))
134 else:
135 # get all transcripts
136 if transcript_id:
137 tx_tags[fid].append((rec.location._start.position+1,
138 rec.location._end.position,
139 transcript_id,
140 rec.type))
141 # record extracted, generate feature table
142 lines += feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, product,keys_1,n)
143
144 fhand.close() 91 fhand.close()
145 92
146 return lines 93 return lines
147 94
148 if __name__=='__main__': 95 if __name__=='__main__':
149 96
150 try: 97 try:
151 parser = argparse.ArgumentParser() 98 parser = argparse.ArgumentParser()
152 parser.add_argument('-k', '--keys_1', type=str, help="list of products to key on in a comma delimited list") 99 parser.add_argument('-k', '--keys_1', type=str, help="list of products to key on in a comma delimited list")
153 parser.add_argument('-g', '--gbk_file', help="genbank file of reference") 100 parser.add_argument('-g', '--gbk_file', help="genbank file of reference")