4
|
1 import sys
|
|
2 import pysam
|
|
3 import operator
|
|
4 import os
|
|
5 import time
|
|
6 import sqlite3
|
|
7 from sqlitedict import SqliteDict
|
|
8
|
|
9 def tran_to_genome(tran, pos, transcriptome_info_dict):
|
9
|
10 #print ("tran",list(transcriptome_info_dict))
|
|
11 traninfo = transcriptome_info_dict[tran]
|
|
12 chrom = traninfo["chrom"]
|
|
13 strand = traninfo["strand"]
|
|
14 exons = sorted(traninfo["exons"])
|
|
15 #print exons
|
|
16 if strand == "+":
|
|
17 exon_start = 0
|
|
18 for tup in exons:
|
|
19 exon_start = tup[0]
|
|
20 exonlen = tup[1] - tup[0]
|
|
21 if pos > exonlen:
|
|
22 pos = (pos - exonlen)-1
|
|
23 else:
|
|
24 break
|
|
25 genomic_pos = (exon_start+pos)-1
|
|
26 elif strand == "-":
|
|
27 exon_start = 0
|
|
28 for tup in exons[::-1]:
|
|
29 exon_start = tup[1]
|
|
30 exonlen = tup[1] - tup[0]
|
|
31 if pos > exonlen:
|
|
32 pos = (pos - exonlen)-1
|
|
33 else:
|
|
34 break
|
|
35 genomic_pos = (exon_start-pos)+1
|
|
36 return (chrom, genomic_pos)
|
4
|
37
|
|
38
|
|
39 # Takes a dictionary with a readname as key and a list of lists as value, each sub list has consists of two elements a transcript and the position the read aligns to in the transcript
|
|
40 # This function will count the number of genes that the transcripts correspond to and if less than or equal to 3 will add the relevant value to transcript_counts_dict
|
9
|
41 def processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,readseq, unambig_read_length_dict):
|
|
42 readlen = len(readseq)
|
|
43 ambiguously_mapped_reads = 0
|
|
44 #get the read name
|
|
45 read = list(process_chunk)[0]
|
4
|
46
|
9
|
47 read_list = process_chunk[read] # a list of lists of all transcripts the read aligns to and the positions
|
|
48 #used to store different genomic poistions
|
|
49 genomic_positions = []
|
4
|
50
|
9
|
51 #This section is just to get the different genomic positions the read aligns to
|
4
|
52
|
9
|
53 for listname in process_chunk[read]:
|
4
|
54
|
9
|
55 tran = listname[0].replace("-","_").replace("(","").replace(")","")
|
4
|
56
|
9
|
57 pos = int(listname[1])
|
|
58 genomic_pos = tran_to_genome(tran, pos, transcriptome_info_dict)
|
|
59 #print ("genomic pos",genomic_pos)
|
|
60 if genomic_pos not in genomic_positions:
|
|
61 genomic_positions.append(genomic_pos)
|
4
|
62
|
9
|
63 #If the read maps unambiguously
|
|
64 if len(genomic_positions) == 1:
|
|
65 if readlen not in unambig_read_length_dict:
|
|
66 unambig_read_length_dict[readlen] = 0
|
|
67 unambig_read_length_dict[readlen] += 1
|
|
68 #assume this read aligns to a noncoding position, if we find that it does align to a coding region change this to True
|
|
69 coding=False
|
4
|
70
|
9
|
71 # For each transcript this read alings to
|
|
72 for listname in process_chunk[read]:
|
|
73 #get the transcript name
|
|
74 tran = listname[0].replace("-","_").replace("(","").replace(")","")
|
|
75 #If we haven't come across this transcript already then add to master_read_dict
|
|
76 if tran not in master_read_dict:
|
|
77 master_read_dict[tran] = {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}}
|
|
78 #get the raw unedited positon, and read tags
|
|
79 pos = int(listname[1])
|
|
80 read_tags = listname[2]
|
|
81 #If there is mismatches in this line, then modify the postion and readlen (if mismatches at start or end) and add mismatches to dictionary
|
|
82 nm_tag = 0
|
|
83
|
|
84 for tag in read_tags:
|
|
85 if tag[0] == "NM":
|
|
86 nm_tag = int(tag[1])
|
|
87 if nm_tag > 0:
|
|
88 md_tag = ""
|
|
89 for tag in read_tags:
|
|
90 if tag[0] == "MD":
|
|
91 md_tag = tag[1]
|
|
92 pos_modifier, readlen_modifier,mismatches = get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq)
|
|
93 # Count the mismatches (we only do this for unambiguous)
|
|
94 for mismatch in mismatches:
|
|
95 #Ignore mismatches appearing in the first position (due to non templated addition)
|
|
96 if mismatch != 0:
|
|
97 char = mismatches[mismatch]
|
|
98 mismatch_pos = pos + mismatch
|
|
99 if mismatch_pos not in master_read_dict[tran]["seq"]:
|
|
100 master_read_dict[tran]["seq"][mismatch_pos] = {}
|
|
101 if char not in master_read_dict[tran]["seq"][mismatch_pos]:
|
|
102 master_read_dict[tran]["seq"][mismatch_pos][char] = 0
|
|
103 master_read_dict[tran]["seq"][mismatch_pos][char] += 1
|
|
104 # apply the modifiers
|
|
105 #pos = pos+pos_modifier
|
|
106 #readlen = readlen - readlen_modifier
|
4
|
107
|
|
108
|
9
|
109 try:
|
|
110 cds_start = transcriptome_info_dict[tran]["cds_start"]
|
|
111 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
|
4
|
112
|
9
|
113 if pos >= cds_start and pos <= cds_stop:
|
|
114 coding=True
|
|
115 except:
|
|
116 pass
|
4
|
117
|
|
118
|
9
|
119 if readlen in master_read_dict[tran]["unambig"]:
|
|
120 if pos in master_read_dict[tran]["unambig"][readlen]:
|
|
121 master_read_dict[tran]["unambig"][readlen][pos] += 1
|
|
122 else:
|
|
123 master_read_dict[tran]["unambig"][readlen][pos] = 1
|
|
124 else:
|
|
125 master_read_dict[tran]["unambig"][readlen] = {pos:1}
|
|
126
|
|
127 if coding == True:
|
|
128 master_dict["unambiguous_coding_count"] += 1
|
|
129 elif coding == False:
|
|
130 master_dict["unambiguous_non_coding_count"] += 1
|
|
131
|
|
132 else:
|
|
133 ambiguously_mapped_reads += 1
|
|
134 for listname in process_chunk[read]:
|
|
135 tran = listname[0].replace("-","_").replace("(","").replace(")","")
|
|
136 if tran not in master_read_dict:
|
|
137 master_read_dict[tran] = {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}}
|
|
138 pos = int(listname[1])
|
|
139 read_tags = listname[2]
|
|
140 nm_tag = 0
|
|
141 for tag in read_tags:
|
|
142 if tag[0] == "NM":
|
|
143 nm_tag = int(tag[1])
|
|
144 if nm_tag > 0:
|
|
145 md_tag = ""
|
|
146 for tag in read_tags:
|
|
147 if tag[0] == "MD":
|
|
148 md_tag = tag[1]
|
|
149 pos_modifier, readlen_modifier,mismatches = get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq)
|
|
150 # apply the modifiers
|
|
151 #pos = pos+pos_modifier
|
|
152 #readlen = readlen - readlen_modifier
|
|
153 if readlen in master_read_dict[tran]["ambig"]:
|
|
154 if pos in master_read_dict[tran]["ambig"][readlen]:
|
|
155 master_read_dict[tran]["ambig"][readlen][pos] += 1
|
|
156 else:
|
|
157 master_read_dict[tran]["ambig"][readlen][pos] = 1
|
|
158 else:
|
|
159 master_read_dict[tran]["ambig"][readlen] = {pos:1}
|
|
160 return ambiguously_mapped_reads
|
|
161
|
4
|
162
|
9
|
163 def get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq):
|
|
164 nucs = ["A","T","G","C"]
|
|
165 mismatches = {}
|
|
166 total_so_far = 0
|
|
167 prev_char = ""
|
|
168 for char in md_tag:
|
|
169 if char in nucs:
|
|
170 if prev_char != "":
|
|
171 total_so_far += int(prev_char)
|
|
172 prev_char = ""
|
|
173 mismatches[total_so_far+len(mismatches)] = (readseq[total_so_far+len(mismatches)])
|
|
174 else:
|
|
175 if char != "^" and char != "N":
|
|
176 if prev_char == "":
|
|
177 prev_char = char
|
|
178 else:
|
|
179 total_so_far += int(prev_char+char)
|
|
180 prev_char = ""
|
|
181 readlen_modifier = 0
|
|
182 pos_modifier = 0
|
|
183 five_ok = False
|
|
184 three_ok = False
|
|
185 while five_ok == False:
|
|
186 for i in range(0,readlen):
|
|
187 if i in mismatches:
|
|
188 pos_modifier += 1
|
|
189 readlen_modifier += 1
|
|
190 else:
|
|
191 five_ok = True
|
|
192 break
|
|
193 five_ok = True
|
4
|
194
|
9
|
195
|
|
196 while three_ok == False:
|
|
197 for i in range(readlen-1,0,-1):
|
|
198 if i in mismatches:
|
|
199 readlen_modifier += 1
|
|
200 else:
|
|
201 three_ok = True
|
|
202 break
|
|
203 three_ok = True
|
|
204
|
|
205
|
|
206 return (pos_modifier, readlen_modifier, mismatches)
|
|
207
|
4
|
208
|
|
209
|
9
|
210 def process_bam(bam_filepath, transcriptome_info_dict_path,outputfile):
|
|
211 desc = "NULL"
|
|
212 start_time = time.time()
|
|
213 study_dict ={}
|
|
214 nuc_count_dict = {"mapped":{},"unmapped":{}}
|
|
215 dinuc_count_dict = {}
|
|
216 threeprime_nuc_count_dict = {"mapped":{},"unmapped":{}}
|
|
217 read_length_dict = {}
|
|
218 unambig_read_length_dict = {}
|
|
219 unmapped_dict = {}
|
|
220 master_dict = {"unambiguous_non_coding_count":0,"unambiguous_coding_count":0,"current_dir":os.getcwd()}
|
4
|
221
|
9
|
222 transcriptome_info_dict = {}
|
|
223 connection = sqlite3.connect(transcriptome_info_dict_path)
|
|
224 cursor = connection.cursor()
|
|
225 cursor.execute("SELECT transcript,cds_start,cds_stop,length,strand,chrom,tran_type from transcripts;")
|
|
226 result = cursor.fetchall()
|
|
227 for row in result:
|
|
228 transcriptome_info_dict[str(row[0])] = {"cds_start":row[1],"cds_stop":row[2],"length":row[3],"strand":row[4],"chrom":row[5],"exons":[],"tran_type":row[6]}
|
|
229 #print list(transcriptome_info_dict)[:10]
|
|
230
|
|
231 cursor.execute("SELECT * from exons;")
|
|
232 result = cursor.fetchall()
|
|
233 for row in result:
|
|
234 transcriptome_info_dict[str(row[0])]["exons"].append((row[1],row[2]))
|
4
|
235
|
9
|
236 #it might be the case that there are no multimappers, so set this to 0 first to avoid an error, it will be overwritten later if there is multimappers
|
|
237 multimappers = 0
|
|
238 unmapped_reads = 0
|
|
239 unambiguous_coding_count = 0
|
|
240 unambiguous_non_coding_count = 0
|
|
241 trip_periodicity_reads = 0
|
4
|
242
|
9
|
243 final_offsets = {"fiveprime":{"offsets":{}, "read_scores":{}}, "threeprime":{"offsets":{}, "read_scores":{}}}
|
|
244 master_read_dict = {}
|
|
245 prev_seq = ""
|
|
246 process_chunk = {"read_name":[["placeholder_tran","1","28"]]}
|
|
247 mapped_reads = 0
|
|
248 ambiguously_mapped_reads = 0
|
|
249 master_trip_dict = {"fiveprime":{}, "threeprime":{}}
|
|
250 master_offset_dict = {"fiveprime":{}, "threeprime":{}}
|
|
251 master_metagene_stop_dict = {"fiveprime":{}, "threeprime":{}}
|
|
252
|
|
253 pysam.set_verbosity(0)
|
|
254 infile = pysam.Samfile(bam_filepath, "rb")
|
4
|
255
|
9
|
256 header = infile.header["HD"]
|
|
257 unsorted = False
|
|
258 if "SO" in header:
|
|
259 if header["SO"] != "queryname":
|
|
260 unsorted = True
|
|
261 else:
|
|
262 unsorted = True
|
|
263 if unsorted == True:
|
|
264 print ("ERROR: Bam file appears to be unsorted or not sorted by read name. To sort by read name use the command: samtools sort -n input.bam output.bam")
|
|
265 print (header,bam_filepath)
|
|
266 sys.exit()
|
|
267 total_bam_lines = 0
|
|
268 all_ref_ids = infile.references
|
4
|
269
|
9
|
270 for read in infile.fetch(until_eof=True):
|
|
271 total_bam_lines += 1
|
|
272 if not read.is_unmapped:
|
|
273 ref = read.reference_id
|
|
274 tran = (all_ref_ids[ref]).split(".")[0]
|
|
275 mapped_reads += 1
|
|
276 if mapped_reads%1000000 == 0:
|
|
277 print ("{} reads parsed at {}".format(mapped_reads,(time.time()-start_time)))
|
|
278 pos = read.reference_start
|
|
279 readname = read.query_name
|
|
280 read_tags = read.tags
|
|
281 if readname == list(process_chunk)[0]:
|
|
282 process_chunk[readname].append([tran,pos,read_tags])
|
|
283 #if the current read is different from previous reads send 'process_chunk' to the 'processor' function, then start 'process_chunk' over using current read
|
|
284 else:
|
|
285 if list(process_chunk)[0] != "read_name":
|
4
|
286
|
9
|
287 #At this point we work out readseq, we do this for multiple reasons, firstly so we don't count the sequence from a read multiple times, just because
|
|
288 # it aligns multiple times and secondly we only call read.seq once (read.seq is computationally expensive)
|
|
289 seq = read.seq
|
|
290 readlen = len(seq)
|
4
|
291
|
9
|
292 # Note if a read maps ambiguously it will still be counted toward the read length distribution (however it will only be counted once, not each time it maps)
|
|
293 if readlen not in read_length_dict:
|
|
294 read_length_dict[readlen] = 0
|
|
295 read_length_dict[readlen] += 1
|
|
296
|
|
297 if readlen not in nuc_count_dict["mapped"]:
|
|
298 nuc_count_dict["mapped"][readlen] = {}
|
|
299 if readlen not in threeprime_nuc_count_dict["mapped"]:
|
|
300 threeprime_nuc_count_dict["mapped"][readlen] = {}
|
|
301 if readlen not in dinuc_count_dict:
|
|
302 dinuc_count_dict[readlen] = {"AA":0, "TA":0, "GA":0, "CA":0,
|
|
303 "AT":0, "TT":0, "GT":0, "CT":0,
|
|
304 "AG":0, "TG":0, "GG":0, "CG":0,
|
|
305 "AC":0, "TC":0, "GC":0, "CC":0}
|
4
|
306
|
9
|
307 for i in range(0,len(seq)):
|
|
308 if i not in nuc_count_dict["mapped"][readlen]:
|
|
309 nuc_count_dict["mapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0}
|
|
310 nuc_count_dict["mapped"][readlen][i][seq[i]] += 1
|
|
311
|
|
312 for i in range(0,len(seq)):
|
|
313 try:
|
|
314 dinuc_count_dict[readlen][seq[i:i+2]] += 1
|
|
315 except:
|
|
316 pass
|
4
|
317
|
9
|
318 for i in range(len(seq),0,-1):
|
|
319 dist = i-len(seq)
|
|
320 if dist not in threeprime_nuc_count_dict["mapped"][readlen]:
|
|
321 threeprime_nuc_count_dict["mapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0}
|
|
322 threeprime_nuc_count_dict["mapped"][readlen][dist][seq[dist]] += 1
|
|
323 ambiguously_mapped_reads += processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,prev_seq, unambig_read_length_dict)
|
|
324 process_chunk = {readname:[[tran, pos, read_tags]]}
|
|
325 prev_seq = read.seq
|
|
326 else:
|
|
327 unmapped_reads += 1
|
4
|
328
|
9
|
329 # Add this unmapped read to unmapped_dict so we can see what the most frequent unmapped read is.
|
|
330 seq = read.seq
|
|
331 readlen = len(seq)
|
|
332 if seq in unmapped_dict:
|
|
333 unmapped_dict[seq] += 1
|
|
334 else:
|
|
335 unmapped_dict[seq] = 1
|
4
|
336
|
9
|
337 # Populate the nuc_count_dict with this unmapped read
|
|
338 if readlen not in nuc_count_dict["unmapped"]:
|
|
339 nuc_count_dict["unmapped"][readlen] = {}
|
|
340 for i in range(0,len(seq)):
|
|
341 if i not in nuc_count_dict["unmapped"][readlen]:
|
|
342 nuc_count_dict["unmapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0}
|
|
343 nuc_count_dict["unmapped"][readlen][i][seq[i]] += 1
|
|
344
|
|
345 if readlen not in threeprime_nuc_count_dict["unmapped"]:
|
|
346 threeprime_nuc_count_dict["unmapped"][readlen] = {}
|
4
|
347
|
9
|
348 for i in range(len(seq),0,-1):
|
|
349 dist = i-len(seq)
|
|
350 if dist not in threeprime_nuc_count_dict["unmapped"][readlen]:
|
|
351 threeprime_nuc_count_dict["unmapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0}
|
|
352 threeprime_nuc_count_dict["unmapped"][readlen][dist][seq[dist]] += 1
|
4
|
353
|
9
|
354 #add stats about mapped/unmapped reads to file dict which will be used for the final report
|
|
355 master_dict["total_bam_lines"] = total_bam_lines
|
|
356 master_dict["mapped_reads"] = mapped_reads
|
|
357 master_dict["unmapped_reads"] = unmapped_reads
|
|
358 master_dict["ambiguously_mapped_reads"] = ambiguously_mapped_reads
|
|
359
|
|
360 if "read_name" in master_read_dict:
|
|
361 del master_read_dict["read_name"]
|
|
362 print ("BAM file processed")
|
|
363 print ("Creating metagenes, triplet periodicity plots, etc.")
|
4
|
364
|
9
|
365 for tran in master_read_dict:
|
|
366 try:
|
|
367 cds_start = int(0 if transcriptome_info_dict[tran]["cds_start"] is None else transcriptome_info_dict[tran]["cds_start"])
|
|
368 cds_stop = int(0 if transcriptome_info_dict[tran]["cds_stop"] is None else transcriptome_info_dict[tran]["cds_stop"])
|
|
369 # print(tran, type(cds_start))
|
|
370 except:
|
|
371 print("Exception: ", tran)
|
|
372 continue
|
4
|
373
|
9
|
374 tranlen = transcriptome_info_dict[tran]["length"]
|
|
375 #Use this to discard transcripts with no 5' leader or 3' trailer
|
|
376 if cds_start > 1 and cds_stop < tranlen and transcriptome_info_dict[tran]["tran_type"] == 1:
|
|
377 for primetype in ["fiveprime", "threeprime"]:
|
|
378 # Create the triplet periodicity and metainfo plots based on both the 5' and 3' ends of reads
|
|
379 for readlength in master_read_dict[tran]["unambig"]:
|
|
380 #print "readlength", readlength
|
|
381 # for each fiveprime postion for this readlength within this transcript
|
|
382 for raw_pos in master_read_dict[tran]["unambig"][readlength]:
|
|
383 #print "raw pos", raw_pos
|
|
384 trip_periodicity_reads += 1
|
|
385 if primetype == "fiveprime":
|
|
386 # get the five prime postion minus the cds start postion
|
|
387 real_pos = raw_pos-cds_start
|
|
388 rel_stop_pos = raw_pos-cds_stop
|
|
389 elif primetype == "threeprime":
|
|
390 real_pos = (raw_pos+readlength)-cds_start
|
|
391 rel_stop_pos = (raw_pos+readlength)-cds_stop
|
|
392 #get the readcount at the raw postion
|
|
393 readcount = master_read_dict[tran]["unambig"][readlength][raw_pos]
|
|
394 #print "readcount", readcount
|
|
395 frame = (real_pos%3)
|
|
396 if real_pos >= cds_start and real_pos <= cds_stop:
|
|
397 if readlength in master_trip_dict[primetype]:
|
|
398 master_trip_dict[primetype][readlength][str(frame)] += readcount
|
|
399 else:
|
|
400 master_trip_dict[primetype][readlength]= {"0":0.0,"1":0.0,"2":0.0}
|
|
401 master_trip_dict[primetype][readlength][str(frame)] += readcount
|
|
402 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict
|
|
403 if real_pos > (-600) and real_pos < (601):
|
|
404 if readlength in master_offset_dict[primetype]:
|
|
405 if real_pos in master_offset_dict[primetype][readlength]:
|
|
406 #print "real pos in offset dict"
|
|
407 master_offset_dict[primetype][readlength][real_pos] += readcount
|
|
408 else:
|
|
409 #print "real pos not in offset dict"
|
|
410 master_offset_dict[primetype][readlength][real_pos] = readcount
|
|
411 else:
|
|
412 #initiliase with zero to avoid missing neighbours below
|
|
413 #print "initialising with zeros"
|
|
414 master_offset_dict[primetype][readlength]= {}
|
|
415 for i in range(-600,601):
|
|
416 master_offset_dict[primetype][readlength][i] = 0
|
|
417 master_offset_dict[primetype][readlength][real_pos] += readcount
|
4
|
418
|
9
|
419 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict
|
|
420 if rel_stop_pos > (-600) and rel_stop_pos < (601):
|
|
421 if readlength in master_metagene_stop_dict[primetype]:
|
|
422 if rel_stop_pos in master_metagene_stop_dict[primetype][readlength]:
|
|
423 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount
|
|
424 else:
|
|
425 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] = readcount
|
|
426 else:
|
|
427 #initiliase with zero to avoid missing neighbours below
|
|
428 master_metagene_stop_dict[primetype][readlength] = {}
|
|
429 for i in range(-600,601):
|
|
430 master_metagene_stop_dict[primetype][readlength][i] = 0
|
|
431 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount
|
|
432
|
|
433 # master trip dict is now made up of readlengths with 3 frames and a count associated with each frame
|
|
434 # create a 'score' for each readlength by putting the max frame count over the second highest frame count
|
|
435 for primetype in ["fiveprime", "threeprime"]:
|
|
436 for subreadlength in master_trip_dict[primetype]:
|
|
437 maxcount = 0
|
|
438 secondmaxcount = 0
|
|
439 for frame in master_trip_dict[primetype][subreadlength]:
|
|
440 if master_trip_dict[primetype][subreadlength][frame] > maxcount:
|
|
441 maxcount = master_trip_dict[primetype][subreadlength][frame]
|
|
442 for frame in master_trip_dict[primetype][subreadlength]:
|
|
443 if master_trip_dict[primetype][subreadlength][frame] > secondmaxcount and master_trip_dict[primetype][subreadlength][frame] != maxcount:
|
|
444 secondmaxcount = master_trip_dict[primetype][subreadlength][frame]
|
|
445 # a perfect score would be 0 meaning there is only a single peak, the worst score would be 1 meaning two highest peaks are the same height
|
|
446 master_trip_dict[primetype][subreadlength]["score"] = float(secondmaxcount)/float(maxcount)
|
|
447 #This part is to determine what offsets to give each read length
|
|
448 print ("Calculating offsets")
|
|
449 for primetype in ["fiveprime", "threeprime"]:
|
|
450 for readlen in master_offset_dict[primetype]:
|
|
451 accepted_len = False
|
|
452 max_relative_pos = 0
|
|
453 max_relative_count = 0
|
|
454 for relative_pos in master_offset_dict[primetype][readlen]:
|
|
455 # This line is to ensure we don't choose an offset greater than the readlength (in cases of a large peak far up/downstream)
|
|
456 if abs(relative_pos) < 10 or abs(relative_pos) > (readlen-10):
|
|
457 continue
|
|
458 if master_offset_dict[primetype][readlen][relative_pos] > max_relative_count:
|
|
459 max_relative_pos = relative_pos
|
|
460 max_relative_count = master_offset_dict[primetype][readlen][relative_pos]
|
|
461 #print "for readlen {} the max_relative pos is {}".format(readlen, max_relative_pos)
|
|
462 if primetype == "fiveprime":
|
|
463 # -3 to get from p-site to a-site, +1 to account for 1 based co-ordinates, resulting in -2 overall
|
|
464 final_offsets[primetype]["offsets"][readlen] = abs(max_relative_pos-2)
|
|
465 elif primetype == "threeprime":
|
|
466 # +3 to get from p-site to a-site, -1 to account for 1 based co-ordinates, resulting in +2 overall
|
|
467 final_offsets[primetype]["offsets"][readlen] = (max_relative_pos*(-1))+2
|
|
468 #If there are no reads in CDS regions for a specific length, it may not be present in master_trip_dict
|
|
469 if readlen in master_trip_dict[primetype]:
|
|
470 final_offsets[primetype]["read_scores"][readlen] = master_trip_dict[primetype][readlen]["score"]
|
|
471 else:
|
|
472 final_offsets[primetype]["read_scores"][readlen] = 0.0
|
|
473
|
4
|
474
|
9
|
475 master_read_dict["unmapped_reads"] = unmapped_reads
|
|
476 master_read_dict["offsets"] = final_offsets
|
|
477 master_read_dict["trip_periodicity"] = master_trip_dict
|
|
478 master_read_dict["desc"] = "Null"
|
|
479 master_read_dict["mapped_reads"] = mapped_reads
|
|
480 master_read_dict["nuc_counts"] = nuc_count_dict
|
|
481 master_read_dict["dinuc_counts"] = dinuc_count_dict
|
|
482 master_read_dict["threeprime_nuc_counts"] = threeprime_nuc_count_dict
|
|
483 master_read_dict["metagene_counts"] = master_offset_dict
|
|
484 master_read_dict["stop_metagene_counts"] = master_metagene_stop_dict
|
|
485 master_read_dict["read_lengths"] = read_length_dict
|
|
486 master_read_dict["unambig_read_lengths"] = unambig_read_length_dict
|
|
487 master_read_dict["coding_counts"] = master_dict["unambiguous_coding_count"]
|
|
488 master_read_dict["noncoding_counts"] = master_dict["unambiguous_non_coding_count"]
|
|
489 master_read_dict["ambiguous_counts"] = master_dict["ambiguously_mapped_reads"]
|
|
490 master_read_dict["frequent_unmapped_reads"] = (sorted(unmapped_dict.items(), key=operator.itemgetter(1)))[-2000:]
|
|
491 master_read_dict["cutadapt_removed"] = 0
|
|
492 master_read_dict["rrna_removed"] = 0
|
|
493 #If no reads are removed by minus m there won't be an entry in the log file, so initiliase with 0 first and change if there is a line
|
|
494 master_read_dict["removed_minus_m"] = 0
|
|
495 master_dict["removed_minus_m"] = 0
|
|
496 # We work out the total counts for 5', cds 3' for differential translation here, would be better to do thisn in processor but need the offsets
|
|
497 master_read_dict["unambiguous_all_totals"] = {}
|
|
498 master_read_dict["unambiguous_fiveprime_totals"] = {}
|
|
499 master_read_dict["unambiguous_cds_totals"] = {}
|
|
500 master_read_dict["unambiguous_threeprime_totals"] = {}
|
4
|
501
|
9
|
502 master_read_dict["ambiguous_all_totals"] = {}
|
|
503 master_read_dict["ambiguous_fiveprime_totals"] = {}
|
|
504 master_read_dict["ambiguous_cds_totals"] = {}
|
|
505 master_read_dict["ambiguous_threeprime_totals"] = {}
|
|
506 print ("calculating transcript counts")
|
|
507 for tran in master_read_dict:
|
|
508 if tran in transcriptome_info_dict:
|
|
509 five_total = 0
|
|
510 cds_total = 0
|
|
511 three_total = 0
|
|
512
|
|
513 ambig_five_total = 0
|
|
514 ambig_cds_total = 0
|
|
515 ambig_three_total = 0
|
|
516
|
|
517 cds_start = transcriptome_info_dict[tran]["cds_start"]
|
|
518 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
|
4
|
519
|
9
|
520 for readlen in master_read_dict[tran]["unambig"]:
|
|
521 if readlen in final_offsets["fiveprime"]["offsets"]:
|
|
522 offset = final_offsets["fiveprime"]["offsets"][readlen]
|
|
523 else:
|
|
524 offset = 15
|
|
525 for pos in master_read_dict[tran]["unambig"][readlen]:
|
|
526 real_pos = pos+offset
|
|
527 if cds_start is None or cds_stop is None:
|
|
528 three_total += master_read_dict[tran]["unambig"][readlen][pos]
|
|
529 else:
|
|
530 if real_pos <cds_start:
|
|
531 five_total += master_read_dict[tran]["unambig"][readlen][pos]
|
|
532 elif real_pos >=cds_start and real_pos <= cds_stop:
|
|
533 cds_total += master_read_dict[tran]["unambig"][readlen][pos]
|
|
534 elif real_pos > cds_stop:
|
|
535 three_total += master_read_dict[tran]["unambig"][readlen][pos]
|
|
536 master_read_dict["unambiguous_all_totals"][tran] = five_total+cds_total+three_total
|
|
537 master_read_dict["unambiguous_fiveprime_totals"][tran] = five_total
|
|
538 master_read_dict["unambiguous_cds_totals"][tran] = cds_total
|
|
539 master_read_dict["unambiguous_threeprime_totals"][tran] = three_total
|
4
|
540
|
9
|
541 for readlen in master_read_dict[tran]["ambig"]:
|
|
542 if readlen in final_offsets["fiveprime"]["offsets"]:
|
|
543 offset = final_offsets["fiveprime"]["offsets"][readlen]
|
|
544 else:
|
|
545 offset = 15
|
|
546 for pos in master_read_dict[tran]["ambig"][readlen]:
|
|
547 if cds_start is None or cds_stop is None:
|
|
548 ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos]
|
|
549 else:
|
|
550 real_pos = pos+offset
|
|
551 if real_pos < cds_start:
|
|
552 ambig_five_total += master_read_dict[tran]["ambig"][readlen][pos]
|
|
553 elif real_pos >=cds_start and real_pos <= cds_stop:
|
|
554 ambig_cds_total += master_read_dict[tran]["ambig"][readlen][pos]
|
|
555 elif real_pos > cds_stop:
|
|
556 ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos]
|
|
557
|
|
558 master_read_dict["ambiguous_all_totals"][tran] = five_total+cds_total+three_total+ambig_five_total+ambig_cds_total+ambig_three_total
|
|
559 master_read_dict["ambiguous_fiveprime_totals"][tran] = five_total+ambig_five_total
|
|
560 master_read_dict["ambiguous_cds_totals"][tran] = cds_total+ambig_cds_total
|
|
561 master_read_dict["ambiguous_threeprime_totals"][tran] = three_total+ambig_three_total
|
|
562
|
|
563 print ("Writing out to sqlite file")
|
|
564 sqlite_db = SqliteDict(outputfile,autocommit=False)
|
|
565 for key in master_read_dict:
|
|
566 sqlite_db[key] = master_read_dict[key]
|
|
567 sqlite_db["description"] = desc
|
|
568 sqlite_db.commit()
|
|
569 sqlite_db.close()
|
4
|
570
|
|
571
|
|
572 if __name__ == "__main__":
|
9
|
573 if len(sys.argv) <= 2:
|
|
574 print ("Usage: python bam_to_sqlite.py <path_to_bam_file> <path_to_organism.sqlite> <file_description (optional)>")
|
|
575 sys.exit()
|
|
576 bam_filepath = sys.argv[1]
|
|
577 annotation_sqlite_filepath = sys.argv[2]
|
|
578 #try:
|
|
579 # desc = sys.argv[3]
|
|
580 #except:
|
|
581 # desc = bam_filepath.split("/")[-1]
|
|
582 outputfile = bam_filepath+"v2.sqlite"
|
|
583 process_bam(bam_filepath,annotation_sqlite_filepath,outputfile)
|