comparison trips_bam_to_sqlite_builtin/bam_to_sqlite.py @ 13:df4bb52b226d draft

Uploaded
author jackcurragh
date Thu, 03 Nov 2022 12:25:58 +0000
parents
children
comparison
equal deleted inserted replaced
12:02874b1b2015 13:df4bb52b226d
1 import sys
2 import pysam
3 import operator
4 import os
5 import time
6 import sqlite3
7 from sqlitedict import SqliteDict
8
9 def tran_to_genome(tran, pos, transcriptome_info_dict):
10 #print ("tran",list(transcriptome_info_dict))
11 traninfo = transcriptome_info_dict[tran]
12 chrom = traninfo["chrom"]
13 strand = traninfo["strand"]
14 exons = sorted(traninfo["exons"])
15 #print exons
16 if strand == "+":
17 exon_start = 0
18 for tup in exons:
19 exon_start = tup[0]
20 exonlen = tup[1] - tup[0]
21 if pos > exonlen:
22 pos = (pos - exonlen)-1
23 else:
24 break
25 genomic_pos = (exon_start+pos)-1
26 elif strand == "-":
27 exon_start = 0
28 for tup in exons[::-1]:
29 exon_start = tup[1]
30 exonlen = tup[1] - tup[0]
31 if pos > exonlen:
32 pos = (pos - exonlen)-1
33 else:
34 break
35 genomic_pos = (exon_start-pos)+1
36 return (chrom, genomic_pos)
37
38
39 # Takes a dictionary with a readname as key and a list of lists as value, each sub list has consists of two elements a transcript and the position the read aligns to in the transcript
40 # This function will count the number of genes that the transcripts correspond to and if less than or equal to 3 will add the relevant value to transcript_counts_dict
41 def processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,readseq, unambig_read_length_dict):
42 readlen = len(readseq)
43 ambiguously_mapped_reads = 0
44 #get the read name
45 read = list(process_chunk)[0]
46
47 read_list = process_chunk[read] # a list of lists of all transcripts the read aligns to and the positions
48 #used to store different genomic poistions
49 genomic_positions = []
50
51 #This section is just to get the different genomic positions the read aligns to
52
53 for listname in process_chunk[read]:
54
55 tran = listname[0].replace("-","_").replace("(","").replace(")","")
56
57 pos = int(listname[1])
58 genomic_pos = tran_to_genome(tran, pos, transcriptome_info_dict)
59 #print ("genomic pos",genomic_pos)
60 if genomic_pos not in genomic_positions:
61 genomic_positions.append(genomic_pos)
62
63 #If the read maps unambiguously
64 if len(genomic_positions) == 1:
65 if readlen not in unambig_read_length_dict:
66 unambig_read_length_dict[readlen] = 0
67 unambig_read_length_dict[readlen] += 1
68 #assume this read aligns to a noncoding position, if we find that it does align to a coding region change this to True
69 coding=False
70
71 # For each transcript this read alings to
72 for listname in process_chunk[read]:
73 #get the transcript name
74 tran = listname[0].replace("-","_").replace("(","").replace(")","")
75 #If we haven't come across this transcript already then add to master_read_dict
76 if tran not in master_read_dict:
77 master_read_dict[tran] = {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}}
78 #get the raw unedited positon, and read tags
79 pos = int(listname[1])
80 read_tags = listname[2]
81 #If there is mismatches in this line, then modify the postion and readlen (if mismatches at start or end) and add mismatches to dictionary
82 nm_tag = 0
83
84 for tag in read_tags:
85 if tag[0] == "NM":
86 nm_tag = int(tag[1])
87 if nm_tag > 0:
88 md_tag = ""
89 for tag in read_tags:
90 if tag[0] == "MD":
91 md_tag = tag[1]
92 pos_modifier, readlen_modifier,mismatches = get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq)
93 # Count the mismatches (we only do this for unambiguous)
94 for mismatch in mismatches:
95 #Ignore mismatches appearing in the first position (due to non templated addition)
96 if mismatch != 0:
97 char = mismatches[mismatch]
98 mismatch_pos = pos + mismatch
99 if mismatch_pos not in master_read_dict[tran]["seq"]:
100 master_read_dict[tran]["seq"][mismatch_pos] = {}
101 if char not in master_read_dict[tran]["seq"][mismatch_pos]:
102 master_read_dict[tran]["seq"][mismatch_pos][char] = 0
103 master_read_dict[tran]["seq"][mismatch_pos][char] += 1
104 # apply the modifiers
105 #pos = pos+pos_modifier
106 #readlen = readlen - readlen_modifier
107
108
109 try:
110 cds_start = transcriptome_info_dict[tran]["cds_start"]
111 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
112
113 if pos >= cds_start and pos <= cds_stop:
114 coding=True
115 except:
116 pass
117
118
119 if readlen in master_read_dict[tran]["unambig"]:
120 if pos in master_read_dict[tran]["unambig"][readlen]:
121 master_read_dict[tran]["unambig"][readlen][pos] += 1
122 else:
123 master_read_dict[tran]["unambig"][readlen][pos] = 1
124 else:
125 master_read_dict[tran]["unambig"][readlen] = {pos:1}
126
127 if coding == True:
128 master_dict["unambiguous_coding_count"] += 1
129 elif coding == False:
130 master_dict["unambiguous_non_coding_count"] += 1
131
132 else:
133 ambiguously_mapped_reads += 1
134 for listname in process_chunk[read]:
135 tran = listname[0].replace("-","_").replace("(","").replace(")","")
136 if tran not in master_read_dict:
137 master_read_dict[tran] = {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}}
138 pos = int(listname[1])
139 read_tags = listname[2]
140 nm_tag = 0
141 for tag in read_tags:
142 if tag[0] == "NM":
143 nm_tag = int(tag[1])
144 if nm_tag > 0:
145 md_tag = ""
146 for tag in read_tags:
147 if tag[0] == "MD":
148 md_tag = tag[1]
149 pos_modifier, readlen_modifier,mismatches = get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq)
150 # apply the modifiers
151 #pos = pos+pos_modifier
152 #readlen = readlen - readlen_modifier
153 if readlen in master_read_dict[tran]["ambig"]:
154 if pos in master_read_dict[tran]["ambig"][readlen]:
155 master_read_dict[tran]["ambig"][readlen][pos] += 1
156 else:
157 master_read_dict[tran]["ambig"][readlen][pos] = 1
158 else:
159 master_read_dict[tran]["ambig"][readlen] = {pos:1}
160 return ambiguously_mapped_reads
161
162
163 def get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq):
164 nucs = ["A","T","G","C"]
165 mismatches = {}
166 total_so_far = 0
167 prev_char = ""
168 for char in md_tag:
169 if char in nucs:
170 if prev_char != "":
171 total_so_far += int(prev_char)
172 prev_char = ""
173 mismatches[total_so_far+len(mismatches)] = (readseq[total_so_far+len(mismatches)])
174 else:
175 if char != "^" and char != "N":
176 if prev_char == "":
177 prev_char = char
178 else:
179 total_so_far += int(prev_char+char)
180 prev_char = ""
181 readlen_modifier = 0
182 pos_modifier = 0
183 five_ok = False
184 three_ok = False
185 while five_ok == False:
186 for i in range(0,readlen):
187 if i in mismatches:
188 pos_modifier += 1
189 readlen_modifier += 1
190 else:
191 five_ok = True
192 break
193 five_ok = True
194
195
196 while three_ok == False:
197 for i in range(readlen-1,0,-1):
198 if i in mismatches:
199 readlen_modifier += 1
200 else:
201 three_ok = True
202 break
203 three_ok = True
204
205
206 return (pos_modifier, readlen_modifier, mismatches)
207
208
209
210 def process_bam(bam_filepath, transcriptome_info_dict_path,outputfile):
211 desc = "NULL"
212 start_time = time.time()
213 study_dict ={}
214 nuc_count_dict = {"mapped":{},"unmapped":{}}
215 dinuc_count_dict = {}
216 threeprime_nuc_count_dict = {"mapped":{},"unmapped":{}}
217 read_length_dict = {}
218 unambig_read_length_dict = {}
219 unmapped_dict = {}
220 master_dict = {"unambiguous_non_coding_count":0,"unambiguous_coding_count":0,"current_dir":os.getcwd()}
221
222 transcriptome_info_dict = {}
223 connection = sqlite3.connect(transcriptome_info_dict_path)
224 cursor = connection.cursor()
225 cursor.execute("SELECT transcript,cds_start,cds_stop,length,strand,chrom,tran_type from transcripts;")
226 result = cursor.fetchall()
227 for row in result:
228 transcriptome_info_dict[str(row[0])] = {"cds_start":row[1],"cds_stop":row[2],"length":row[3],"strand":row[4],"chrom":row[5],"exons":[],"tran_type":row[6]}
229 #print list(transcriptome_info_dict)[:10]
230
231 cursor.execute("SELECT * from exons;")
232 result = cursor.fetchall()
233 for row in result:
234 transcriptome_info_dict[str(row[0])]["exons"].append((row[1],row[2]))
235
236 #it might be the case that there are no multimappers, so set this to 0 first to avoid an error, it will be overwritten later if there is multimappers
237 multimappers = 0
238 unmapped_reads = 0
239 unambiguous_coding_count = 0
240 unambiguous_non_coding_count = 0
241 trip_periodicity_reads = 0
242
243 final_offsets = {"fiveprime":{"offsets":{}, "read_scores":{}}, "threeprime":{"offsets":{}, "read_scores":{}}}
244 master_read_dict = {}
245 prev_seq = ""
246 process_chunk = {"read_name":[["placeholder_tran","1","28"]]}
247 mapped_reads = 0
248 ambiguously_mapped_reads = 0
249 master_trip_dict = {"fiveprime":{}, "threeprime":{}}
250 master_offset_dict = {"fiveprime":{}, "threeprime":{}}
251 master_metagene_stop_dict = {"fiveprime":{}, "threeprime":{}}
252
253
254 os.system(f'samtools sort -n {bam_filepath} -o {bam_filepath}_n_sorted.bam')
255
256 pysam.set_verbosity(0)
257 infile = pysam.Samfile(f"{bam_filepath}_n_sorted.bam", "rb")
258 header = infile.header["HD"]
259
260 unsorted = False
261 if "SO" in header:
262 print("Sorting order: "+header["SO"])
263 if header["SO"] != "queryname":
264 print("Sorting order is not queryname")
265 unsorted = True
266 else:
267 unsorted = True
268 if unsorted == True:
269 print ("ERROR: Bam file appears to be unsorted or not sorted by read name. To sort by read name use the command: samtools sort -n input.bam output.bam")
270 print (header,bam_filepath)
271 sys.exit()
272 total_bam_lines = 0
273 all_ref_ids = infile.references
274
275 for read in infile.fetch(until_eof=True):
276 total_bam_lines += 1
277 if not read.is_unmapped:
278 ref = read.reference_id
279 tran = (all_ref_ids[ref]).split(".")[0]
280 mapped_reads += 1
281 if mapped_reads%1000000 == 0:
282 print ("{} reads parsed at {}".format(mapped_reads,(time.time()-start_time)))
283 pos = read.reference_start
284 readname = read.query_name
285 read_tags = read.tags
286 if readname == list(process_chunk)[0]:
287 process_chunk[readname].append([tran,pos,read_tags])
288 #if the current read is different from previous reads send 'process_chunk' to the 'processor' function, then start 'process_chunk' over using current read
289 else:
290 if list(process_chunk)[0] != "read_name":
291
292 #At this point we work out readseq, we do this for multiple reasons, firstly so we don't count the sequence from a read multiple times, just because
293 # it aligns multiple times and secondly we only call read.seq once (read.seq is computationally expensive)
294 seq = read.seq
295 readlen = len(seq)
296
297 # Note if a read maps ambiguously it will still be counted toward the read length distribution (however it will only be counted once, not each time it maps)
298 if readlen not in read_length_dict:
299 read_length_dict[readlen] = 0
300 read_length_dict[readlen] += 1
301
302 if readlen not in nuc_count_dict["mapped"]:
303 nuc_count_dict["mapped"][readlen] = {}
304 if readlen not in threeprime_nuc_count_dict["mapped"]:
305 threeprime_nuc_count_dict["mapped"][readlen] = {}
306 if readlen not in dinuc_count_dict:
307 dinuc_count_dict[readlen] = {"AA":0, "TA":0, "GA":0, "CA":0,
308 "AT":0, "TT":0, "GT":0, "CT":0,
309 "AG":0, "TG":0, "GG":0, "CG":0,
310 "AC":0, "TC":0, "GC":0, "CC":0}
311
312 for i in range(0,len(seq)):
313 if i not in nuc_count_dict["mapped"][readlen]:
314 nuc_count_dict["mapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0}
315 nuc_count_dict["mapped"][readlen][i][seq[i]] += 1
316
317 for i in range(0,len(seq)):
318 try:
319 dinuc_count_dict[readlen][seq[i:i+2]] += 1
320 except:
321 pass
322
323 for i in range(len(seq),0,-1):
324 dist = i-len(seq)
325 if dist not in threeprime_nuc_count_dict["mapped"][readlen]:
326 threeprime_nuc_count_dict["mapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0}
327 threeprime_nuc_count_dict["mapped"][readlen][dist][seq[dist]] += 1
328 ambiguously_mapped_reads += processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,prev_seq, unambig_read_length_dict)
329 process_chunk = {readname:[[tran, pos, read_tags]]}
330 prev_seq = read.seq
331 else:
332 unmapped_reads += 1
333
334 # Add this unmapped read to unmapped_dict so we can see what the most frequent unmapped read is.
335 seq = read.seq
336 readlen = len(seq)
337 if seq in unmapped_dict:
338 unmapped_dict[seq] += 1
339 else:
340 unmapped_dict[seq] = 1
341
342 # Populate the nuc_count_dict with this unmapped read
343 if readlen not in nuc_count_dict["unmapped"]:
344 nuc_count_dict["unmapped"][readlen] = {}
345 for i in range(0,len(seq)):
346 if i not in nuc_count_dict["unmapped"][readlen]:
347 nuc_count_dict["unmapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0}
348 nuc_count_dict["unmapped"][readlen][i][seq[i]] += 1
349
350 if readlen not in threeprime_nuc_count_dict["unmapped"]:
351 threeprime_nuc_count_dict["unmapped"][readlen] = {}
352
353 for i in range(len(seq),0,-1):
354 dist = i-len(seq)
355 if dist not in threeprime_nuc_count_dict["unmapped"][readlen]:
356 threeprime_nuc_count_dict["unmapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0}
357 threeprime_nuc_count_dict["unmapped"][readlen][dist][seq[dist]] += 1
358
359 #add stats about mapped/unmapped reads to file dict which will be used for the final report
360 master_dict["total_bam_lines"] = total_bam_lines
361 master_dict["mapped_reads"] = mapped_reads
362 master_dict["unmapped_reads"] = unmapped_reads
363 master_dict["ambiguously_mapped_reads"] = ambiguously_mapped_reads
364
365 if "read_name" in master_read_dict:
366 del master_read_dict["read_name"]
367 print ("BAM file processed")
368 print ("Creating metagenes, triplet periodicity plots, etc.")
369
370 for tran in master_read_dict:
371 try:
372 cds_start = int(0 if transcriptome_info_dict[tran]["cds_start"] is None else transcriptome_info_dict[tran]["cds_start"])
373 cds_stop = int(0 if transcriptome_info_dict[tran]["cds_stop"] is None else transcriptome_info_dict[tran]["cds_stop"])
374 # print(tran, type(cds_start))
375 except:
376 print("Exception: ", tran)
377 continue
378
379 tranlen = transcriptome_info_dict[tran]["length"]
380 #Use this to discard transcripts with no 5' leader or 3' trailer
381 if cds_start > 1 and cds_stop < tranlen and transcriptome_info_dict[tran]["tran_type"] == 1:
382 for primetype in ["fiveprime", "threeprime"]:
383 # Create the triplet periodicity and metainfo plots based on both the 5' and 3' ends of reads
384 for readlength in master_read_dict[tran]["unambig"]:
385 #print "readlength", readlength
386 # for each fiveprime postion for this readlength within this transcript
387 for raw_pos in master_read_dict[tran]["unambig"][readlength]:
388 #print "raw pos", raw_pos
389 trip_periodicity_reads += 1
390 if primetype == "fiveprime":
391 # get the five prime postion minus the cds start postion
392 real_pos = raw_pos-cds_start
393 rel_stop_pos = raw_pos-cds_stop
394 elif primetype == "threeprime":
395 real_pos = (raw_pos+readlength)-cds_start
396 rel_stop_pos = (raw_pos+readlength)-cds_stop
397 #get the readcount at the raw postion
398 readcount = master_read_dict[tran]["unambig"][readlength][raw_pos]
399 #print "readcount", readcount
400 frame = (real_pos%3)
401 if real_pos >= cds_start and real_pos <= cds_stop:
402 if readlength in master_trip_dict[primetype]:
403 master_trip_dict[primetype][readlength][str(frame)] += readcount
404 else:
405 master_trip_dict[primetype][readlength]= {"0":0.0,"1":0.0,"2":0.0}
406 master_trip_dict[primetype][readlength][str(frame)] += readcount
407 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict
408 if real_pos > (-600) and real_pos < (601):
409 if readlength in master_offset_dict[primetype]:
410 if real_pos in master_offset_dict[primetype][readlength]:
411 #print "real pos in offset dict"
412 master_offset_dict[primetype][readlength][real_pos] += readcount
413 else:
414 #print "real pos not in offset dict"
415 master_offset_dict[primetype][readlength][real_pos] = readcount
416 else:
417 #initiliase with zero to avoid missing neighbours below
418 #print "initialising with zeros"
419 master_offset_dict[primetype][readlength]= {}
420 for i in range(-600,601):
421 master_offset_dict[primetype][readlength][i] = 0
422 master_offset_dict[primetype][readlength][real_pos] += readcount
423
424 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict
425 if rel_stop_pos > (-600) and rel_stop_pos < (601):
426 if readlength in master_metagene_stop_dict[primetype]:
427 if rel_stop_pos in master_metagene_stop_dict[primetype][readlength]:
428 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount
429 else:
430 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] = readcount
431 else:
432 #initiliase with zero to avoid missing neighbours below
433 master_metagene_stop_dict[primetype][readlength] = {}
434 for i in range(-600,601):
435 master_metagene_stop_dict[primetype][readlength][i] = 0
436 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount
437
438 # master trip dict is now made up of readlengths with 3 frames and a count associated with each frame
439 # create a 'score' for each readlength by putting the max frame count over the second highest frame count
440 for primetype in ["fiveprime", "threeprime"]:
441 for subreadlength in master_trip_dict[primetype]:
442 maxcount = 0
443 secondmaxcount = 0
444 for frame in master_trip_dict[primetype][subreadlength]:
445 if master_trip_dict[primetype][subreadlength][frame] > maxcount:
446 maxcount = master_trip_dict[primetype][subreadlength][frame]
447 for frame in master_trip_dict[primetype][subreadlength]:
448 if master_trip_dict[primetype][subreadlength][frame] > secondmaxcount and master_trip_dict[primetype][subreadlength][frame] != maxcount:
449 secondmaxcount = master_trip_dict[primetype][subreadlength][frame]
450 # a perfect score would be 0 meaning there is only a single peak, the worst score would be 1 meaning two highest peaks are the same height
451 master_trip_dict[primetype][subreadlength]["score"] = float(secondmaxcount)/float(maxcount)
452 #This part is to determine what offsets to give each read length
453 print ("Calculating offsets")
454 for primetype in ["fiveprime", "threeprime"]:
455 for readlen in master_offset_dict[primetype]:
456 accepted_len = False
457 max_relative_pos = 0
458 max_relative_count = 0
459 for relative_pos in master_offset_dict[primetype][readlen]:
460 # This line is to ensure we don't choose an offset greater than the readlength (in cases of a large peak far up/downstream)
461 if abs(relative_pos) < 10 or abs(relative_pos) > (readlen-10):
462 continue
463 if master_offset_dict[primetype][readlen][relative_pos] > max_relative_count:
464 max_relative_pos = relative_pos
465 max_relative_count = master_offset_dict[primetype][readlen][relative_pos]
466 #print "for readlen {} the max_relative pos is {}".format(readlen, max_relative_pos)
467 if primetype == "fiveprime":
468 # -3 to get from p-site to a-site, +1 to account for 1 based co-ordinates, resulting in -2 overall
469 final_offsets[primetype]["offsets"][readlen] = abs(max_relative_pos-2)
470 elif primetype == "threeprime":
471 # +3 to get from p-site to a-site, -1 to account for 1 based co-ordinates, resulting in +2 overall
472 final_offsets[primetype]["offsets"][readlen] = (max_relative_pos*(-1))+2
473 #If there are no reads in CDS regions for a specific length, it may not be present in master_trip_dict
474 if readlen in master_trip_dict[primetype]:
475 final_offsets[primetype]["read_scores"][readlen] = master_trip_dict[primetype][readlen]["score"]
476 else:
477 final_offsets[primetype]["read_scores"][readlen] = 0.0
478
479
480 master_read_dict["unmapped_reads"] = unmapped_reads
481 master_read_dict["offsets"] = final_offsets
482 master_read_dict["trip_periodicity"] = master_trip_dict
483 master_read_dict["desc"] = "Null"
484 master_read_dict["mapped_reads"] = mapped_reads
485 master_read_dict["nuc_counts"] = nuc_count_dict
486 master_read_dict["dinuc_counts"] = dinuc_count_dict
487 master_read_dict["threeprime_nuc_counts"] = threeprime_nuc_count_dict
488 master_read_dict["metagene_counts"] = master_offset_dict
489 master_read_dict["stop_metagene_counts"] = master_metagene_stop_dict
490 master_read_dict["read_lengths"] = read_length_dict
491 master_read_dict["unambig_read_lengths"] = unambig_read_length_dict
492 master_read_dict["coding_counts"] = master_dict["unambiguous_coding_count"]
493 master_read_dict["noncoding_counts"] = master_dict["unambiguous_non_coding_count"]
494 master_read_dict["ambiguous_counts"] = master_dict["ambiguously_mapped_reads"]
495 master_read_dict["frequent_unmapped_reads"] = (sorted(unmapped_dict.items(), key=operator.itemgetter(1)))[-2000:]
496 master_read_dict["cutadapt_removed"] = 0
497 master_read_dict["rrna_removed"] = 0
498 #If no reads are removed by minus m there won't be an entry in the log file, so initiliase with 0 first and change if there is a line
499 master_read_dict["removed_minus_m"] = 0
500 master_dict["removed_minus_m"] = 0
501 # We work out the total counts for 5', cds 3' for differential translation here, would be better to do thisn in processor but need the offsets
502 master_read_dict["unambiguous_all_totals"] = {}
503 master_read_dict["unambiguous_fiveprime_totals"] = {}
504 master_read_dict["unambiguous_cds_totals"] = {}
505 master_read_dict["unambiguous_threeprime_totals"] = {}
506
507 master_read_dict["ambiguous_all_totals"] = {}
508 master_read_dict["ambiguous_fiveprime_totals"] = {}
509 master_read_dict["ambiguous_cds_totals"] = {}
510 master_read_dict["ambiguous_threeprime_totals"] = {}
511 print ("calculating transcript counts")
512 for tran in master_read_dict:
513 if tran in transcriptome_info_dict:
514 five_total = 0
515 cds_total = 0
516 three_total = 0
517
518 ambig_five_total = 0
519 ambig_cds_total = 0
520 ambig_three_total = 0
521
522 cds_start = transcriptome_info_dict[tran]["cds_start"]
523 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
524
525 for readlen in master_read_dict[tran]["unambig"]:
526 if readlen in final_offsets["fiveprime"]["offsets"]:
527 offset = final_offsets["fiveprime"]["offsets"][readlen]
528 else:
529 offset = 15
530 for pos in master_read_dict[tran]["unambig"][readlen]:
531 real_pos = pos+offset
532 if cds_start is None or cds_stop is None:
533 three_total += master_read_dict[tran]["unambig"][readlen][pos]
534 else:
535 if real_pos <cds_start:
536 five_total += master_read_dict[tran]["unambig"][readlen][pos]
537 elif real_pos >=cds_start and real_pos <= cds_stop:
538 cds_total += master_read_dict[tran]["unambig"][readlen][pos]
539 elif real_pos > cds_stop:
540 three_total += master_read_dict[tran]["unambig"][readlen][pos]
541 master_read_dict["unambiguous_all_totals"][tran] = five_total+cds_total+three_total
542 master_read_dict["unambiguous_fiveprime_totals"][tran] = five_total
543 master_read_dict["unambiguous_cds_totals"][tran] = cds_total
544 master_read_dict["unambiguous_threeprime_totals"][tran] = three_total
545
546 for readlen in master_read_dict[tran]["ambig"]:
547 if readlen in final_offsets["fiveprime"]["offsets"]:
548 offset = final_offsets["fiveprime"]["offsets"][readlen]
549 else:
550 offset = 15
551 for pos in master_read_dict[tran]["ambig"][readlen]:
552 if cds_start is None or cds_stop is None:
553 ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos]
554 else:
555 real_pos = pos+offset
556 if real_pos < cds_start:
557 ambig_five_total += master_read_dict[tran]["ambig"][readlen][pos]
558 elif real_pos >=cds_start and real_pos <= cds_stop:
559 ambig_cds_total += master_read_dict[tran]["ambig"][readlen][pos]
560 elif real_pos > cds_stop:
561 ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos]
562
563 master_read_dict["ambiguous_all_totals"][tran] = five_total+cds_total+three_total+ambig_five_total+ambig_cds_total+ambig_three_total
564 master_read_dict["ambiguous_fiveprime_totals"][tran] = five_total+ambig_five_total
565 master_read_dict["ambiguous_cds_totals"][tran] = cds_total+ambig_cds_total
566 master_read_dict["ambiguous_threeprime_totals"][tran] = three_total+ambig_three_total
567
568 print ("Writing out to sqlite file")
569 sqlite_db = SqliteDict(outputfile,autocommit=False)
570 for key in master_read_dict:
571 sqlite_db[key] = master_read_dict[key]
572 sqlite_db["description"] = desc
573 sqlite_db.commit()
574 sqlite_db.close()
575
576
577 if __name__ == "__main__":
578 if len(sys.argv) <= 2:
579 print ("Usage: python bam_to_sqlite.py <path_to_bam_file> <path_to_organism.sqlite> <file_description (optional)>")
580 sys.exit()
581 bam_filepath = sys.argv[1]
582 annotation_sqlite_filepath = sys.argv[2]
583 desc = sys.argv[3]
584 outputfile = sys.argv[4]
585 process_bam(bam_filepath,annotation_sqlite_filepath,outputfile)