comparison trips_bam_to_sqlite/bam_to_sqlite.py @ 4:4ee95ba271a5 draft

Uploaded
author triasteran
date Mon, 25 Apr 2022 12:40:36 +0000
parents
children 1be4bbe849e3
comparison
equal deleted inserted replaced
3:932cdd84d51a 4:4ee95ba271a5
1 import sys
2 import pysam
3 import operator
4 import os
5 import time
6 import sqlite3
7 from sqlitedict import SqliteDict
8
9
10 def tran_to_genome(tran, pos, transcriptome_info_dict):
11 # print ("tran",list(transcriptome_info_dict))
12 traninfo = transcriptome_info_dict[tran]
13 chrom = traninfo["chrom"]
14 strand = traninfo["strand"]
15 exons = sorted(traninfo["exons"])
16 # print exons
17 if strand == "+":
18 exon_start = 0
19 for tup in exons:
20 exon_start = tup[0]
21 exonlen = tup[1] - tup[0]
22 if pos > exonlen:
23 pos = (pos - exonlen) - 1
24 else:
25 break
26 genomic_pos = (exon_start + pos) - 1
27 elif strand == "-":
28 exon_start = 0
29 for tup in exons[::-1]:
30 exon_start = tup[1]
31 exonlen = tup[1] - tup[0]
32 if pos > exonlen:
33 pos = (pos - exonlen) - 1
34 else:
35 break
36 genomic_pos = (exon_start - pos) + 1
37 return (chrom, genomic_pos)
38
39
40 # Takes a dictionary with a readname as key and a list of lists as value, each sub list has consists of two elements a transcript and the position the read aligns to in the transcript
41 # This function will count the number of genes that the transcripts correspond to and if less than or equal to 3 will add the relevant value to transcript_counts_dict
42 def processor(
43 process_chunk,
44 master_read_dict,
45 transcriptome_info_dict,
46 master_dict,
47 readseq,
48 unambig_read_length_dict,
49 ):
50 readlen = len(readseq)
51 ambiguously_mapped_reads = 0
52 # get the read name
53 read = list(process_chunk)[0]
54
55 read_list = process_chunk[
56 read
57 ] # a list of lists of all transcripts the read aligns to and the positions
58 # used to store different genomic poistions
59 genomic_positions = []
60
61 # This section is just to get the different genomic positions the read aligns to
62
63 for listname in process_chunk[read]:
64
65 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "")
66
67 pos = int(listname[1])
68 genomic_pos = tran_to_genome(tran, pos, transcriptome_info_dict)
69 # print ("genomic pos",genomic_pos)
70 if genomic_pos not in genomic_positions:
71 genomic_positions.append(genomic_pos)
72
73 # If the read maps unambiguously
74 if len(genomic_positions) == 1:
75 if readlen not in unambig_read_length_dict:
76 unambig_read_length_dict[readlen] = 0
77 unambig_read_length_dict[readlen] += 1
78 # assume this read aligns to a noncoding position, if we find that it does align to a coding region change this to True
79 coding = False
80
81 # For each transcript this read alings to
82 for listname in process_chunk[read]:
83 # get the transcript name
84 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "")
85 # If we haven't come across this transcript already then add to master_read_dict
86 if tran not in master_read_dict:
87 master_read_dict[tran] = {
88 "ambig": {},
89 "unambig": {},
90 "mismatches": {},
91 "seq": {},
92 }
93 # get the raw unedited positon, and read tags
94 pos = int(listname[1])
95 read_tags = listname[2]
96 # If there is mismatches in this line, then modify the postion and readlen (if mismatches at start or end) and add mismatches to dictionary
97 nm_tag = 0
98
99 for tag in read_tags:
100 if tag[0] == "NM":
101 nm_tag = int(tag[1])
102 if nm_tag > 0:
103 md_tag = ""
104 for tag in read_tags:
105 if tag[0] == "MD":
106 md_tag = tag[1]
107 pos_modifier, readlen_modifier, mismatches = get_mismatch_pos(
108 md_tag, pos, readlen, master_read_dict, tran, readseq
109 )
110 # Count the mismatches (we only do this for unambiguous)
111 for mismatch in mismatches:
112 # Ignore mismatches appearing in the first position (due to non templated addition)
113 if mismatch != 0:
114 char = mismatches[mismatch]
115 mismatch_pos = pos + mismatch
116 if mismatch_pos not in master_read_dict[tran]["seq"]:
117 master_read_dict[tran]["seq"][mismatch_pos] = {}
118 if char not in master_read_dict[tran]["seq"][mismatch_pos]:
119 master_read_dict[tran]["seq"][mismatch_pos][char] = 0
120 master_read_dict[tran]["seq"][mismatch_pos][char] += 1
121 # apply the modifiers
122 # pos = pos+pos_modifier
123 # readlen = readlen - readlen_modifier
124
125 try:
126 cds_start = transcriptome_info_dict[tran]["cds_start"]
127 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
128
129 if pos >= cds_start and pos <= cds_stop:
130 coding = True
131 except:
132 pass
133
134 if readlen in master_read_dict[tran]["unambig"]:
135 if pos in master_read_dict[tran]["unambig"][readlen]:
136 master_read_dict[tran]["unambig"][readlen][pos] += 1
137 else:
138 master_read_dict[tran]["unambig"][readlen][pos] = 1
139 else:
140 master_read_dict[tran]["unambig"][readlen] = {pos: 1}
141
142 if coding == True:
143 master_dict["unambiguous_coding_count"] += 1
144 elif coding == False:
145 master_dict["unambiguous_non_coding_count"] += 1
146
147 else:
148 ambiguously_mapped_reads += 1
149 for listname in process_chunk[read]:
150 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "")
151 if tran not in master_read_dict:
152 master_read_dict[tran] = {
153 "ambig": {},
154 "unambig": {},
155 "mismatches": {},
156 "seq": {},
157 }
158 pos = int(listname[1])
159 read_tags = listname[2]
160 nm_tag = 0
161 for tag in read_tags:
162 if tag[0] == "NM":
163 nm_tag = int(tag[1])
164 if nm_tag > 0:
165 md_tag = ""
166 for tag in read_tags:
167 if tag[0] == "MD":
168 md_tag = tag[1]
169 pos_modifier, readlen_modifier, mismatches = get_mismatch_pos(
170 md_tag, pos, readlen, master_read_dict, tran, readseq
171 )
172 # apply the modifiers
173 # pos = pos+pos_modifier
174 # readlen = readlen - readlen_modifier
175 if readlen in master_read_dict[tran]["ambig"]:
176 if pos in master_read_dict[tran]["ambig"][readlen]:
177 master_read_dict[tran]["ambig"][readlen][pos] += 1
178 else:
179 master_read_dict[tran]["ambig"][readlen][pos] = 1
180 else:
181 master_read_dict[tran]["ambig"][readlen] = {pos: 1}
182 return ambiguously_mapped_reads
183
184
185 def get_mismatch_pos(md_tag, pos, readlen, master_read_dict, tran, readseq):
186 nucs = ["A", "T", "G", "C"]
187 mismatches = {}
188 total_so_far = 0
189 prev_char = ""
190 for char in md_tag:
191 if char in nucs:
192 if prev_char != "":
193 total_so_far += int(prev_char)
194 prev_char = ""
195 mismatches[total_so_far + len(mismatches)] = readseq[
196 total_so_far + len(mismatches)
197 ]
198 else:
199 if char != "^" and char != "N":
200 if prev_char == "":
201 prev_char = char
202 else:
203 total_so_far += int(prev_char + char)
204 prev_char = ""
205 readlen_modifier = 0
206 pos_modifier = 0
207 five_ok = False
208 three_ok = False
209 while five_ok == False:
210 for i in range(0, readlen):
211 if i in mismatches:
212 pos_modifier += 1
213 readlen_modifier += 1
214 else:
215 five_ok = True
216 break
217 five_ok = True
218
219 while three_ok == False:
220 for i in range(readlen - 1, 0, -1):
221 if i in mismatches:
222 readlen_modifier += 1
223 else:
224 three_ok = True
225 break
226 three_ok = True
227
228 return (pos_modifier, readlen_modifier, mismatches)
229
230
231 def process_bam(bam_filepath, transcriptome_info_dict_path, outputfile, desc):
232 desc = desc
233 start_time = time.time()
234 study_dict = {}
235 nuc_count_dict = {"mapped": {}, "unmapped": {}}
236 dinuc_count_dict = {}
237 threeprime_nuc_count_dict = {"mapped": {}, "unmapped": {}}
238 read_length_dict = {}
239 unambig_read_length_dict = {}
240 unmapped_dict = {}
241 master_dict = {
242 "unambiguous_non_coding_count": 0,
243 "unambiguous_coding_count": 0,
244 "current_dir": os.getcwd(),
245 }
246
247 transcriptome_info_dict = {}
248 connection = sqlite3.connect(transcriptome_info_dict_path)
249 cursor = connection.cursor()
250 cursor.execute(
251 "SELECT transcript,cds_start,cds_stop,length,strand,chrom,tran_type from transcripts;"
252 )
253 result = cursor.fetchall()
254 for row in result:
255 transcriptome_info_dict[str(row[0])] = {
256 "cds_start": row[1],
257 "cds_stop": row[2],
258 "length": row[3],
259 "strand": row[4],
260 "chrom": row[5],
261 "exons": [],
262 "tran_type": row[6],
263 }
264 # print list(transcriptome_info_dict)[:10]
265
266 cursor.execute("SELECT * from exons;")
267 result = cursor.fetchall()
268 for row in result:
269 transcriptome_info_dict[str(row[0])]["exons"].append((row[1], row[2]))
270
271 # it might be the case that there are no multimappers, so set this to 0 first to avoid an error, it will be overwritten later if there is multimappers
272 multimappers = 0
273 unmapped_reads = 0
274 unambiguous_coding_count = 0
275 unambiguous_non_coding_count = 0
276 trip_periodicity_reads = 0
277
278 final_offsets = {
279 "fiveprime": {"offsets": {}, "read_scores": {}},
280 "threeprime": {"offsets": {}, "read_scores": {}},
281 }
282 master_read_dict = {}
283 prev_seq = ""
284 process_chunk = {"read_name": [["placeholder_tran", "1", "28"]]}
285 mapped_reads = 0
286 ambiguously_mapped_reads = 0
287 master_trip_dict = {"fiveprime": {}, "threeprime": {}}
288 master_offset_dict = {"fiveprime": {}, "threeprime": {}}
289 master_metagene_stop_dict = {"fiveprime": {}, "threeprime": {}}
290
291 os.system(f'samtools sort -n {bam_filepath} -o {bam_filepath}_n_sorted.bam')
292 pysam.set_verbosity(0)
293 infile = pysam.Samfile(f"{bam_filepath}_n_sorted.bam", "rb")
294 header = infile.header["HD"]
295 unsorted = False
296 if "SO" in header:
297 if header["SO"] != "queryname":
298 unsorted = True
299 else:
300 unsorted = True
301 if unsorted == True:
302 print(
303 "ERROR: Bam file appears to be unsorted or not sorted by read name. To sort by read name use the command: samtools sort -n input.bam output.bam"
304 )
305 print(header, bam_filepath)
306 sys.exit()
307 total_bam_lines = 0
308 all_ref_ids = infile.references
309
310 for read in infile.fetch(until_eof=True):
311 total_bam_lines += 1
312 if not read.is_unmapped:
313 ref = read.reference_id
314 tran = (all_ref_ids[ref]).split(".")[0]
315 mapped_reads += 1
316 if mapped_reads % 1000000 == 0:
317 print(
318 "{} reads parsed at {}".format(
319 mapped_reads, (time.time() - start_time)
320 )
321 )
322 pos = read.reference_start
323 readname = read.query_name
324 read_tags = read.tags
325 if readname == list(process_chunk)[0]:
326 process_chunk[readname].append([tran, pos, read_tags])
327 # if the current read is different from previous reads send 'process_chunk' to the 'processor' function, then start 'process_chunk' over using current read
328 else:
329 if list(process_chunk)[0] != "read_name":
330
331 # At this point we work out readseq, we do this for multiple reasons, firstly so we don't count the sequence from a read multiple times, just because
332 # it aligns multiple times and secondly we only call read.seq once (read.seq is computationally expensive)
333 seq = read.seq
334 readlen = len(seq)
335
336 # Note if a read maps ambiguously it will still be counted toward the read length distribution (however it will only be counted once, not each time it maps)
337 if readlen not in read_length_dict:
338 read_length_dict[readlen] = 0
339 read_length_dict[readlen] += 1
340
341 if readlen not in nuc_count_dict["mapped"]:
342 nuc_count_dict["mapped"][readlen] = {}
343 if readlen not in threeprime_nuc_count_dict["mapped"]:
344 threeprime_nuc_count_dict["mapped"][readlen] = {}
345 if readlen not in dinuc_count_dict:
346 dinuc_count_dict[readlen] = {
347 "AA": 0,
348 "TA": 0,
349 "GA": 0,
350 "CA": 0,
351 "AT": 0,
352 "TT": 0,
353 "GT": 0,
354 "CT": 0,
355 "AG": 0,
356 "TG": 0,
357 "GG": 0,
358 "CG": 0,
359 "AC": 0,
360 "TC": 0,
361 "GC": 0,
362 "CC": 0,
363 }
364
365 for i in range(0, len(seq)):
366 if i not in nuc_count_dict["mapped"][readlen]:
367 nuc_count_dict["mapped"][readlen][i] = {
368 "A": 0,
369 "T": 0,
370 "G": 0,
371 "C": 0,
372 "N": 0,
373 }
374 nuc_count_dict["mapped"][readlen][i][seq[i]] += 1
375
376 for i in range(0, len(seq)):
377 try:
378 dinuc_count_dict[readlen][seq[i : i + 2]] += 1
379 except:
380 pass
381
382 for i in range(len(seq), 0, -1):
383 dist = i - len(seq)
384 if dist not in threeprime_nuc_count_dict["mapped"][readlen]:
385 threeprime_nuc_count_dict["mapped"][readlen][dist] = {
386 "A": 0,
387 "T": 0,
388 "G": 0,
389 "C": 0,
390 "N": 0,
391 }
392 threeprime_nuc_count_dict["mapped"][readlen][dist][
393 seq[dist]
394 ] += 1
395 ambiguously_mapped_reads += processor(
396 process_chunk,
397 master_read_dict,
398 transcriptome_info_dict,
399 master_dict,
400 prev_seq,
401 unambig_read_length_dict,
402 )
403 process_chunk = {readname: [[tran, pos, read_tags]]}
404 prev_seq = read.seq
405 else:
406 unmapped_reads += 1
407
408 # Add this unmapped read to unmapped_dict so we can see what the most frequent unmapped read is.
409 seq = read.seq
410 readlen = len(seq)
411 if seq in unmapped_dict:
412 unmapped_dict[seq] += 1
413 else:
414 unmapped_dict[seq] = 1
415
416 # Populate the nuc_count_dict with this unmapped read
417 if readlen not in nuc_count_dict["unmapped"]:
418 nuc_count_dict["unmapped"][readlen] = {}
419 for i in range(0, len(seq)):
420 if i not in nuc_count_dict["unmapped"][readlen]:
421 nuc_count_dict["unmapped"][readlen][i] = {
422 "A": 0,
423 "T": 0,
424 "G": 0,
425 "C": 0,
426 "N": 0,
427 }
428 nuc_count_dict["unmapped"][readlen][i][seq[i]] += 1
429
430 if readlen not in threeprime_nuc_count_dict["unmapped"]:
431 threeprime_nuc_count_dict["unmapped"][readlen] = {}
432
433 for i in range(len(seq), 0, -1):
434 dist = i - len(seq)
435 if dist not in threeprime_nuc_count_dict["unmapped"][readlen]:
436 threeprime_nuc_count_dict["unmapped"][readlen][dist] = {
437 "A": 0,
438 "T": 0,
439 "G": 0,
440 "C": 0,
441 "N": 0,
442 }
443 threeprime_nuc_count_dict["unmapped"][readlen][dist][seq[dist]] += 1
444
445 # add stats about mapped/unmapped reads to file dict which will be used for the final report
446 master_dict["total_bam_lines"] = total_bam_lines
447 master_dict["mapped_reads"] = mapped_reads
448 master_dict["unmapped_reads"] = unmapped_reads
449 master_read_dict["unmapped_reads"] = unmapped_reads
450 master_dict["ambiguously_mapped_reads"] = ambiguously_mapped_reads
451
452 if "read_name" in master_read_dict:
453 del master_read_dict["read_name"]
454 print("BAM file processed")
455 print("Creating metagenes, triplet periodicity plots, etc.")
456 for tran in master_read_dict:
457 try:
458 cds_start = transcriptome_info_dict[tran]["cds_start"]
459 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
460 except:
461 continue
462
463 tranlen = transcriptome_info_dict[tran]["length"]
464 # Use this to discard transcripts with no 5' leader or 3' trailer
465 if (
466 cds_start > 1
467 and cds_stop < tranlen
468 and transcriptome_info_dict[tran]["tran_type"] == 1
469 ):
470 for primetype in ["fiveprime", "threeprime"]:
471 # Create the triplet periodicity and metainfo plots based on both the 5' and 3' ends of reads
472 for readlength in master_read_dict[tran]["unambig"]:
473 # print "readlength", readlength
474 # for each fiveprime postion for this readlength within this transcript
475 for raw_pos in master_read_dict[tran]["unambig"][readlength]:
476 # print "raw pos", raw_pos
477 trip_periodicity_reads += 1
478 if primetype == "fiveprime":
479 # get the five prime postion minus the cds start postion
480 real_pos = raw_pos - cds_start
481 rel_stop_pos = raw_pos - cds_stop
482 elif primetype == "threeprime":
483 real_pos = (raw_pos + readlength) - cds_start
484 rel_stop_pos = (raw_pos + readlength) - cds_stop
485 # get the readcount at the raw postion
486 readcount = master_read_dict[tran]["unambig"][readlength][
487 raw_pos
488 ]
489 # print "readcount", readcount
490 frame = real_pos % 3
491 if real_pos >= cds_start and real_pos <= cds_stop:
492 if readlength in master_trip_dict[primetype]:
493 master_trip_dict[primetype][readlength][
494 str(frame)
495 ] += readcount
496 else:
497 master_trip_dict[primetype][readlength] = {
498 "0": 0.0,
499 "1": 0.0,
500 "2": 0.0,
501 }
502 master_trip_dict[primetype][readlength][
503 str(frame)
504 ] += readcount
505 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict
506 if real_pos > (-600) and real_pos < (601):
507 if readlength in master_offset_dict[primetype]:
508 if (
509 real_pos
510 in master_offset_dict[primetype][readlength]
511 ):
512 # print "real pos in offset dict"
513 master_offset_dict[primetype][readlength][
514 real_pos
515 ] += readcount
516 else:
517 # print "real pos not in offset dict"
518 master_offset_dict[primetype][readlength][
519 real_pos
520 ] = readcount
521 else:
522 # initiliase with zero to avoid missing neighbours below
523 # print "initialising with zeros"
524 master_offset_dict[primetype][readlength] = {}
525 for i in range(-600, 601):
526 master_offset_dict[primetype][readlength][i] = 0
527 master_offset_dict[primetype][readlength][
528 real_pos
529 ] += readcount
530
531 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict
532 if rel_stop_pos > (-600) and rel_stop_pos < (601):
533 if readlength in master_metagene_stop_dict[primetype]:
534 if (
535 rel_stop_pos
536 in master_metagene_stop_dict[primetype][readlength]
537 ):
538 master_metagene_stop_dict[primetype][readlength][
539 rel_stop_pos
540 ] += readcount
541 else:
542 master_metagene_stop_dict[primetype][readlength][
543 rel_stop_pos
544 ] = readcount
545 else:
546 # initiliase with zero to avoid missing neighbours below
547 master_metagene_stop_dict[primetype][readlength] = {}
548 for i in range(-600, 601):
549 master_metagene_stop_dict[primetype][readlength][
550 i
551 ] = 0
552 master_metagene_stop_dict[primetype][readlength][
553 rel_stop_pos
554 ] += readcount
555
556 # master trip dict is now made up of readlengths with 3 frames and a count associated with each frame
557 # create a 'score' for each readlength by putting the max frame count over the second highest frame count
558 for primetype in ["fiveprime", "threeprime"]:
559 for subreadlength in master_trip_dict[primetype]:
560 maxcount = 0
561 secondmaxcount = 0
562 for frame in master_trip_dict[primetype][subreadlength]:
563 if master_trip_dict[primetype][subreadlength][frame] > maxcount:
564 maxcount = master_trip_dict[primetype][subreadlength][frame]
565 for frame in master_trip_dict[primetype][subreadlength]:
566 if (
567 master_trip_dict[primetype][subreadlength][frame] > secondmaxcount
568 and master_trip_dict[primetype][subreadlength][frame] != maxcount
569 ):
570 secondmaxcount = master_trip_dict[primetype][subreadlength][frame]
571 # a perfect score would be 0 meaning there is only a single peak, the worst score would be 1 meaning two highest peaks are the same height
572 master_trip_dict[primetype][subreadlength]["score"] = float(
573 secondmaxcount
574 ) / float(maxcount)
575 # This part is to determine what offsets to give each read length
576 print("Calculating offsets")
577 for primetype in ["fiveprime", "threeprime"]:
578 for readlen in master_offset_dict[primetype]:
579 accepted_len = False
580 max_relative_pos = 0
581 max_relative_count = 0
582 for relative_pos in master_offset_dict[primetype][readlen]:
583 # This line is to ensure we don't choose an offset greater than the readlength (in cases of a large peak far up/downstream)
584 if abs(relative_pos) < 10 or abs(relative_pos) > (readlen - 10):
585 continue
586 if (
587 master_offset_dict[primetype][readlen][relative_pos]
588 > max_relative_count
589 ):
590 max_relative_pos = relative_pos
591 max_relative_count = master_offset_dict[primetype][readlen][
592 relative_pos
593 ]
594 # print "for readlen {} the max_relative pos is {}".format(readlen, max_relative_pos)
595 if primetype == "fiveprime":
596 # -3 to get from p-site to a-site, +1 to account for 1 based co-ordinates, resulting in -2 overall
597 final_offsets[primetype]["offsets"][readlen] = abs(max_relative_pos - 2)
598 elif primetype == "threeprime":
599 # +3 to get from p-site to a-site, -1 to account for 1 based co-ordinates, resulting in +2 overall
600 final_offsets[primetype]["offsets"][readlen] = (
601 max_relative_pos * (-1)
602 ) + 2
603 # If there are no reads in CDS regions for a specific length, it may not be present in master_trip_dict
604 if readlen in master_trip_dict[primetype]:
605 final_offsets[primetype]["read_scores"][readlen] = master_trip_dict[
606 primetype
607 ][readlen]["score"]
608 else:
609 final_offsets[primetype]["read_scores"][readlen] = 0.0
610 master_read_dict["offsets"] = final_offsets
611 master_read_dict["trip_periodicity"] = master_trip_dict
612 master_read_dict["desc"] = "Null"
613 master_read_dict["mapped_reads"] = mapped_reads
614 master_read_dict["nuc_counts"] = nuc_count_dict
615 master_read_dict["dinuc_counts"] = dinuc_count_dict
616 master_read_dict["threeprime_nuc_counts"] = threeprime_nuc_count_dict
617 master_read_dict["metagene_counts"] = master_offset_dict
618 master_read_dict["stop_metagene_counts"] = master_metagene_stop_dict
619 master_read_dict["read_lengths"] = read_length_dict
620 master_read_dict["unambig_read_lengths"] = unambig_read_length_dict
621 master_read_dict["coding_counts"] = master_dict["unambiguous_coding_count"]
622 master_read_dict["noncoding_counts"] = master_dict["unambiguous_non_coding_count"]
623 master_read_dict["ambiguous_counts"] = master_dict["ambiguously_mapped_reads"]
624 master_read_dict["frequent_unmapped_reads"] = (
625 sorted(unmapped_dict.items(), key=operator.itemgetter(1))
626 )[-2000:]
627 master_read_dict["cutadapt_removed"] = 0
628 master_read_dict["rrna_removed"] = 0
629 # If no reads are removed by minus m there won't be an entry in the log file, so initiliase with 0 first and change if there is a line
630 master_read_dict["removed_minus_m"] = 0
631 master_dict["removed_minus_m"] = 0
632 # We work out the total counts for 5', cds 3' for differential translation here, would be better to do thisn in processor but need the offsets
633 master_read_dict["unambiguous_all_totals"] = {}
634 master_read_dict["unambiguous_fiveprime_totals"] = {}
635 master_read_dict["unambiguous_cds_totals"] = {}
636 master_read_dict["unambiguous_threeprime_totals"] = {}
637
638 master_read_dict["ambiguous_all_totals"] = {}
639 master_read_dict["ambiguous_fiveprime_totals"] = {}
640 master_read_dict["ambiguous_cds_totals"] = {}
641 master_read_dict["ambiguous_threeprime_totals"] = {}
642 print("calculating transcript counts")
643 for tran in master_read_dict:
644 if tran in transcriptome_info_dict:
645 five_total = 0
646 cds_total = 0
647 three_total = 0
648
649 ambig_five_total = 0
650 ambig_cds_total = 0
651 ambig_three_total = 0
652
653 cds_start = transcriptome_info_dict[tran]["cds_start"]
654 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
655 for readlen in master_read_dict[tran]["unambig"]:
656 if readlen in final_offsets["fiveprime"]["offsets"]:
657 offset = final_offsets["fiveprime"]["offsets"][readlen]
658 else:
659 offset = 15
660 for pos in master_read_dict[tran]["unambig"][readlen]:
661 real_pos = pos + offset
662 if real_pos < cds_start:
663 five_total += master_read_dict[tran]["unambig"][readlen][pos]
664 elif real_pos >= cds_start and real_pos <= cds_stop:
665 cds_total += master_read_dict[tran]["unambig"][readlen][pos]
666 elif real_pos > cds_stop:
667 three_total += master_read_dict[tran]["unambig"][readlen][pos]
668 master_read_dict["unambiguous_all_totals"][tran] = (
669 five_total + cds_total + three_total
670 )
671 master_read_dict["unambiguous_fiveprime_totals"][tran] = five_total
672 master_read_dict["unambiguous_cds_totals"][tran] = cds_total
673 master_read_dict["unambiguous_threeprime_totals"][tran] = three_total
674
675 for readlen in master_read_dict[tran]["ambig"]:
676 if readlen in final_offsets["fiveprime"]["offsets"]:
677 offset = final_offsets["fiveprime"]["offsets"][readlen]
678 else:
679 offset = 15
680 for pos in master_read_dict[tran]["ambig"][readlen]:
681 real_pos = pos + offset
682 if real_pos < cds_start:
683 ambig_five_total += master_read_dict[tran]["ambig"][readlen][
684 pos
685 ]
686 elif real_pos >= cds_start and real_pos <= cds_stop:
687 ambig_cds_total += master_read_dict[tran]["ambig"][readlen][pos]
688 elif real_pos > cds_stop:
689 ambig_three_total += master_read_dict[tran]["ambig"][readlen][
690 pos
691 ]
692
693 master_read_dict["ambiguous_all_totals"][tran] = (
694 five_total
695 + cds_total
696 + three_total
697 + ambig_five_total
698 + ambig_cds_total
699 + ambig_three_total
700 )
701 master_read_dict["ambiguous_fiveprime_totals"][tran] = (
702 five_total + ambig_five_total
703 )
704 master_read_dict["ambiguous_cds_totals"][tran] = cds_total + ambig_cds_total
705 master_read_dict["ambiguous_threeprime_totals"][tran] = (
706 three_total + ambig_three_total
707 )
708 print("Writing out to sqlite file")
709 sqlite_db = SqliteDict(outputfile, autocommit=False)
710 for key in master_read_dict:
711 sqlite_db[key] = master_read_dict[key]
712 sqlite_db["description"] = desc
713 sqlite_db.commit()
714 sqlite_db.close()
715
716
717 if __name__ == "__main__":
718 if len(sys.argv) <= 2:
719 print(
720 "Usage: python bam_to_sqlite.py <path_to_bam_file> <path_to_organism.sqlite> <file_description (optional)>"
721 )
722 sys.exit()
723 bam_filepath = sys.argv[1]
724 annotation_sqlite_filepath = sys.argv[2]
725 try:
726 desc = sys.argv[3]
727 except:
728 desc = bam_filepath.split("/")[-1]
729
730 outputfile = sys.argv[4]
731 process_bam(bam_filepath, annotation_sqlite_filepath, outputfile, desc)