Mercurial > repos > jackcurragh > trips_viz_bam_to_sqlite
comparison trips_bam_to_sqlite/bam_to_sqlite.py @ 4:4ee95ba271a5 draft
Uploaded
author | triasteran |
---|---|
date | Mon, 25 Apr 2022 12:40:36 +0000 |
parents | |
children | 1be4bbe849e3 |
comparison
equal
deleted
inserted
replaced
3:932cdd84d51a | 4:4ee95ba271a5 |
---|---|
1 import sys | |
2 import pysam | |
3 import operator | |
4 import os | |
5 import time | |
6 import sqlite3 | |
7 from sqlitedict import SqliteDict | |
8 | |
9 | |
10 def tran_to_genome(tran, pos, transcriptome_info_dict): | |
11 # print ("tran",list(transcriptome_info_dict)) | |
12 traninfo = transcriptome_info_dict[tran] | |
13 chrom = traninfo["chrom"] | |
14 strand = traninfo["strand"] | |
15 exons = sorted(traninfo["exons"]) | |
16 # print exons | |
17 if strand == "+": | |
18 exon_start = 0 | |
19 for tup in exons: | |
20 exon_start = tup[0] | |
21 exonlen = tup[1] - tup[0] | |
22 if pos > exonlen: | |
23 pos = (pos - exonlen) - 1 | |
24 else: | |
25 break | |
26 genomic_pos = (exon_start + pos) - 1 | |
27 elif strand == "-": | |
28 exon_start = 0 | |
29 for tup in exons[::-1]: | |
30 exon_start = tup[1] | |
31 exonlen = tup[1] - tup[0] | |
32 if pos > exonlen: | |
33 pos = (pos - exonlen) - 1 | |
34 else: | |
35 break | |
36 genomic_pos = (exon_start - pos) + 1 | |
37 return (chrom, genomic_pos) | |
38 | |
39 | |
40 # Takes a dictionary with a readname as key and a list of lists as value, each sub list has consists of two elements a transcript and the position the read aligns to in the transcript | |
41 # This function will count the number of genes that the transcripts correspond to and if less than or equal to 3 will add the relevant value to transcript_counts_dict | |
42 def processor( | |
43 process_chunk, | |
44 master_read_dict, | |
45 transcriptome_info_dict, | |
46 master_dict, | |
47 readseq, | |
48 unambig_read_length_dict, | |
49 ): | |
50 readlen = len(readseq) | |
51 ambiguously_mapped_reads = 0 | |
52 # get the read name | |
53 read = list(process_chunk)[0] | |
54 | |
55 read_list = process_chunk[ | |
56 read | |
57 ] # a list of lists of all transcripts the read aligns to and the positions | |
58 # used to store different genomic poistions | |
59 genomic_positions = [] | |
60 | |
61 # This section is just to get the different genomic positions the read aligns to | |
62 | |
63 for listname in process_chunk[read]: | |
64 | |
65 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "") | |
66 | |
67 pos = int(listname[1]) | |
68 genomic_pos = tran_to_genome(tran, pos, transcriptome_info_dict) | |
69 # print ("genomic pos",genomic_pos) | |
70 if genomic_pos not in genomic_positions: | |
71 genomic_positions.append(genomic_pos) | |
72 | |
73 # If the read maps unambiguously | |
74 if len(genomic_positions) == 1: | |
75 if readlen not in unambig_read_length_dict: | |
76 unambig_read_length_dict[readlen] = 0 | |
77 unambig_read_length_dict[readlen] += 1 | |
78 # assume this read aligns to a noncoding position, if we find that it does align to a coding region change this to True | |
79 coding = False | |
80 | |
81 # For each transcript this read alings to | |
82 for listname in process_chunk[read]: | |
83 # get the transcript name | |
84 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "") | |
85 # If we haven't come across this transcript already then add to master_read_dict | |
86 if tran not in master_read_dict: | |
87 master_read_dict[tran] = { | |
88 "ambig": {}, | |
89 "unambig": {}, | |
90 "mismatches": {}, | |
91 "seq": {}, | |
92 } | |
93 # get the raw unedited positon, and read tags | |
94 pos = int(listname[1]) | |
95 read_tags = listname[2] | |
96 # If there is mismatches in this line, then modify the postion and readlen (if mismatches at start or end) and add mismatches to dictionary | |
97 nm_tag = 0 | |
98 | |
99 for tag in read_tags: | |
100 if tag[0] == "NM": | |
101 nm_tag = int(tag[1]) | |
102 if nm_tag > 0: | |
103 md_tag = "" | |
104 for tag in read_tags: | |
105 if tag[0] == "MD": | |
106 md_tag = tag[1] | |
107 pos_modifier, readlen_modifier, mismatches = get_mismatch_pos( | |
108 md_tag, pos, readlen, master_read_dict, tran, readseq | |
109 ) | |
110 # Count the mismatches (we only do this for unambiguous) | |
111 for mismatch in mismatches: | |
112 # Ignore mismatches appearing in the first position (due to non templated addition) | |
113 if mismatch != 0: | |
114 char = mismatches[mismatch] | |
115 mismatch_pos = pos + mismatch | |
116 if mismatch_pos not in master_read_dict[tran]["seq"]: | |
117 master_read_dict[tran]["seq"][mismatch_pos] = {} | |
118 if char not in master_read_dict[tran]["seq"][mismatch_pos]: | |
119 master_read_dict[tran]["seq"][mismatch_pos][char] = 0 | |
120 master_read_dict[tran]["seq"][mismatch_pos][char] += 1 | |
121 # apply the modifiers | |
122 # pos = pos+pos_modifier | |
123 # readlen = readlen - readlen_modifier | |
124 | |
125 try: | |
126 cds_start = transcriptome_info_dict[tran]["cds_start"] | |
127 cds_stop = transcriptome_info_dict[tran]["cds_stop"] | |
128 | |
129 if pos >= cds_start and pos <= cds_stop: | |
130 coding = True | |
131 except: | |
132 pass | |
133 | |
134 if readlen in master_read_dict[tran]["unambig"]: | |
135 if pos in master_read_dict[tran]["unambig"][readlen]: | |
136 master_read_dict[tran]["unambig"][readlen][pos] += 1 | |
137 else: | |
138 master_read_dict[tran]["unambig"][readlen][pos] = 1 | |
139 else: | |
140 master_read_dict[tran]["unambig"][readlen] = {pos: 1} | |
141 | |
142 if coding == True: | |
143 master_dict["unambiguous_coding_count"] += 1 | |
144 elif coding == False: | |
145 master_dict["unambiguous_non_coding_count"] += 1 | |
146 | |
147 else: | |
148 ambiguously_mapped_reads += 1 | |
149 for listname in process_chunk[read]: | |
150 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "") | |
151 if tran not in master_read_dict: | |
152 master_read_dict[tran] = { | |
153 "ambig": {}, | |
154 "unambig": {}, | |
155 "mismatches": {}, | |
156 "seq": {}, | |
157 } | |
158 pos = int(listname[1]) | |
159 read_tags = listname[2] | |
160 nm_tag = 0 | |
161 for tag in read_tags: | |
162 if tag[0] == "NM": | |
163 nm_tag = int(tag[1]) | |
164 if nm_tag > 0: | |
165 md_tag = "" | |
166 for tag in read_tags: | |
167 if tag[0] == "MD": | |
168 md_tag = tag[1] | |
169 pos_modifier, readlen_modifier, mismatches = get_mismatch_pos( | |
170 md_tag, pos, readlen, master_read_dict, tran, readseq | |
171 ) | |
172 # apply the modifiers | |
173 # pos = pos+pos_modifier | |
174 # readlen = readlen - readlen_modifier | |
175 if readlen in master_read_dict[tran]["ambig"]: | |
176 if pos in master_read_dict[tran]["ambig"][readlen]: | |
177 master_read_dict[tran]["ambig"][readlen][pos] += 1 | |
178 else: | |
179 master_read_dict[tran]["ambig"][readlen][pos] = 1 | |
180 else: | |
181 master_read_dict[tran]["ambig"][readlen] = {pos: 1} | |
182 return ambiguously_mapped_reads | |
183 | |
184 | |
185 def get_mismatch_pos(md_tag, pos, readlen, master_read_dict, tran, readseq): | |
186 nucs = ["A", "T", "G", "C"] | |
187 mismatches = {} | |
188 total_so_far = 0 | |
189 prev_char = "" | |
190 for char in md_tag: | |
191 if char in nucs: | |
192 if prev_char != "": | |
193 total_so_far += int(prev_char) | |
194 prev_char = "" | |
195 mismatches[total_so_far + len(mismatches)] = readseq[ | |
196 total_so_far + len(mismatches) | |
197 ] | |
198 else: | |
199 if char != "^" and char != "N": | |
200 if prev_char == "": | |
201 prev_char = char | |
202 else: | |
203 total_so_far += int(prev_char + char) | |
204 prev_char = "" | |
205 readlen_modifier = 0 | |
206 pos_modifier = 0 | |
207 five_ok = False | |
208 three_ok = False | |
209 while five_ok == False: | |
210 for i in range(0, readlen): | |
211 if i in mismatches: | |
212 pos_modifier += 1 | |
213 readlen_modifier += 1 | |
214 else: | |
215 five_ok = True | |
216 break | |
217 five_ok = True | |
218 | |
219 while three_ok == False: | |
220 for i in range(readlen - 1, 0, -1): | |
221 if i in mismatches: | |
222 readlen_modifier += 1 | |
223 else: | |
224 three_ok = True | |
225 break | |
226 three_ok = True | |
227 | |
228 return (pos_modifier, readlen_modifier, mismatches) | |
229 | |
230 | |
231 def process_bam(bam_filepath, transcriptome_info_dict_path, outputfile, desc): | |
232 desc = desc | |
233 start_time = time.time() | |
234 study_dict = {} | |
235 nuc_count_dict = {"mapped": {}, "unmapped": {}} | |
236 dinuc_count_dict = {} | |
237 threeprime_nuc_count_dict = {"mapped": {}, "unmapped": {}} | |
238 read_length_dict = {} | |
239 unambig_read_length_dict = {} | |
240 unmapped_dict = {} | |
241 master_dict = { | |
242 "unambiguous_non_coding_count": 0, | |
243 "unambiguous_coding_count": 0, | |
244 "current_dir": os.getcwd(), | |
245 } | |
246 | |
247 transcriptome_info_dict = {} | |
248 connection = sqlite3.connect(transcriptome_info_dict_path) | |
249 cursor = connection.cursor() | |
250 cursor.execute( | |
251 "SELECT transcript,cds_start,cds_stop,length,strand,chrom,tran_type from transcripts;" | |
252 ) | |
253 result = cursor.fetchall() | |
254 for row in result: | |
255 transcriptome_info_dict[str(row[0])] = { | |
256 "cds_start": row[1], | |
257 "cds_stop": row[2], | |
258 "length": row[3], | |
259 "strand": row[4], | |
260 "chrom": row[5], | |
261 "exons": [], | |
262 "tran_type": row[6], | |
263 } | |
264 # print list(transcriptome_info_dict)[:10] | |
265 | |
266 cursor.execute("SELECT * from exons;") | |
267 result = cursor.fetchall() | |
268 for row in result: | |
269 transcriptome_info_dict[str(row[0])]["exons"].append((row[1], row[2])) | |
270 | |
271 # it might be the case that there are no multimappers, so set this to 0 first to avoid an error, it will be overwritten later if there is multimappers | |
272 multimappers = 0 | |
273 unmapped_reads = 0 | |
274 unambiguous_coding_count = 0 | |
275 unambiguous_non_coding_count = 0 | |
276 trip_periodicity_reads = 0 | |
277 | |
278 final_offsets = { | |
279 "fiveprime": {"offsets": {}, "read_scores": {}}, | |
280 "threeprime": {"offsets": {}, "read_scores": {}}, | |
281 } | |
282 master_read_dict = {} | |
283 prev_seq = "" | |
284 process_chunk = {"read_name": [["placeholder_tran", "1", "28"]]} | |
285 mapped_reads = 0 | |
286 ambiguously_mapped_reads = 0 | |
287 master_trip_dict = {"fiveprime": {}, "threeprime": {}} | |
288 master_offset_dict = {"fiveprime": {}, "threeprime": {}} | |
289 master_metagene_stop_dict = {"fiveprime": {}, "threeprime": {}} | |
290 | |
291 os.system(f'samtools sort -n {bam_filepath} -o {bam_filepath}_n_sorted.bam') | |
292 pysam.set_verbosity(0) | |
293 infile = pysam.Samfile(f"{bam_filepath}_n_sorted.bam", "rb") | |
294 header = infile.header["HD"] | |
295 unsorted = False | |
296 if "SO" in header: | |
297 if header["SO"] != "queryname": | |
298 unsorted = True | |
299 else: | |
300 unsorted = True | |
301 if unsorted == True: | |
302 print( | |
303 "ERROR: Bam file appears to be unsorted or not sorted by read name. To sort by read name use the command: samtools sort -n input.bam output.bam" | |
304 ) | |
305 print(header, bam_filepath) | |
306 sys.exit() | |
307 total_bam_lines = 0 | |
308 all_ref_ids = infile.references | |
309 | |
310 for read in infile.fetch(until_eof=True): | |
311 total_bam_lines += 1 | |
312 if not read.is_unmapped: | |
313 ref = read.reference_id | |
314 tran = (all_ref_ids[ref]).split(".")[0] | |
315 mapped_reads += 1 | |
316 if mapped_reads % 1000000 == 0: | |
317 print( | |
318 "{} reads parsed at {}".format( | |
319 mapped_reads, (time.time() - start_time) | |
320 ) | |
321 ) | |
322 pos = read.reference_start | |
323 readname = read.query_name | |
324 read_tags = read.tags | |
325 if readname == list(process_chunk)[0]: | |
326 process_chunk[readname].append([tran, pos, read_tags]) | |
327 # if the current read is different from previous reads send 'process_chunk' to the 'processor' function, then start 'process_chunk' over using current read | |
328 else: | |
329 if list(process_chunk)[0] != "read_name": | |
330 | |
331 # At this point we work out readseq, we do this for multiple reasons, firstly so we don't count the sequence from a read multiple times, just because | |
332 # it aligns multiple times and secondly we only call read.seq once (read.seq is computationally expensive) | |
333 seq = read.seq | |
334 readlen = len(seq) | |
335 | |
336 # Note if a read maps ambiguously it will still be counted toward the read length distribution (however it will only be counted once, not each time it maps) | |
337 if readlen not in read_length_dict: | |
338 read_length_dict[readlen] = 0 | |
339 read_length_dict[readlen] += 1 | |
340 | |
341 if readlen not in nuc_count_dict["mapped"]: | |
342 nuc_count_dict["mapped"][readlen] = {} | |
343 if readlen not in threeprime_nuc_count_dict["mapped"]: | |
344 threeprime_nuc_count_dict["mapped"][readlen] = {} | |
345 if readlen not in dinuc_count_dict: | |
346 dinuc_count_dict[readlen] = { | |
347 "AA": 0, | |
348 "TA": 0, | |
349 "GA": 0, | |
350 "CA": 0, | |
351 "AT": 0, | |
352 "TT": 0, | |
353 "GT": 0, | |
354 "CT": 0, | |
355 "AG": 0, | |
356 "TG": 0, | |
357 "GG": 0, | |
358 "CG": 0, | |
359 "AC": 0, | |
360 "TC": 0, | |
361 "GC": 0, | |
362 "CC": 0, | |
363 } | |
364 | |
365 for i in range(0, len(seq)): | |
366 if i not in nuc_count_dict["mapped"][readlen]: | |
367 nuc_count_dict["mapped"][readlen][i] = { | |
368 "A": 0, | |
369 "T": 0, | |
370 "G": 0, | |
371 "C": 0, | |
372 "N": 0, | |
373 } | |
374 nuc_count_dict["mapped"][readlen][i][seq[i]] += 1 | |
375 | |
376 for i in range(0, len(seq)): | |
377 try: | |
378 dinuc_count_dict[readlen][seq[i : i + 2]] += 1 | |
379 except: | |
380 pass | |
381 | |
382 for i in range(len(seq), 0, -1): | |
383 dist = i - len(seq) | |
384 if dist not in threeprime_nuc_count_dict["mapped"][readlen]: | |
385 threeprime_nuc_count_dict["mapped"][readlen][dist] = { | |
386 "A": 0, | |
387 "T": 0, | |
388 "G": 0, | |
389 "C": 0, | |
390 "N": 0, | |
391 } | |
392 threeprime_nuc_count_dict["mapped"][readlen][dist][ | |
393 seq[dist] | |
394 ] += 1 | |
395 ambiguously_mapped_reads += processor( | |
396 process_chunk, | |
397 master_read_dict, | |
398 transcriptome_info_dict, | |
399 master_dict, | |
400 prev_seq, | |
401 unambig_read_length_dict, | |
402 ) | |
403 process_chunk = {readname: [[tran, pos, read_tags]]} | |
404 prev_seq = read.seq | |
405 else: | |
406 unmapped_reads += 1 | |
407 | |
408 # Add this unmapped read to unmapped_dict so we can see what the most frequent unmapped read is. | |
409 seq = read.seq | |
410 readlen = len(seq) | |
411 if seq in unmapped_dict: | |
412 unmapped_dict[seq] += 1 | |
413 else: | |
414 unmapped_dict[seq] = 1 | |
415 | |
416 # Populate the nuc_count_dict with this unmapped read | |
417 if readlen not in nuc_count_dict["unmapped"]: | |
418 nuc_count_dict["unmapped"][readlen] = {} | |
419 for i in range(0, len(seq)): | |
420 if i not in nuc_count_dict["unmapped"][readlen]: | |
421 nuc_count_dict["unmapped"][readlen][i] = { | |
422 "A": 0, | |
423 "T": 0, | |
424 "G": 0, | |
425 "C": 0, | |
426 "N": 0, | |
427 } | |
428 nuc_count_dict["unmapped"][readlen][i][seq[i]] += 1 | |
429 | |
430 if readlen not in threeprime_nuc_count_dict["unmapped"]: | |
431 threeprime_nuc_count_dict["unmapped"][readlen] = {} | |
432 | |
433 for i in range(len(seq), 0, -1): | |
434 dist = i - len(seq) | |
435 if dist not in threeprime_nuc_count_dict["unmapped"][readlen]: | |
436 threeprime_nuc_count_dict["unmapped"][readlen][dist] = { | |
437 "A": 0, | |
438 "T": 0, | |
439 "G": 0, | |
440 "C": 0, | |
441 "N": 0, | |
442 } | |
443 threeprime_nuc_count_dict["unmapped"][readlen][dist][seq[dist]] += 1 | |
444 | |
445 # add stats about mapped/unmapped reads to file dict which will be used for the final report | |
446 master_dict["total_bam_lines"] = total_bam_lines | |
447 master_dict["mapped_reads"] = mapped_reads | |
448 master_dict["unmapped_reads"] = unmapped_reads | |
449 master_read_dict["unmapped_reads"] = unmapped_reads | |
450 master_dict["ambiguously_mapped_reads"] = ambiguously_mapped_reads | |
451 | |
452 if "read_name" in master_read_dict: | |
453 del master_read_dict["read_name"] | |
454 print("BAM file processed") | |
455 print("Creating metagenes, triplet periodicity plots, etc.") | |
456 for tran in master_read_dict: | |
457 try: | |
458 cds_start = transcriptome_info_dict[tran]["cds_start"] | |
459 cds_stop = transcriptome_info_dict[tran]["cds_stop"] | |
460 except: | |
461 continue | |
462 | |
463 tranlen = transcriptome_info_dict[tran]["length"] | |
464 # Use this to discard transcripts with no 5' leader or 3' trailer | |
465 if ( | |
466 cds_start > 1 | |
467 and cds_stop < tranlen | |
468 and transcriptome_info_dict[tran]["tran_type"] == 1 | |
469 ): | |
470 for primetype in ["fiveprime", "threeprime"]: | |
471 # Create the triplet periodicity and metainfo plots based on both the 5' and 3' ends of reads | |
472 for readlength in master_read_dict[tran]["unambig"]: | |
473 # print "readlength", readlength | |
474 # for each fiveprime postion for this readlength within this transcript | |
475 for raw_pos in master_read_dict[tran]["unambig"][readlength]: | |
476 # print "raw pos", raw_pos | |
477 trip_periodicity_reads += 1 | |
478 if primetype == "fiveprime": | |
479 # get the five prime postion minus the cds start postion | |
480 real_pos = raw_pos - cds_start | |
481 rel_stop_pos = raw_pos - cds_stop | |
482 elif primetype == "threeprime": | |
483 real_pos = (raw_pos + readlength) - cds_start | |
484 rel_stop_pos = (raw_pos + readlength) - cds_stop | |
485 # get the readcount at the raw postion | |
486 readcount = master_read_dict[tran]["unambig"][readlength][ | |
487 raw_pos | |
488 ] | |
489 # print "readcount", readcount | |
490 frame = real_pos % 3 | |
491 if real_pos >= cds_start and real_pos <= cds_stop: | |
492 if readlength in master_trip_dict[primetype]: | |
493 master_trip_dict[primetype][readlength][ | |
494 str(frame) | |
495 ] += readcount | |
496 else: | |
497 master_trip_dict[primetype][readlength] = { | |
498 "0": 0.0, | |
499 "1": 0.0, | |
500 "2": 0.0, | |
501 } | |
502 master_trip_dict[primetype][readlength][ | |
503 str(frame) | |
504 ] += readcount | |
505 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict | |
506 if real_pos > (-600) and real_pos < (601): | |
507 if readlength in master_offset_dict[primetype]: | |
508 if ( | |
509 real_pos | |
510 in master_offset_dict[primetype][readlength] | |
511 ): | |
512 # print "real pos in offset dict" | |
513 master_offset_dict[primetype][readlength][ | |
514 real_pos | |
515 ] += readcount | |
516 else: | |
517 # print "real pos not in offset dict" | |
518 master_offset_dict[primetype][readlength][ | |
519 real_pos | |
520 ] = readcount | |
521 else: | |
522 # initiliase with zero to avoid missing neighbours below | |
523 # print "initialising with zeros" | |
524 master_offset_dict[primetype][readlength] = {} | |
525 for i in range(-600, 601): | |
526 master_offset_dict[primetype][readlength][i] = 0 | |
527 master_offset_dict[primetype][readlength][ | |
528 real_pos | |
529 ] += readcount | |
530 | |
531 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict | |
532 if rel_stop_pos > (-600) and rel_stop_pos < (601): | |
533 if readlength in master_metagene_stop_dict[primetype]: | |
534 if ( | |
535 rel_stop_pos | |
536 in master_metagene_stop_dict[primetype][readlength] | |
537 ): | |
538 master_metagene_stop_dict[primetype][readlength][ | |
539 rel_stop_pos | |
540 ] += readcount | |
541 else: | |
542 master_metagene_stop_dict[primetype][readlength][ | |
543 rel_stop_pos | |
544 ] = readcount | |
545 else: | |
546 # initiliase with zero to avoid missing neighbours below | |
547 master_metagene_stop_dict[primetype][readlength] = {} | |
548 for i in range(-600, 601): | |
549 master_metagene_stop_dict[primetype][readlength][ | |
550 i | |
551 ] = 0 | |
552 master_metagene_stop_dict[primetype][readlength][ | |
553 rel_stop_pos | |
554 ] += readcount | |
555 | |
556 # master trip dict is now made up of readlengths with 3 frames and a count associated with each frame | |
557 # create a 'score' for each readlength by putting the max frame count over the second highest frame count | |
558 for primetype in ["fiveprime", "threeprime"]: | |
559 for subreadlength in master_trip_dict[primetype]: | |
560 maxcount = 0 | |
561 secondmaxcount = 0 | |
562 for frame in master_trip_dict[primetype][subreadlength]: | |
563 if master_trip_dict[primetype][subreadlength][frame] > maxcount: | |
564 maxcount = master_trip_dict[primetype][subreadlength][frame] | |
565 for frame in master_trip_dict[primetype][subreadlength]: | |
566 if ( | |
567 master_trip_dict[primetype][subreadlength][frame] > secondmaxcount | |
568 and master_trip_dict[primetype][subreadlength][frame] != maxcount | |
569 ): | |
570 secondmaxcount = master_trip_dict[primetype][subreadlength][frame] | |
571 # a perfect score would be 0 meaning there is only a single peak, the worst score would be 1 meaning two highest peaks are the same height | |
572 master_trip_dict[primetype][subreadlength]["score"] = float( | |
573 secondmaxcount | |
574 ) / float(maxcount) | |
575 # This part is to determine what offsets to give each read length | |
576 print("Calculating offsets") | |
577 for primetype in ["fiveprime", "threeprime"]: | |
578 for readlen in master_offset_dict[primetype]: | |
579 accepted_len = False | |
580 max_relative_pos = 0 | |
581 max_relative_count = 0 | |
582 for relative_pos in master_offset_dict[primetype][readlen]: | |
583 # This line is to ensure we don't choose an offset greater than the readlength (in cases of a large peak far up/downstream) | |
584 if abs(relative_pos) < 10 or abs(relative_pos) > (readlen - 10): | |
585 continue | |
586 if ( | |
587 master_offset_dict[primetype][readlen][relative_pos] | |
588 > max_relative_count | |
589 ): | |
590 max_relative_pos = relative_pos | |
591 max_relative_count = master_offset_dict[primetype][readlen][ | |
592 relative_pos | |
593 ] | |
594 # print "for readlen {} the max_relative pos is {}".format(readlen, max_relative_pos) | |
595 if primetype == "fiveprime": | |
596 # -3 to get from p-site to a-site, +1 to account for 1 based co-ordinates, resulting in -2 overall | |
597 final_offsets[primetype]["offsets"][readlen] = abs(max_relative_pos - 2) | |
598 elif primetype == "threeprime": | |
599 # +3 to get from p-site to a-site, -1 to account for 1 based co-ordinates, resulting in +2 overall | |
600 final_offsets[primetype]["offsets"][readlen] = ( | |
601 max_relative_pos * (-1) | |
602 ) + 2 | |
603 # If there are no reads in CDS regions for a specific length, it may not be present in master_trip_dict | |
604 if readlen in master_trip_dict[primetype]: | |
605 final_offsets[primetype]["read_scores"][readlen] = master_trip_dict[ | |
606 primetype | |
607 ][readlen]["score"] | |
608 else: | |
609 final_offsets[primetype]["read_scores"][readlen] = 0.0 | |
610 master_read_dict["offsets"] = final_offsets | |
611 master_read_dict["trip_periodicity"] = master_trip_dict | |
612 master_read_dict["desc"] = "Null" | |
613 master_read_dict["mapped_reads"] = mapped_reads | |
614 master_read_dict["nuc_counts"] = nuc_count_dict | |
615 master_read_dict["dinuc_counts"] = dinuc_count_dict | |
616 master_read_dict["threeprime_nuc_counts"] = threeprime_nuc_count_dict | |
617 master_read_dict["metagene_counts"] = master_offset_dict | |
618 master_read_dict["stop_metagene_counts"] = master_metagene_stop_dict | |
619 master_read_dict["read_lengths"] = read_length_dict | |
620 master_read_dict["unambig_read_lengths"] = unambig_read_length_dict | |
621 master_read_dict["coding_counts"] = master_dict["unambiguous_coding_count"] | |
622 master_read_dict["noncoding_counts"] = master_dict["unambiguous_non_coding_count"] | |
623 master_read_dict["ambiguous_counts"] = master_dict["ambiguously_mapped_reads"] | |
624 master_read_dict["frequent_unmapped_reads"] = ( | |
625 sorted(unmapped_dict.items(), key=operator.itemgetter(1)) | |
626 )[-2000:] | |
627 master_read_dict["cutadapt_removed"] = 0 | |
628 master_read_dict["rrna_removed"] = 0 | |
629 # If no reads are removed by minus m there won't be an entry in the log file, so initiliase with 0 first and change if there is a line | |
630 master_read_dict["removed_minus_m"] = 0 | |
631 master_dict["removed_minus_m"] = 0 | |
632 # We work out the total counts for 5', cds 3' for differential translation here, would be better to do thisn in processor but need the offsets | |
633 master_read_dict["unambiguous_all_totals"] = {} | |
634 master_read_dict["unambiguous_fiveprime_totals"] = {} | |
635 master_read_dict["unambiguous_cds_totals"] = {} | |
636 master_read_dict["unambiguous_threeprime_totals"] = {} | |
637 | |
638 master_read_dict["ambiguous_all_totals"] = {} | |
639 master_read_dict["ambiguous_fiveprime_totals"] = {} | |
640 master_read_dict["ambiguous_cds_totals"] = {} | |
641 master_read_dict["ambiguous_threeprime_totals"] = {} | |
642 print("calculating transcript counts") | |
643 for tran in master_read_dict: | |
644 if tran in transcriptome_info_dict: | |
645 five_total = 0 | |
646 cds_total = 0 | |
647 three_total = 0 | |
648 | |
649 ambig_five_total = 0 | |
650 ambig_cds_total = 0 | |
651 ambig_three_total = 0 | |
652 | |
653 cds_start = transcriptome_info_dict[tran]["cds_start"] | |
654 cds_stop = transcriptome_info_dict[tran]["cds_stop"] | |
655 for readlen in master_read_dict[tran]["unambig"]: | |
656 if readlen in final_offsets["fiveprime"]["offsets"]: | |
657 offset = final_offsets["fiveprime"]["offsets"][readlen] | |
658 else: | |
659 offset = 15 | |
660 for pos in master_read_dict[tran]["unambig"][readlen]: | |
661 real_pos = pos + offset | |
662 if real_pos < cds_start: | |
663 five_total += master_read_dict[tran]["unambig"][readlen][pos] | |
664 elif real_pos >= cds_start and real_pos <= cds_stop: | |
665 cds_total += master_read_dict[tran]["unambig"][readlen][pos] | |
666 elif real_pos > cds_stop: | |
667 three_total += master_read_dict[tran]["unambig"][readlen][pos] | |
668 master_read_dict["unambiguous_all_totals"][tran] = ( | |
669 five_total + cds_total + three_total | |
670 ) | |
671 master_read_dict["unambiguous_fiveprime_totals"][tran] = five_total | |
672 master_read_dict["unambiguous_cds_totals"][tran] = cds_total | |
673 master_read_dict["unambiguous_threeprime_totals"][tran] = three_total | |
674 | |
675 for readlen in master_read_dict[tran]["ambig"]: | |
676 if readlen in final_offsets["fiveprime"]["offsets"]: | |
677 offset = final_offsets["fiveprime"]["offsets"][readlen] | |
678 else: | |
679 offset = 15 | |
680 for pos in master_read_dict[tran]["ambig"][readlen]: | |
681 real_pos = pos + offset | |
682 if real_pos < cds_start: | |
683 ambig_five_total += master_read_dict[tran]["ambig"][readlen][ | |
684 pos | |
685 ] | |
686 elif real_pos >= cds_start and real_pos <= cds_stop: | |
687 ambig_cds_total += master_read_dict[tran]["ambig"][readlen][pos] | |
688 elif real_pos > cds_stop: | |
689 ambig_three_total += master_read_dict[tran]["ambig"][readlen][ | |
690 pos | |
691 ] | |
692 | |
693 master_read_dict["ambiguous_all_totals"][tran] = ( | |
694 five_total | |
695 + cds_total | |
696 + three_total | |
697 + ambig_five_total | |
698 + ambig_cds_total | |
699 + ambig_three_total | |
700 ) | |
701 master_read_dict["ambiguous_fiveprime_totals"][tran] = ( | |
702 five_total + ambig_five_total | |
703 ) | |
704 master_read_dict["ambiguous_cds_totals"][tran] = cds_total + ambig_cds_total | |
705 master_read_dict["ambiguous_threeprime_totals"][tran] = ( | |
706 three_total + ambig_three_total | |
707 ) | |
708 print("Writing out to sqlite file") | |
709 sqlite_db = SqliteDict(outputfile, autocommit=False) | |
710 for key in master_read_dict: | |
711 sqlite_db[key] = master_read_dict[key] | |
712 sqlite_db["description"] = desc | |
713 sqlite_db.commit() | |
714 sqlite_db.close() | |
715 | |
716 | |
717 if __name__ == "__main__": | |
718 if len(sys.argv) <= 2: | |
719 print( | |
720 "Usage: python bam_to_sqlite.py <path_to_bam_file> <path_to_organism.sqlite> <file_description (optional)>" | |
721 ) | |
722 sys.exit() | |
723 bam_filepath = sys.argv[1] | |
724 annotation_sqlite_filepath = sys.argv[2] | |
725 try: | |
726 desc = sys.argv[3] | |
727 except: | |
728 desc = bam_filepath.split("/")[-1] | |
729 | |
730 outputfile = sys.argv[4] | |
731 process_bam(bam_filepath, annotation_sqlite_filepath, outputfile, desc) |