0
|
1 import sys
|
|
2 import pysam
|
|
3 import operator
|
|
4 import os
|
|
5 import time
|
|
6 import sqlite3
|
|
7 from sqlitedict import SqliteDict
|
|
8
|
|
9
|
|
10 def tran_to_genome(tran, pos, transcriptome_info_dict):
|
|
11 # print ("tran",list(transcriptome_info_dict))
|
|
12 traninfo = transcriptome_info_dict[tran]
|
|
13 chrom = traninfo["chrom"]
|
|
14 strand = traninfo["strand"]
|
|
15 exons = sorted(traninfo["exons"])
|
|
16 # print exons
|
|
17 if strand == "+":
|
|
18 exon_start = 0
|
|
19 for tup in exons:
|
|
20 exon_start = tup[0]
|
|
21 exonlen = tup[1] - tup[0]
|
|
22 if pos > exonlen:
|
|
23 pos = (pos - exonlen) - 1
|
|
24 else:
|
|
25 break
|
|
26 genomic_pos = (exon_start + pos) - 1
|
|
27 elif strand == "-":
|
|
28 exon_start = 0
|
|
29 for tup in exons[::-1]:
|
|
30 exon_start = tup[1]
|
|
31 exonlen = tup[1] - tup[0]
|
|
32 if pos > exonlen:
|
|
33 pos = (pos - exonlen) - 1
|
|
34 else:
|
|
35 break
|
|
36 genomic_pos = (exon_start - pos) + 1
|
|
37 return (chrom, genomic_pos)
|
|
38
|
|
39
|
|
40 # Takes a dictionary with a readname as key and a list of lists as value, each sub list has consists of two elements a transcript and the position the read aligns to in the transcript
|
|
41 # This function will count the number of genes that the transcripts correspond to and if less than or equal to 3 will add the relevant value to transcript_counts_dict
|
|
42 def processor(
|
|
43 process_chunk,
|
|
44 master_read_dict,
|
|
45 transcriptome_info_dict,
|
|
46 master_dict,
|
|
47 readseq,
|
|
48 unambig_read_length_dict,
|
|
49 ):
|
|
50 readlen = len(readseq)
|
|
51 ambiguously_mapped_reads = 0
|
|
52 # get the read name
|
|
53 read = list(process_chunk)[0]
|
|
54
|
|
55 read_list = process_chunk[
|
|
56 read
|
|
57 ] # a list of lists of all transcripts the read aligns to and the positions
|
|
58 # used to store different genomic poistions
|
|
59 genomic_positions = []
|
|
60
|
|
61 # This section is just to get the different genomic positions the read aligns to
|
|
62
|
|
63 for listname in process_chunk[read]:
|
|
64
|
|
65 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "")
|
|
66
|
|
67 pos = int(listname[1])
|
|
68 genomic_pos = tran_to_genome(tran, pos, transcriptome_info_dict)
|
|
69 # print ("genomic pos",genomic_pos)
|
|
70 if genomic_pos not in genomic_positions:
|
|
71 genomic_positions.append(genomic_pos)
|
|
72
|
|
73 # If the read maps unambiguously
|
|
74 if len(genomic_positions) == 1:
|
|
75 if readlen not in unambig_read_length_dict:
|
|
76 unambig_read_length_dict[readlen] = 0
|
|
77 unambig_read_length_dict[readlen] += 1
|
|
78 # assume this read aligns to a noncoding position, if we find that it does align to a coding region change this to True
|
|
79 coding = False
|
|
80
|
|
81 # For each transcript this read alings to
|
|
82 for listname in process_chunk[read]:
|
|
83 # get the transcript name
|
|
84 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "")
|
|
85 # If we haven't come across this transcript already then add to master_read_dict
|
|
86 if tran not in master_read_dict:
|
|
87 master_read_dict[tran] = {
|
|
88 "ambig": {},
|
|
89 "unambig": {},
|
|
90 "mismatches": {},
|
|
91 "seq": {},
|
|
92 }
|
|
93 # get the raw unedited positon, and read tags
|
|
94 pos = int(listname[1])
|
|
95 read_tags = listname[2]
|
|
96 # If there is mismatches in this line, then modify the postion and readlen (if mismatches at start or end) and add mismatches to dictionary
|
|
97 nm_tag = 0
|
|
98
|
|
99 for tag in read_tags:
|
|
100 if tag[0] == "NM":
|
|
101 nm_tag = int(tag[1])
|
|
102 if nm_tag > 0:
|
|
103 md_tag = ""
|
|
104 for tag in read_tags:
|
|
105 if tag[0] == "MD":
|
|
106 md_tag = tag[1]
|
|
107 pos_modifier, readlen_modifier, mismatches = get_mismatch_pos(
|
|
108 md_tag, pos, readlen, master_read_dict, tran, readseq
|
|
109 )
|
|
110 # Count the mismatches (we only do this for unambiguous)
|
|
111 for mismatch in mismatches:
|
|
112 # Ignore mismatches appearing in the first position (due to non templated addition)
|
|
113 if mismatch != 0:
|
|
114 char = mismatches[mismatch]
|
|
115 mismatch_pos = pos + mismatch
|
|
116 if mismatch_pos not in master_read_dict[tran]["seq"]:
|
|
117 master_read_dict[tran]["seq"][mismatch_pos] = {}
|
|
118 if char not in master_read_dict[tran]["seq"][mismatch_pos]:
|
|
119 master_read_dict[tran]["seq"][mismatch_pos][char] = 0
|
|
120 master_read_dict[tran]["seq"][mismatch_pos][char] += 1
|
|
121 # apply the modifiers
|
|
122 # pos = pos+pos_modifier
|
|
123 # readlen = readlen - readlen_modifier
|
|
124
|
|
125 try:
|
|
126 cds_start = transcriptome_info_dict[tran]["cds_start"]
|
|
127 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
|
|
128
|
|
129 if pos >= cds_start and pos <= cds_stop:
|
|
130 coding = True
|
|
131 except:
|
|
132 pass
|
|
133
|
|
134 if readlen in master_read_dict[tran]["unambig"]:
|
|
135 if pos in master_read_dict[tran]["unambig"][readlen]:
|
|
136 master_read_dict[tran]["unambig"][readlen][pos] += 1
|
|
137 else:
|
|
138 master_read_dict[tran]["unambig"][readlen][pos] = 1
|
|
139 else:
|
|
140 master_read_dict[tran]["unambig"][readlen] = {pos: 1}
|
|
141
|
|
142 if coding == True:
|
|
143 master_dict["unambiguous_coding_count"] += 1
|
|
144 elif coding == False:
|
|
145 master_dict["unambiguous_non_coding_count"] += 1
|
|
146
|
|
147 else:
|
|
148 ambiguously_mapped_reads += 1
|
|
149 for listname in process_chunk[read]:
|
|
150 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "")
|
|
151 if tran not in master_read_dict:
|
|
152 master_read_dict[tran] = {
|
|
153 "ambig": {},
|
|
154 "unambig": {},
|
|
155 "mismatches": {},
|
|
156 "seq": {},
|
|
157 }
|
|
158 pos = int(listname[1])
|
|
159 read_tags = listname[2]
|
|
160 nm_tag = 0
|
|
161 for tag in read_tags:
|
|
162 if tag[0] == "NM":
|
|
163 nm_tag = int(tag[1])
|
|
164 if nm_tag > 0:
|
|
165 md_tag = ""
|
|
166 for tag in read_tags:
|
|
167 if tag[0] == "MD":
|
|
168 md_tag = tag[1]
|
|
169 pos_modifier, readlen_modifier, mismatches = get_mismatch_pos(
|
|
170 md_tag, pos, readlen, master_read_dict, tran, readseq
|
|
171 )
|
|
172 # apply the modifiers
|
|
173 # pos = pos+pos_modifier
|
|
174 # readlen = readlen - readlen_modifier
|
|
175 if readlen in master_read_dict[tran]["ambig"]:
|
|
176 if pos in master_read_dict[tran]["ambig"][readlen]:
|
|
177 master_read_dict[tran]["ambig"][readlen][pos] += 1
|
|
178 else:
|
|
179 master_read_dict[tran]["ambig"][readlen][pos] = 1
|
|
180 else:
|
|
181 master_read_dict[tran]["ambig"][readlen] = {pos: 1}
|
|
182 return ambiguously_mapped_reads
|
|
183
|
|
184
|
|
185 def get_mismatch_pos(md_tag, pos, readlen, master_read_dict, tran, readseq):
|
|
186 nucs = ["A", "T", "G", "C"]
|
|
187 mismatches = {}
|
|
188 total_so_far = 0
|
|
189 prev_char = ""
|
|
190 for char in md_tag:
|
|
191 if char in nucs:
|
|
192 if prev_char != "":
|
|
193 total_so_far += int(prev_char)
|
|
194 prev_char = ""
|
|
195 mismatches[total_so_far + len(mismatches)] = readseq[
|
|
196 total_so_far + len(mismatches)
|
|
197 ]
|
|
198 else:
|
|
199 if char != "^" and char != "N":
|
|
200 if prev_char == "":
|
|
201 prev_char = char
|
|
202 else:
|
|
203 total_so_far += int(prev_char + char)
|
|
204 prev_char = ""
|
|
205 readlen_modifier = 0
|
|
206 pos_modifier = 0
|
|
207 five_ok = False
|
|
208 three_ok = False
|
|
209 while five_ok == False:
|
|
210 for i in range(0, readlen):
|
|
211 if i in mismatches:
|
|
212 pos_modifier += 1
|
|
213 readlen_modifier += 1
|
|
214 else:
|
|
215 five_ok = True
|
|
216 break
|
|
217 five_ok = True
|
|
218
|
|
219 while three_ok == False:
|
|
220 for i in range(readlen - 1, 0, -1):
|
|
221 if i in mismatches:
|
|
222 readlen_modifier += 1
|
|
223 else:
|
|
224 three_ok = True
|
|
225 break
|
|
226 three_ok = True
|
|
227
|
|
228 return (pos_modifier, readlen_modifier, mismatches)
|
|
229
|
|
230
|
|
231 def process_bam(bam_filepath, transcriptome_info_dict_path, outputfile):
|
|
232 desc = "NULL"
|
|
233 start_time = time.time()
|
|
234 study_dict = {}
|
|
235 nuc_count_dict = {"mapped": {}, "unmapped": {}}
|
|
236 dinuc_count_dict = {}
|
|
237 threeprime_nuc_count_dict = {"mapped": {}, "unmapped": {}}
|
|
238 read_length_dict = {}
|
|
239 unambig_read_length_dict = {}
|
|
240 unmapped_dict = {}
|
|
241 master_dict = {
|
|
242 "unambiguous_non_coding_count": 0,
|
|
243 "unambiguous_coding_count": 0,
|
|
244 "current_dir": os.getcwd(),
|
|
245 }
|
|
246
|
|
247 transcriptome_info_dict = {}
|
|
248 connection = sqlite3.connect(transcriptome_info_dict_path)
|
|
249 cursor = connection.cursor()
|
|
250 cursor.execute(
|
|
251 "SELECT transcript,cds_start,cds_stop,length,strand,chrom,tran_type from transcripts;"
|
|
252 )
|
|
253 result = cursor.fetchall()
|
|
254 for row in result:
|
|
255 transcriptome_info_dict[str(row[0])] = {
|
|
256 "cds_start": row[1],
|
|
257 "cds_stop": row[2],
|
|
258 "length": row[3],
|
|
259 "strand": row[4],
|
|
260 "chrom": row[5],
|
|
261 "exons": [],
|
|
262 "tran_type": row[6],
|
|
263 }
|
|
264 # print list(transcriptome_info_dict)[:10]
|
|
265
|
|
266 cursor.execute("SELECT * from exons;")
|
|
267 result = cursor.fetchall()
|
|
268 for row in result:
|
|
269 transcriptome_info_dict[str(row[0])]["exons"].append((row[1], row[2]))
|
|
270
|
|
271 # it might be the case that there are no multimappers, so set this to 0 first to avoid an error, it will be overwritten later if there is multimappers
|
|
272 multimappers = 0
|
|
273 unmapped_reads = 0
|
|
274 unambiguous_coding_count = 0
|
|
275 unambiguous_non_coding_count = 0
|
|
276 trip_periodicity_reads = 0
|
|
277
|
|
278 final_offsets = {
|
|
279 "fiveprime": {"offsets": {}, "read_scores": {}},
|
|
280 "threeprime": {"offsets": {}, "read_scores": {}},
|
|
281 }
|
|
282 master_read_dict = {}
|
|
283 prev_seq = ""
|
|
284 process_chunk = {"read_name": [["placeholder_tran", "1", "28"]]}
|
|
285 mapped_reads = 0
|
|
286 ambiguously_mapped_reads = 0
|
|
287 master_trip_dict = {"fiveprime": {}, "threeprime": {}}
|
|
288 master_offset_dict = {"fiveprime": {}, "threeprime": {}}
|
|
289 master_metagene_stop_dict = {"fiveprime": {}, "threeprime": {}}
|
|
290
|
|
291 infile = pysam.Samfile(bam_filepath, "rb")
|
|
292 header = infile.header["HD"]
|
|
293 unsorted = False
|
|
294 if "SO" in header:
|
|
295 if header["SO"] != "queryname":
|
|
296 unsorted = True
|
|
297 else:
|
|
298 unsorted = True
|
|
299 if unsorted == True:
|
|
300 print(
|
|
301 "ERROR: Bam file appears to be unsorted or not sorted by read name. To sort by read name use the command: samtools sort -n input.bam output.bam"
|
|
302 )
|
|
303 print(header, bam_filepath)
|
|
304 sys.exit()
|
|
305 total_bam_lines = 0
|
|
306 all_ref_ids = infile.references
|
|
307
|
|
308 for read in infile.fetch(until_eof=True):
|
|
309 total_bam_lines += 1
|
|
310 if not read.is_unmapped:
|
|
311 ref = read.reference_id
|
|
312 tran = (all_ref_ids[ref]).split(".")[0]
|
|
313 mapped_reads += 1
|
|
314 if mapped_reads % 1000000 == 0:
|
|
315 print(
|
|
316 "{} reads parsed at {}".format(
|
|
317 mapped_reads, (time.time() - start_time)
|
|
318 )
|
|
319 )
|
|
320 pos = read.reference_start
|
|
321 readname = read.query_name
|
|
322 read_tags = read.tags
|
|
323 if readname == list(process_chunk)[0]:
|
|
324 process_chunk[readname].append([tran, pos, read_tags])
|
|
325 # if the current read is different from previous reads send 'process_chunk' to the 'processor' function, then start 'process_chunk' over using current read
|
|
326 else:
|
|
327 if list(process_chunk)[0] != "read_name":
|
|
328
|
|
329 # At this point we work out readseq, we do this for multiple reasons, firstly so we don't count the sequence from a read multiple times, just because
|
|
330 # it aligns multiple times and secondly we only call read.seq once (read.seq is computationally expensive)
|
|
331 seq = read.seq
|
|
332 readlen = len(seq)
|
|
333
|
|
334 # Note if a read maps ambiguously it will still be counted toward the read length distribution (however it will only be counted once, not each time it maps)
|
|
335 if readlen not in read_length_dict:
|
|
336 read_length_dict[readlen] = 0
|
|
337 read_length_dict[readlen] += 1
|
|
338
|
|
339 if readlen not in nuc_count_dict["mapped"]:
|
|
340 nuc_count_dict["mapped"][readlen] = {}
|
|
341 if readlen not in threeprime_nuc_count_dict["mapped"]:
|
|
342 threeprime_nuc_count_dict["mapped"][readlen] = {}
|
|
343 if readlen not in dinuc_count_dict:
|
|
344 dinuc_count_dict[readlen] = {
|
|
345 "AA": 0,
|
|
346 "TA": 0,
|
|
347 "GA": 0,
|
|
348 "CA": 0,
|
|
349 "AT": 0,
|
|
350 "TT": 0,
|
|
351 "GT": 0,
|
|
352 "CT": 0,
|
|
353 "AG": 0,
|
|
354 "TG": 0,
|
|
355 "GG": 0,
|
|
356 "CG": 0,
|
|
357 "AC": 0,
|
|
358 "TC": 0,
|
|
359 "GC": 0,
|
|
360 "CC": 0,
|
|
361 }
|
|
362
|
|
363 for i in range(0, len(seq)):
|
|
364 if i not in nuc_count_dict["mapped"][readlen]:
|
|
365 nuc_count_dict["mapped"][readlen][i] = {
|
|
366 "A": 0,
|
|
367 "T": 0,
|
|
368 "G": 0,
|
|
369 "C": 0,
|
|
370 "N": 0,
|
|
371 }
|
|
372 nuc_count_dict["mapped"][readlen][i][seq[i]] += 1
|
|
373
|
|
374 for i in range(0, len(seq)):
|
|
375 try:
|
|
376 dinuc_count_dict[readlen][seq[i : i + 2]] += 1
|
|
377 except:
|
|
378 pass
|
|
379
|
|
380 for i in range(len(seq), 0, -1):
|
|
381 dist = i - len(seq)
|
|
382 if dist not in threeprime_nuc_count_dict["mapped"][readlen]:
|
|
383 threeprime_nuc_count_dict["mapped"][readlen][dist] = {
|
|
384 "A": 0,
|
|
385 "T": 0,
|
|
386 "G": 0,
|
|
387 "C": 0,
|
|
388 "N": 0,
|
|
389 }
|
|
390 threeprime_nuc_count_dict["mapped"][readlen][dist][
|
|
391 seq[dist]
|
|
392 ] += 1
|
|
393 ambiguously_mapped_reads += processor(
|
|
394 process_chunk,
|
|
395 master_read_dict,
|
|
396 transcriptome_info_dict,
|
|
397 master_dict,
|
|
398 prev_seq,
|
|
399 unambig_read_length_dict,
|
|
400 )
|
|
401 process_chunk = {readname: [[tran, pos, read_tags]]}
|
|
402 prev_seq = read.seq
|
|
403 else:
|
|
404 unmapped_reads += 1
|
|
405
|
|
406 # Add this unmapped read to unmapped_dict so we can see what the most frequent unmapped read is.
|
|
407 seq = read.seq
|
|
408 readlen = len(seq)
|
|
409 if seq in unmapped_dict:
|
|
410 unmapped_dict[seq] += 1
|
|
411 else:
|
|
412 unmapped_dict[seq] = 1
|
|
413
|
|
414 # Populate the nuc_count_dict with this unmapped read
|
|
415 if readlen not in nuc_count_dict["unmapped"]:
|
|
416 nuc_count_dict["unmapped"][readlen] = {}
|
|
417 for i in range(0, len(seq)):
|
|
418 if i not in nuc_count_dict["unmapped"][readlen]:
|
|
419 nuc_count_dict["unmapped"][readlen][i] = {
|
|
420 "A": 0,
|
|
421 "T": 0,
|
|
422 "G": 0,
|
|
423 "C": 0,
|
|
424 "N": 0,
|
|
425 }
|
|
426 nuc_count_dict["unmapped"][readlen][i][seq[i]] += 1
|
|
427
|
|
428 if readlen not in threeprime_nuc_count_dict["unmapped"]:
|
|
429 threeprime_nuc_count_dict["unmapped"][readlen] = {}
|
|
430
|
|
431 for i in range(len(seq), 0, -1):
|
|
432 dist = i - len(seq)
|
|
433 if dist not in threeprime_nuc_count_dict["unmapped"][readlen]:
|
|
434 threeprime_nuc_count_dict["unmapped"][readlen][dist] = {
|
|
435 "A": 0,
|
|
436 "T": 0,
|
|
437 "G": 0,
|
|
438 "C": 0,
|
|
439 "N": 0,
|
|
440 }
|
|
441 threeprime_nuc_count_dict["unmapped"][readlen][dist][seq[dist]] += 1
|
|
442
|
|
443 # add stats about mapped/unmapped reads to file dict which will be used for the final report
|
|
444 master_dict["total_bam_lines"] = total_bam_lines
|
|
445 master_dict["mapped_reads"] = mapped_reads
|
|
446 master_dict["unmapped_reads"] = unmapped_reads
|
|
447 master_read_dict["unmapped_reads"] = unmapped_reads
|
|
448 master_dict["ambiguously_mapped_reads"] = ambiguously_mapped_reads
|
|
449
|
|
450 if "read_name" in master_read_dict:
|
|
451 del master_read_dict["read_name"]
|
|
452 print("BAM file processed")
|
|
453 print("Creating metagenes, triplet periodicity plots, etc.")
|
|
454 for tran in master_read_dict:
|
|
455 try:
|
|
456 cds_start = transcriptome_info_dict[tran]["cds_start"]
|
|
457 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
|
|
458 except:
|
|
459 continue
|
|
460
|
|
461 tranlen = transcriptome_info_dict[tran]["length"]
|
|
462 # Use this to discard transcripts with no 5' leader or 3' trailer
|
|
463 if (
|
|
464 cds_start > 1
|
|
465 and cds_stop < tranlen
|
|
466 and transcriptome_info_dict[tran]["tran_type"] == 1
|
|
467 ):
|
|
468 for primetype in ["fiveprime", "threeprime"]:
|
|
469 # Create the triplet periodicity and metainfo plots based on both the 5' and 3' ends of reads
|
|
470 for readlength in master_read_dict[tran]["unambig"]:
|
|
471 # print "readlength", readlength
|
|
472 # for each fiveprime postion for this readlength within this transcript
|
|
473 for raw_pos in master_read_dict[tran]["unambig"][readlength]:
|
|
474 # print "raw pos", raw_pos
|
|
475 trip_periodicity_reads += 1
|
|
476 if primetype == "fiveprime":
|
|
477 # get the five prime postion minus the cds start postion
|
|
478 real_pos = raw_pos - cds_start
|
|
479 rel_stop_pos = raw_pos - cds_stop
|
|
480 elif primetype == "threeprime":
|
|
481 real_pos = (raw_pos + readlength) - cds_start
|
|
482 rel_stop_pos = (raw_pos + readlength) - cds_stop
|
|
483 # get the readcount at the raw postion
|
|
484 readcount = master_read_dict[tran]["unambig"][readlength][
|
|
485 raw_pos
|
|
486 ]
|
|
487 # print "readcount", readcount
|
|
488 frame = real_pos % 3
|
|
489 if real_pos >= cds_start and real_pos <= cds_stop:
|
|
490 if readlength in master_trip_dict[primetype]:
|
|
491 master_trip_dict[primetype][readlength][
|
|
492 str(frame)
|
|
493 ] += readcount
|
|
494 else:
|
|
495 master_trip_dict[primetype][readlength] = {
|
|
496 "0": 0.0,
|
|
497 "1": 0.0,
|
|
498 "2": 0.0,
|
|
499 }
|
|
500 master_trip_dict[primetype][readlength][
|
|
501 str(frame)
|
|
502 ] += readcount
|
|
503 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict
|
|
504 if real_pos > (-600) and real_pos < (601):
|
|
505 if readlength in master_offset_dict[primetype]:
|
|
506 if (
|
|
507 real_pos
|
|
508 in master_offset_dict[primetype][readlength]
|
|
509 ):
|
|
510 # print "real pos in offset dict"
|
|
511 master_offset_dict[primetype][readlength][
|
|
512 real_pos
|
|
513 ] += readcount
|
|
514 else:
|
|
515 # print "real pos not in offset dict"
|
|
516 master_offset_dict[primetype][readlength][
|
|
517 real_pos
|
|
518 ] = readcount
|
|
519 else:
|
|
520 # initiliase with zero to avoid missing neighbours below
|
|
521 # print "initialising with zeros"
|
|
522 master_offset_dict[primetype][readlength] = {}
|
|
523 for i in range(-600, 601):
|
|
524 master_offset_dict[primetype][readlength][i] = 0
|
|
525 master_offset_dict[primetype][readlength][
|
|
526 real_pos
|
|
527 ] += readcount
|
|
528
|
|
529 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict
|
|
530 if rel_stop_pos > (-600) and rel_stop_pos < (601):
|
|
531 if readlength in master_metagene_stop_dict[primetype]:
|
|
532 if (
|
|
533 rel_stop_pos
|
|
534 in master_metagene_stop_dict[primetype][readlength]
|
|
535 ):
|
|
536 master_metagene_stop_dict[primetype][readlength][
|
|
537 rel_stop_pos
|
|
538 ] += readcount
|
|
539 else:
|
|
540 master_metagene_stop_dict[primetype][readlength][
|
|
541 rel_stop_pos
|
|
542 ] = readcount
|
|
543 else:
|
|
544 # initiliase with zero to avoid missing neighbours below
|
|
545 master_metagene_stop_dict[primetype][readlength] = {}
|
|
546 for i in range(-600, 601):
|
|
547 master_metagene_stop_dict[primetype][readlength][
|
|
548 i
|
|
549 ] = 0
|
|
550 master_metagene_stop_dict[primetype][readlength][
|
|
551 rel_stop_pos
|
|
552 ] += readcount
|
|
553
|
|
554 # master trip dict is now made up of readlengths with 3 frames and a count associated with each frame
|
|
555 # create a 'score' for each readlength by putting the max frame count over the second highest frame count
|
|
556 for primetype in ["fiveprime", "threeprime"]:
|
|
557 for subreadlength in master_trip_dict[primetype]:
|
|
558 maxcount = 0
|
|
559 secondmaxcount = 0
|
|
560 for frame in master_trip_dict[primetype][subreadlength]:
|
|
561 if master_trip_dict[primetype][subreadlength][frame] > maxcount:
|
|
562 maxcount = master_trip_dict[primetype][subreadlength][frame]
|
|
563 for frame in master_trip_dict[primetype][subreadlength]:
|
|
564 if (
|
|
565 master_trip_dict[primetype][subreadlength][frame] > secondmaxcount
|
|
566 and master_trip_dict[primetype][subreadlength][frame] != maxcount
|
|
567 ):
|
|
568 secondmaxcount = master_trip_dict[primetype][subreadlength][frame]
|
|
569 # a perfect score would be 0 meaning there is only a single peak, the worst score would be 1 meaning two highest peaks are the same height
|
|
570 master_trip_dict[primetype][subreadlength]["score"] = float(
|
|
571 secondmaxcount
|
|
572 ) / float(maxcount)
|
|
573 # This part is to determine what offsets to give each read length
|
|
574 print("Calculating offsets")
|
|
575 for primetype in ["fiveprime", "threeprime"]:
|
|
576 for readlen in master_offset_dict[primetype]:
|
|
577 accepted_len = False
|
|
578 max_relative_pos = 0
|
|
579 max_relative_count = 0
|
|
580 for relative_pos in master_offset_dict[primetype][readlen]:
|
|
581 # This line is to ensure we don't choose an offset greater than the readlength (in cases of a large peak far up/downstream)
|
|
582 if abs(relative_pos) < 10 or abs(relative_pos) > (readlen - 10):
|
|
583 continue
|
|
584 if (
|
|
585 master_offset_dict[primetype][readlen][relative_pos]
|
|
586 > max_relative_count
|
|
587 ):
|
|
588 max_relative_pos = relative_pos
|
|
589 max_relative_count = master_offset_dict[primetype][readlen][
|
|
590 relative_pos
|
|
591 ]
|
|
592 # print "for readlen {} the max_relative pos is {}".format(readlen, max_relative_pos)
|
|
593 if primetype == "fiveprime":
|
|
594 # -3 to get from p-site to a-site, +1 to account for 1 based co-ordinates, resulting in -2 overall
|
|
595 final_offsets[primetype]["offsets"][readlen] = abs(max_relative_pos - 2)
|
|
596 elif primetype == "threeprime":
|
|
597 # +3 to get from p-site to a-site, -1 to account for 1 based co-ordinates, resulting in +2 overall
|
|
598 final_offsets[primetype]["offsets"][readlen] = (
|
|
599 max_relative_pos * (-1)
|
|
600 ) + 2
|
|
601 # If there are no reads in CDS regions for a specific length, it may not be present in master_trip_dict
|
|
602 if readlen in master_trip_dict[primetype]:
|
|
603 final_offsets[primetype]["read_scores"][readlen] = master_trip_dict[
|
|
604 primetype
|
|
605 ][readlen]["score"]
|
|
606 else:
|
|
607 final_offsets[primetype]["read_scores"][readlen] = 0.0
|
|
608 master_read_dict["offsets"] = final_offsets
|
|
609 master_read_dict["trip_periodicity"] = master_trip_dict
|
|
610 master_read_dict["desc"] = "Null"
|
|
611 master_read_dict["mapped_reads"] = mapped_reads
|
|
612 master_read_dict["nuc_counts"] = nuc_count_dict
|
|
613 master_read_dict["dinuc_counts"] = dinuc_count_dict
|
|
614 master_read_dict["threeprime_nuc_counts"] = threeprime_nuc_count_dict
|
|
615 master_read_dict["metagene_counts"] = master_offset_dict
|
|
616 master_read_dict["stop_metagene_counts"] = master_metagene_stop_dict
|
|
617 master_read_dict["read_lengths"] = read_length_dict
|
|
618 master_read_dict["unambig_read_lengths"] = unambig_read_length_dict
|
|
619 master_read_dict["coding_counts"] = master_dict["unambiguous_coding_count"]
|
|
620 master_read_dict["noncoding_counts"] = master_dict["unambiguous_non_coding_count"]
|
|
621 master_read_dict["ambiguous_counts"] = master_dict["ambiguously_mapped_reads"]
|
|
622 master_read_dict["frequent_unmapped_reads"] = (
|
|
623 sorted(unmapped_dict.items(), key=operator.itemgetter(1))
|
|
624 )[-2000:]
|
|
625 master_read_dict["cutadapt_removed"] = 0
|
|
626 master_read_dict["rrna_removed"] = 0
|
|
627 # If no reads are removed by minus m there won't be an entry in the log file, so initiliase with 0 first and change if there is a line
|
|
628 master_read_dict["removed_minus_m"] = 0
|
|
629 master_dict["removed_minus_m"] = 0
|
|
630 # We work out the total counts for 5', cds 3' for differential translation here, would be better to do thisn in processor but need the offsets
|
|
631 master_read_dict["unambiguous_all_totals"] = {}
|
|
632 master_read_dict["unambiguous_fiveprime_totals"] = {}
|
|
633 master_read_dict["unambiguous_cds_totals"] = {}
|
|
634 master_read_dict["unambiguous_threeprime_totals"] = {}
|
|
635
|
|
636 master_read_dict["ambiguous_all_totals"] = {}
|
|
637 master_read_dict["ambiguous_fiveprime_totals"] = {}
|
|
638 master_read_dict["ambiguous_cds_totals"] = {}
|
|
639 master_read_dict["ambiguous_threeprime_totals"] = {}
|
|
640 print("calculating transcript counts")
|
|
641 for tran in master_read_dict:
|
|
642 if tran in transcriptome_info_dict:
|
|
643 five_total = 0
|
|
644 cds_total = 0
|
|
645 three_total = 0
|
|
646
|
|
647 ambig_five_total = 0
|
|
648 ambig_cds_total = 0
|
|
649 ambig_three_total = 0
|
|
650
|
|
651 cds_start = transcriptome_info_dict[tran]["cds_start"]
|
|
652 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
|
|
653 for readlen in master_read_dict[tran]["unambig"]:
|
|
654 if readlen in final_offsets["fiveprime"]["offsets"]:
|
|
655 offset = final_offsets["fiveprime"]["offsets"][readlen]
|
|
656 else:
|
|
657 offset = 15
|
|
658 for pos in master_read_dict[tran]["unambig"][readlen]:
|
|
659 real_pos = pos + offset
|
|
660 if real_pos < cds_start:
|
|
661 five_total += master_read_dict[tran]["unambig"][readlen][pos]
|
|
662 elif real_pos >= cds_start and real_pos <= cds_stop:
|
|
663 cds_total += master_read_dict[tran]["unambig"][readlen][pos]
|
|
664 elif real_pos > cds_stop:
|
|
665 three_total += master_read_dict[tran]["unambig"][readlen][pos]
|
|
666 master_read_dict["unambiguous_all_totals"][tran] = (
|
|
667 five_total + cds_total + three_total
|
|
668 )
|
|
669 master_read_dict["unambiguous_fiveprime_totals"][tran] = five_total
|
|
670 master_read_dict["unambiguous_cds_totals"][tran] = cds_total
|
|
671 master_read_dict["unambiguous_threeprime_totals"][tran] = three_total
|
|
672
|
|
673 for readlen in master_read_dict[tran]["ambig"]:
|
|
674 if readlen in final_offsets["fiveprime"]["offsets"]:
|
|
675 offset = final_offsets["fiveprime"]["offsets"][readlen]
|
|
676 else:
|
|
677 offset = 15
|
|
678 for pos in master_read_dict[tran]["ambig"][readlen]:
|
|
679 real_pos = pos + offset
|
|
680 if real_pos < cds_start:
|
|
681 ambig_five_total += master_read_dict[tran]["ambig"][readlen][
|
|
682 pos
|
|
683 ]
|
|
684 elif real_pos >= cds_start and real_pos <= cds_stop:
|
|
685 ambig_cds_total += master_read_dict[tran]["ambig"][readlen][pos]
|
|
686 elif real_pos > cds_stop:
|
|
687 ambig_three_total += master_read_dict[tran]["ambig"][readlen][
|
|
688 pos
|
|
689 ]
|
|
690
|
|
691 master_read_dict["ambiguous_all_totals"][tran] = (
|
|
692 five_total
|
|
693 + cds_total
|
|
694 + three_total
|
|
695 + ambig_five_total
|
|
696 + ambig_cds_total
|
|
697 + ambig_three_total
|
|
698 )
|
|
699 master_read_dict["ambiguous_fiveprime_totals"][tran] = (
|
|
700 five_total + ambig_five_total
|
|
701 )
|
|
702 master_read_dict["ambiguous_cds_totals"][tran] = cds_total + ambig_cds_total
|
|
703 master_read_dict["ambiguous_threeprime_totals"][tran] = (
|
|
704 three_total + ambig_three_total
|
|
705 )
|
|
706 print("Writing out to sqlite file")
|
|
707 sqlite_db = SqliteDict(outputfile, autocommit=False)
|
|
708 for key in master_read_dict:
|
|
709 sqlite_db[key] = master_read_dict[key]
|
|
710 sqlite_db["description"] = desc
|
|
711 sqlite_db.commit()
|
|
712 sqlite_db.close()
|
|
713
|
|
714
|
|
715 if __name__ == "__main__":
|
|
716 if len(sys.argv) <= 2:
|
|
717 print(
|
|
718 "Usage: python bam_to_sqlite.py <path_to_bam_file> <path_to_organism.sqlite> <file_description (optional)>"
|
|
719 )
|
|
720 sys.exit()
|
|
721 bam_filepath = sys.argv[1]
|
|
722 annotation_sqlite_filepath = sys.argv[2]
|
|
723 # try:
|
|
724 # desc = sys.argv[3]
|
|
725 # except:
|
|
726 # desc = bam_filepath.split("/")[-1]
|
|
727 outputfile = bam_filepath + "v2.sqlite"
|
|
728 process_bam(bam_filepath, annotation_sqlite_filepath, outputfile)
|