Mercurial > repos > jackcurragh > ribogalaxy_bowtie_rrna
comparison trips_bam_to_sqlite/bam_to_sqlite.py @ 30:a511e084e4e7 draft
Uploaded
| author | jackcurragh |
|---|---|
| date | Thu, 03 Nov 2022 13:22:00 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 29:104d6756bcbc | 30:a511e084e4e7 |
|---|---|
| 1 import sys | |
| 2 import pysam | |
| 3 import operator | |
| 4 import os | |
| 5 import time | |
| 6 import sqlite3 | |
| 7 from sqlitedict import SqliteDict | |
| 8 | |
| 9 def tran_to_genome(tran, pos, transcriptome_info_dict): | |
| 10 #print ("tran",list(transcriptome_info_dict)) | |
| 11 traninfo = transcriptome_info_dict[tran] | |
| 12 chrom = traninfo["chrom"] | |
| 13 strand = traninfo["strand"] | |
| 14 exons = sorted(traninfo["exons"]) | |
| 15 #print exons | |
| 16 if strand == "+": | |
| 17 exon_start = 0 | |
| 18 for tup in exons: | |
| 19 exon_start = tup[0] | |
| 20 exonlen = tup[1] - tup[0] | |
| 21 if pos > exonlen: | |
| 22 pos = (pos - exonlen)-1 | |
| 23 else: | |
| 24 break | |
| 25 genomic_pos = (exon_start+pos)-1 | |
| 26 elif strand == "-": | |
| 27 exon_start = 0 | |
| 28 for tup in exons[::-1]: | |
| 29 exon_start = tup[1] | |
| 30 exonlen = tup[1] - tup[0] | |
| 31 if pos > exonlen: | |
| 32 pos = (pos - exonlen)-1 | |
| 33 else: | |
| 34 break | |
| 35 genomic_pos = (exon_start-pos)+1 | |
| 36 return (chrom, genomic_pos) | |
| 37 | |
| 38 | |
| 39 # Takes a dictionary with a readname as key and a list of lists as value, each sub list has consists of two elements a transcript and the position the read aligns to in the transcript | |
| 40 # This function will count the number of genes that the transcripts correspond to and if less than or equal to 3 will add the relevant value to transcript_counts_dict | |
| 41 def processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,readseq, unambig_read_length_dict): | |
| 42 readlen = len(readseq) | |
| 43 ambiguously_mapped_reads = 0 | |
| 44 #get the read name | |
| 45 read = list(process_chunk)[0] | |
| 46 | |
| 47 read_list = process_chunk[read] # a list of lists of all transcripts the read aligns to and the positions | |
| 48 #used to store different genomic poistions | |
| 49 genomic_positions = [] | |
| 50 | |
| 51 #This section is just to get the different genomic positions the read aligns to | |
| 52 | |
| 53 for listname in process_chunk[read]: | |
| 54 | |
| 55 tran = listname[0].replace("-","_").replace("(","").replace(")","") | |
| 56 | |
| 57 pos = int(listname[1]) | |
| 58 genomic_pos = tran_to_genome(tran, pos, transcriptome_info_dict) | |
| 59 #print ("genomic pos",genomic_pos) | |
| 60 if genomic_pos not in genomic_positions: | |
| 61 genomic_positions.append(genomic_pos) | |
| 62 | |
| 63 #If the read maps unambiguously | |
| 64 if len(genomic_positions) == 1: | |
| 65 if readlen not in unambig_read_length_dict: | |
| 66 unambig_read_length_dict[readlen] = 0 | |
| 67 unambig_read_length_dict[readlen] += 1 | |
| 68 #assume this read aligns to a noncoding position, if we find that it does align to a coding region change this to True | |
| 69 coding=False | |
| 70 | |
| 71 # For each transcript this read alings to | |
| 72 for listname in process_chunk[read]: | |
| 73 #get the transcript name | |
| 74 tran = listname[0].replace("-","_").replace("(","").replace(")","") | |
| 75 #If we haven't come across this transcript already then add to master_read_dict | |
| 76 if tran not in master_read_dict: | |
| 77 master_read_dict[tran] = {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}} | |
| 78 #get the raw unedited positon, and read tags | |
| 79 pos = int(listname[1]) | |
| 80 read_tags = listname[2] | |
| 81 #If there is mismatches in this line, then modify the postion and readlen (if mismatches at start or end) and add mismatches to dictionary | |
| 82 nm_tag = 0 | |
| 83 | |
| 84 for tag in read_tags: | |
| 85 if tag[0] == "NM": | |
| 86 nm_tag = int(tag[1]) | |
| 87 if nm_tag > 0: | |
| 88 md_tag = "" | |
| 89 for tag in read_tags: | |
| 90 if tag[0] == "MD": | |
| 91 md_tag = tag[1] | |
| 92 pos_modifier, readlen_modifier,mismatches = get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq) | |
| 93 # Count the mismatches (we only do this for unambiguous) | |
| 94 for mismatch in mismatches: | |
| 95 #Ignore mismatches appearing in the first position (due to non templated addition) | |
| 96 if mismatch != 0: | |
| 97 char = mismatches[mismatch] | |
| 98 mismatch_pos = pos + mismatch | |
| 99 if mismatch_pos not in master_read_dict[tran]["seq"]: | |
| 100 master_read_dict[tran]["seq"][mismatch_pos] = {} | |
| 101 if char not in master_read_dict[tran]["seq"][mismatch_pos]: | |
| 102 master_read_dict[tran]["seq"][mismatch_pos][char] = 0 | |
| 103 master_read_dict[tran]["seq"][mismatch_pos][char] += 1 | |
| 104 # apply the modifiers | |
| 105 #pos = pos+pos_modifier | |
| 106 #readlen = readlen - readlen_modifier | |
| 107 | |
| 108 | |
| 109 try: | |
| 110 cds_start = transcriptome_info_dict[tran]["cds_start"] | |
| 111 cds_stop = transcriptome_info_dict[tran]["cds_stop"] | |
| 112 | |
| 113 if pos >= cds_start and pos <= cds_stop: | |
| 114 coding=True | |
| 115 except: | |
| 116 pass | |
| 117 | |
| 118 | |
| 119 if readlen in master_read_dict[tran]["unambig"]: | |
| 120 if pos in master_read_dict[tran]["unambig"][readlen]: | |
| 121 master_read_dict[tran]["unambig"][readlen][pos] += 1 | |
| 122 else: | |
| 123 master_read_dict[tran]["unambig"][readlen][pos] = 1 | |
| 124 else: | |
| 125 master_read_dict[tran]["unambig"][readlen] = {pos:1} | |
| 126 | |
| 127 if coding == True: | |
| 128 master_dict["unambiguous_coding_count"] += 1 | |
| 129 elif coding == False: | |
| 130 master_dict["unambiguous_non_coding_count"] += 1 | |
| 131 | |
| 132 else: | |
| 133 ambiguously_mapped_reads += 1 | |
| 134 for listname in process_chunk[read]: | |
| 135 tran = listname[0].replace("-","_").replace("(","").replace(")","") | |
| 136 if tran not in master_read_dict: | |
| 137 master_read_dict[tran] = {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}} | |
| 138 pos = int(listname[1]) | |
| 139 read_tags = listname[2] | |
| 140 nm_tag = 0 | |
| 141 for tag in read_tags: | |
| 142 if tag[0] == "NM": | |
| 143 nm_tag = int(tag[1]) | |
| 144 if nm_tag > 0: | |
| 145 md_tag = "" | |
| 146 for tag in read_tags: | |
| 147 if tag[0] == "MD": | |
| 148 md_tag = tag[1] | |
| 149 pos_modifier, readlen_modifier,mismatches = get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq) | |
| 150 # apply the modifiers | |
| 151 #pos = pos+pos_modifier | |
| 152 #readlen = readlen - readlen_modifier | |
| 153 if readlen in master_read_dict[tran]["ambig"]: | |
| 154 if pos in master_read_dict[tran]["ambig"][readlen]: | |
| 155 master_read_dict[tran]["ambig"][readlen][pos] += 1 | |
| 156 else: | |
| 157 master_read_dict[tran]["ambig"][readlen][pos] = 1 | |
| 158 else: | |
| 159 master_read_dict[tran]["ambig"][readlen] = {pos:1} | |
| 160 return ambiguously_mapped_reads | |
| 161 | |
| 162 | |
| 163 def get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq): | |
| 164 nucs = ["A","T","G","C"] | |
| 165 mismatches = {} | |
| 166 total_so_far = 0 | |
| 167 prev_char = "" | |
| 168 for char in md_tag: | |
| 169 if char in nucs: | |
| 170 if prev_char != "": | |
| 171 total_so_far += int(prev_char) | |
| 172 prev_char = "" | |
| 173 mismatches[total_so_far+len(mismatches)] = (readseq[total_so_far+len(mismatches)]) | |
| 174 else: | |
| 175 if char != "^" and char != "N": | |
| 176 if prev_char == "": | |
| 177 prev_char = char | |
| 178 else: | |
| 179 total_so_far += int(prev_char+char) | |
| 180 prev_char = "" | |
| 181 readlen_modifier = 0 | |
| 182 pos_modifier = 0 | |
| 183 five_ok = False | |
| 184 three_ok = False | |
| 185 while five_ok == False: | |
| 186 for i in range(0,readlen): | |
| 187 if i in mismatches: | |
| 188 pos_modifier += 1 | |
| 189 readlen_modifier += 1 | |
| 190 else: | |
| 191 five_ok = True | |
| 192 break | |
| 193 five_ok = True | |
| 194 | |
| 195 | |
| 196 while three_ok == False: | |
| 197 for i in range(readlen-1,0,-1): | |
| 198 if i in mismatches: | |
| 199 readlen_modifier += 1 | |
| 200 else: | |
| 201 three_ok = True | |
| 202 break | |
| 203 three_ok = True | |
| 204 | |
| 205 | |
| 206 return (pos_modifier, readlen_modifier, mismatches) | |
| 207 | |
| 208 | |
| 209 | |
| 210 def process_bam(bam_filepath, transcriptome_info_dict_path,outputfile): | |
| 211 desc = "NULL" | |
| 212 start_time = time.time() | |
| 213 study_dict ={} | |
| 214 nuc_count_dict = {"mapped":{},"unmapped":{}} | |
| 215 dinuc_count_dict = {} | |
| 216 threeprime_nuc_count_dict = {"mapped":{},"unmapped":{}} | |
| 217 read_length_dict = {} | |
| 218 unambig_read_length_dict = {} | |
| 219 unmapped_dict = {} | |
| 220 master_dict = {"unambiguous_non_coding_count":0,"unambiguous_coding_count":0,"current_dir":os.getcwd()} | |
| 221 | |
| 222 transcriptome_info_dict = {} | |
| 223 connection = sqlite3.connect(transcriptome_info_dict_path) | |
| 224 cursor = connection.cursor() | |
| 225 cursor.execute("SELECT transcript,cds_start,cds_stop,length,strand,chrom,tran_type from transcripts;") | |
| 226 result = cursor.fetchall() | |
| 227 for row in result: | |
| 228 transcriptome_info_dict[str(row[0])] = {"cds_start":row[1],"cds_stop":row[2],"length":row[3],"strand":row[4],"chrom":row[5],"exons":[],"tran_type":row[6]} | |
| 229 #print list(transcriptome_info_dict)[:10] | |
| 230 | |
| 231 cursor.execute("SELECT * from exons;") | |
| 232 result = cursor.fetchall() | |
| 233 for row in result: | |
| 234 transcriptome_info_dict[str(row[0])]["exons"].append((row[1],row[2])) | |
| 235 | |
| 236 #it might be the case that there are no multimappers, so set this to 0 first to avoid an error, it will be overwritten later if there is multimappers | |
| 237 multimappers = 0 | |
| 238 unmapped_reads = 0 | |
| 239 unambiguous_coding_count = 0 | |
| 240 unambiguous_non_coding_count = 0 | |
| 241 trip_periodicity_reads = 0 | |
| 242 | |
| 243 final_offsets = {"fiveprime":{"offsets":{}, "read_scores":{}}, "threeprime":{"offsets":{}, "read_scores":{}}} | |
| 244 master_read_dict = {} | |
| 245 prev_seq = "" | |
| 246 process_chunk = {"read_name":[["placeholder_tran","1","28"]]} | |
| 247 mapped_reads = 0 | |
| 248 ambiguously_mapped_reads = 0 | |
| 249 master_trip_dict = {"fiveprime":{}, "threeprime":{}} | |
| 250 master_offset_dict = {"fiveprime":{}, "threeprime":{}} | |
| 251 master_metagene_stop_dict = {"fiveprime":{}, "threeprime":{}} | |
| 252 | |
| 253 | |
| 254 os.system(f'samtools sort -n {bam_filepath} -o {bam_filepath}_n_sorted.bam 2> /dev/null') | |
| 255 | |
| 256 pysam.set_verbosity(0) | |
| 257 infile = pysam.Samfile(f"{bam_filepath}_n_sorted.bam", "rb") | |
| 258 header = infile.header["HD"] | |
| 259 | |
| 260 unsorted = False | |
| 261 if "SO" in header: | |
| 262 print("Sorting order: "+header["SO"]) | |
| 263 if header["SO"] != "queryname": | |
| 264 print("Sorting order is not queryname") | |
| 265 unsorted = True | |
| 266 else: | |
| 267 unsorted = True | |
| 268 if unsorted == True: | |
| 269 print ("ERROR: Bam file appears to be unsorted or not sorted by read name. To sort by read name use the command: samtools sort -n input.bam output.bam") | |
| 270 print (header,bam_filepath) | |
| 271 sys.exit() | |
| 272 total_bam_lines = 0 | |
| 273 all_ref_ids = infile.references | |
| 274 | |
| 275 for read in infile.fetch(until_eof=True): | |
| 276 total_bam_lines += 1 | |
| 277 if not read.is_unmapped: | |
| 278 ref = read.reference_id | |
| 279 tran = (all_ref_ids[ref]).split(".")[0] | |
| 280 mapped_reads += 1 | |
| 281 if mapped_reads%1000000 == 0: | |
| 282 print ("{} reads parsed at {}".format(mapped_reads,(time.time()-start_time))) | |
| 283 pos = read.reference_start | |
| 284 readname = read.query_name | |
| 285 read_tags = read.tags | |
| 286 if readname == list(process_chunk)[0]: | |
| 287 process_chunk[readname].append([tran,pos,read_tags]) | |
| 288 #if the current read is different from previous reads send 'process_chunk' to the 'processor' function, then start 'process_chunk' over using current read | |
| 289 else: | |
| 290 if list(process_chunk)[0] != "read_name": | |
| 291 | |
| 292 #At this point we work out readseq, we do this for multiple reasons, firstly so we don't count the sequence from a read multiple times, just because | |
| 293 # it aligns multiple times and secondly we only call read.seq once (read.seq is computationally expensive) | |
| 294 seq = read.seq | |
| 295 readlen = len(seq) | |
| 296 | |
| 297 # Note if a read maps ambiguously it will still be counted toward the read length distribution (however it will only be counted once, not each time it maps) | |
| 298 if readlen not in read_length_dict: | |
| 299 read_length_dict[readlen] = 0 | |
| 300 read_length_dict[readlen] += 1 | |
| 301 | |
| 302 if readlen not in nuc_count_dict["mapped"]: | |
| 303 nuc_count_dict["mapped"][readlen] = {} | |
| 304 if readlen not in threeprime_nuc_count_dict["mapped"]: | |
| 305 threeprime_nuc_count_dict["mapped"][readlen] = {} | |
| 306 if readlen not in dinuc_count_dict: | |
| 307 dinuc_count_dict[readlen] = {"AA":0, "TA":0, "GA":0, "CA":0, | |
| 308 "AT":0, "TT":0, "GT":0, "CT":0, | |
| 309 "AG":0, "TG":0, "GG":0, "CG":0, | |
| 310 "AC":0, "TC":0, "GC":0, "CC":0} | |
| 311 | |
| 312 for i in range(0,len(seq)): | |
| 313 if i not in nuc_count_dict["mapped"][readlen]: | |
| 314 nuc_count_dict["mapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0} | |
| 315 nuc_count_dict["mapped"][readlen][i][seq[i]] += 1 | |
| 316 | |
| 317 for i in range(0,len(seq)): | |
| 318 try: | |
| 319 dinuc_count_dict[readlen][seq[i:i+2]] += 1 | |
| 320 except: | |
| 321 pass | |
| 322 | |
| 323 for i in range(len(seq),0,-1): | |
| 324 dist = i-len(seq) | |
| 325 if dist not in threeprime_nuc_count_dict["mapped"][readlen]: | |
| 326 threeprime_nuc_count_dict["mapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0} | |
| 327 threeprime_nuc_count_dict["mapped"][readlen][dist][seq[dist]] += 1 | |
| 328 ambiguously_mapped_reads += processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,prev_seq, unambig_read_length_dict) | |
| 329 process_chunk = {readname:[[tran, pos, read_tags]]} | |
| 330 prev_seq = read.seq | |
| 331 else: | |
| 332 unmapped_reads += 1 | |
| 333 | |
| 334 # Add this unmapped read to unmapped_dict so we can see what the most frequent unmapped read is. | |
| 335 seq = read.seq | |
| 336 readlen = len(seq) | |
| 337 if seq in unmapped_dict: | |
| 338 unmapped_dict[seq] += 1 | |
| 339 else: | |
| 340 unmapped_dict[seq] = 1 | |
| 341 | |
| 342 # Populate the nuc_count_dict with this unmapped read | |
| 343 if readlen not in nuc_count_dict["unmapped"]: | |
| 344 nuc_count_dict["unmapped"][readlen] = {} | |
| 345 for i in range(0,len(seq)): | |
| 346 if i not in nuc_count_dict["unmapped"][readlen]: | |
| 347 nuc_count_dict["unmapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0} | |
| 348 nuc_count_dict["unmapped"][readlen][i][seq[i]] += 1 | |
| 349 | |
| 350 if readlen not in threeprime_nuc_count_dict["unmapped"]: | |
| 351 threeprime_nuc_count_dict["unmapped"][readlen] = {} | |
| 352 | |
| 353 for i in range(len(seq),0,-1): | |
| 354 dist = i-len(seq) | |
| 355 if dist not in threeprime_nuc_count_dict["unmapped"][readlen]: | |
| 356 threeprime_nuc_count_dict["unmapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0} | |
| 357 threeprime_nuc_count_dict["unmapped"][readlen][dist][seq[dist]] += 1 | |
| 358 | |
| 359 #add stats about mapped/unmapped reads to file dict which will be used for the final report | |
| 360 master_dict["total_bam_lines"] = total_bam_lines | |
| 361 master_dict["mapped_reads"] = mapped_reads | |
| 362 master_dict["unmapped_reads"] = unmapped_reads | |
| 363 master_dict["ambiguously_mapped_reads"] = ambiguously_mapped_reads | |
| 364 | |
| 365 if "read_name" in master_read_dict: | |
| 366 del master_read_dict["read_name"] | |
| 367 print ("BAM file processed") | |
| 368 print ("Creating metagenes, triplet periodicity plots, etc.") | |
| 369 | |
| 370 for tran in master_read_dict: | |
| 371 try: | |
| 372 cds_start = int(0 if transcriptome_info_dict[tran]["cds_start"] is None else transcriptome_info_dict[tran]["cds_start"]) | |
| 373 cds_stop = int(0 if transcriptome_info_dict[tran]["cds_stop"] is None else transcriptome_info_dict[tran]["cds_stop"]) | |
| 374 # print(tran, type(cds_start)) | |
| 375 except: | |
| 376 print("Exception: ", tran) | |
| 377 continue | |
| 378 | |
| 379 tranlen = transcriptome_info_dict[tran]["length"] | |
| 380 #Use this to discard transcripts with no 5' leader or 3' trailer | |
| 381 if cds_start > 1 and cds_stop < tranlen and transcriptome_info_dict[tran]["tran_type"] == 1: | |
| 382 for primetype in ["fiveprime", "threeprime"]: | |
| 383 # Create the triplet periodicity and metainfo plots based on both the 5' and 3' ends of reads | |
| 384 for readlength in master_read_dict[tran]["unambig"]: | |
| 385 #print "readlength", readlength | |
| 386 # for each fiveprime postion for this readlength within this transcript | |
| 387 for raw_pos in master_read_dict[tran]["unambig"][readlength]: | |
| 388 #print "raw pos", raw_pos | |
| 389 trip_periodicity_reads += 1 | |
| 390 if primetype == "fiveprime": | |
| 391 # get the five prime postion minus the cds start postion | |
| 392 real_pos = raw_pos-cds_start | |
| 393 rel_stop_pos = raw_pos-cds_stop | |
| 394 elif primetype == "threeprime": | |
| 395 real_pos = (raw_pos+readlength)-cds_start | |
| 396 rel_stop_pos = (raw_pos+readlength)-cds_stop | |
| 397 #get the readcount at the raw postion | |
| 398 readcount = master_read_dict[tran]["unambig"][readlength][raw_pos] | |
| 399 #print "readcount", readcount | |
| 400 frame = (real_pos%3) | |
| 401 if real_pos >= cds_start and real_pos <= cds_stop: | |
| 402 if readlength in master_trip_dict[primetype]: | |
| 403 master_trip_dict[primetype][readlength][str(frame)] += readcount | |
| 404 else: | |
| 405 master_trip_dict[primetype][readlength]= {"0":0.0,"1":0.0,"2":0.0} | |
| 406 master_trip_dict[primetype][readlength][str(frame)] += readcount | |
| 407 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict | |
| 408 if real_pos > (-600) and real_pos < (601): | |
| 409 if readlength in master_offset_dict[primetype]: | |
| 410 if real_pos in master_offset_dict[primetype][readlength]: | |
| 411 #print "real pos in offset dict" | |
| 412 master_offset_dict[primetype][readlength][real_pos] += readcount | |
| 413 else: | |
| 414 #print "real pos not in offset dict" | |
| 415 master_offset_dict[primetype][readlength][real_pos] = readcount | |
| 416 else: | |
| 417 #initiliase with zero to avoid missing neighbours below | |
| 418 #print "initialising with zeros" | |
| 419 master_offset_dict[primetype][readlength]= {} | |
| 420 for i in range(-600,601): | |
| 421 master_offset_dict[primetype][readlength][i] = 0 | |
| 422 master_offset_dict[primetype][readlength][real_pos] += readcount | |
| 423 | |
| 424 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict | |
| 425 if rel_stop_pos > (-600) and rel_stop_pos < (601): | |
| 426 if readlength in master_metagene_stop_dict[primetype]: | |
| 427 if rel_stop_pos in master_metagene_stop_dict[primetype][readlength]: | |
| 428 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount | |
| 429 else: | |
| 430 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] = readcount | |
| 431 else: | |
| 432 #initiliase with zero to avoid missing neighbours below | |
| 433 master_metagene_stop_dict[primetype][readlength] = {} | |
| 434 for i in range(-600,601): | |
| 435 master_metagene_stop_dict[primetype][readlength][i] = 0 | |
| 436 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount | |
| 437 | |
| 438 # master trip dict is now made up of readlengths with 3 frames and a count associated with each frame | |
| 439 # create a 'score' for each readlength by putting the max frame count over the second highest frame count | |
| 440 for primetype in ["fiveprime", "threeprime"]: | |
| 441 for subreadlength in master_trip_dict[primetype]: | |
| 442 maxcount = 0 | |
| 443 secondmaxcount = 0 | |
| 444 for frame in master_trip_dict[primetype][subreadlength]: | |
| 445 if master_trip_dict[primetype][subreadlength][frame] > maxcount: | |
| 446 maxcount = master_trip_dict[primetype][subreadlength][frame] | |
| 447 for frame in master_trip_dict[primetype][subreadlength]: | |
| 448 if master_trip_dict[primetype][subreadlength][frame] > secondmaxcount and master_trip_dict[primetype][subreadlength][frame] != maxcount: | |
| 449 secondmaxcount = master_trip_dict[primetype][subreadlength][frame] | |
| 450 # a perfect score would be 0 meaning there is only a single peak, the worst score would be 1 meaning two highest peaks are the same height | |
| 451 master_trip_dict[primetype][subreadlength]["score"] = float(secondmaxcount)/float(maxcount) | |
| 452 #This part is to determine what offsets to give each read length | |
| 453 print ("Calculating offsets") | |
| 454 for primetype in ["fiveprime", "threeprime"]: | |
| 455 for readlen in master_offset_dict[primetype]: | |
| 456 accepted_len = False | |
| 457 max_relative_pos = 0 | |
| 458 max_relative_count = 0 | |
| 459 for relative_pos in master_offset_dict[primetype][readlen]: | |
| 460 # This line is to ensure we don't choose an offset greater than the readlength (in cases of a large peak far up/downstream) | |
| 461 if abs(relative_pos) < 10 or abs(relative_pos) > (readlen-10): | |
| 462 continue | |
| 463 if master_offset_dict[primetype][readlen][relative_pos] > max_relative_count: | |
| 464 max_relative_pos = relative_pos | |
| 465 max_relative_count = master_offset_dict[primetype][readlen][relative_pos] | |
| 466 #print "for readlen {} the max_relative pos is {}".format(readlen, max_relative_pos) | |
| 467 if primetype == "fiveprime": | |
| 468 # -3 to get from p-site to a-site, +1 to account for 1 based co-ordinates, resulting in -2 overall | |
| 469 final_offsets[primetype]["offsets"][readlen] = abs(max_relative_pos-2) | |
| 470 elif primetype == "threeprime": | |
| 471 # +3 to get from p-site to a-site, -1 to account for 1 based co-ordinates, resulting in +2 overall | |
| 472 final_offsets[primetype]["offsets"][readlen] = (max_relative_pos*(-1))+2 | |
| 473 #If there are no reads in CDS regions for a specific length, it may not be present in master_trip_dict | |
| 474 if readlen in master_trip_dict[primetype]: | |
| 475 final_offsets[primetype]["read_scores"][readlen] = master_trip_dict[primetype][readlen]["score"] | |
| 476 else: | |
| 477 final_offsets[primetype]["read_scores"][readlen] = 0.0 | |
| 478 | |
| 479 | |
| 480 master_read_dict["unmapped_reads"] = unmapped_reads | |
| 481 master_read_dict["offsets"] = final_offsets | |
| 482 master_read_dict["trip_periodicity"] = master_trip_dict | |
| 483 master_read_dict["desc"] = "Null" | |
| 484 master_read_dict["mapped_reads"] = mapped_reads | |
| 485 master_read_dict["nuc_counts"] = nuc_count_dict | |
| 486 master_read_dict["dinuc_counts"] = dinuc_count_dict | |
| 487 master_read_dict["threeprime_nuc_counts"] = threeprime_nuc_count_dict | |
| 488 master_read_dict["metagene_counts"] = master_offset_dict | |
| 489 master_read_dict["stop_metagene_counts"] = master_metagene_stop_dict | |
| 490 master_read_dict["read_lengths"] = read_length_dict | |
| 491 master_read_dict["unambig_read_lengths"] = unambig_read_length_dict | |
| 492 master_read_dict["coding_counts"] = master_dict["unambiguous_coding_count"] | |
| 493 master_read_dict["noncoding_counts"] = master_dict["unambiguous_non_coding_count"] | |
| 494 master_read_dict["ambiguous_counts"] = master_dict["ambiguously_mapped_reads"] | |
| 495 master_read_dict["frequent_unmapped_reads"] = (sorted(unmapped_dict.items(), key=operator.itemgetter(1)))[-2000:] | |
| 496 master_read_dict["cutadapt_removed"] = 0 | |
| 497 master_read_dict["rrna_removed"] = 0 | |
| 498 #If no reads are removed by minus m there won't be an entry in the log file, so initiliase with 0 first and change if there is a line | |
| 499 master_read_dict["removed_minus_m"] = 0 | |
| 500 master_dict["removed_minus_m"] = 0 | |
| 501 # We work out the total counts for 5', cds 3' for differential translation here, would be better to do thisn in processor but need the offsets | |
| 502 master_read_dict["unambiguous_all_totals"] = {} | |
| 503 master_read_dict["unambiguous_fiveprime_totals"] = {} | |
| 504 master_read_dict["unambiguous_cds_totals"] = {} | |
| 505 master_read_dict["unambiguous_threeprime_totals"] = {} | |
| 506 | |
| 507 master_read_dict["ambiguous_all_totals"] = {} | |
| 508 master_read_dict["ambiguous_fiveprime_totals"] = {} | |
| 509 master_read_dict["ambiguous_cds_totals"] = {} | |
| 510 master_read_dict["ambiguous_threeprime_totals"] = {} | |
| 511 print ("calculating transcript counts") | |
| 512 for tran in master_read_dict: | |
| 513 if tran in transcriptome_info_dict: | |
| 514 five_total = 0 | |
| 515 cds_total = 0 | |
| 516 three_total = 0 | |
| 517 | |
| 518 ambig_five_total = 0 | |
| 519 ambig_cds_total = 0 | |
| 520 ambig_three_total = 0 | |
| 521 | |
| 522 cds_start = transcriptome_info_dict[tran]["cds_start"] | |
| 523 cds_stop = transcriptome_info_dict[tran]["cds_stop"] | |
| 524 | |
| 525 for readlen in master_read_dict[tran]["unambig"]: | |
| 526 if readlen in final_offsets["fiveprime"]["offsets"]: | |
| 527 offset = final_offsets["fiveprime"]["offsets"][readlen] | |
| 528 else: | |
| 529 offset = 15 | |
| 530 for pos in master_read_dict[tran]["unambig"][readlen]: | |
| 531 real_pos = pos+offset | |
| 532 if cds_start is None or cds_stop is None: | |
| 533 three_total += master_read_dict[tran]["unambig"][readlen][pos] | |
| 534 else: | |
| 535 if real_pos <cds_start: | |
| 536 five_total += master_read_dict[tran]["unambig"][readlen][pos] | |
| 537 elif real_pos >=cds_start and real_pos <= cds_stop: | |
| 538 cds_total += master_read_dict[tran]["unambig"][readlen][pos] | |
| 539 elif real_pos > cds_stop: | |
| 540 three_total += master_read_dict[tran]["unambig"][readlen][pos] | |
| 541 master_read_dict["unambiguous_all_totals"][tran] = five_total+cds_total+three_total | |
| 542 master_read_dict["unambiguous_fiveprime_totals"][tran] = five_total | |
| 543 master_read_dict["unambiguous_cds_totals"][tran] = cds_total | |
| 544 master_read_dict["unambiguous_threeprime_totals"][tran] = three_total | |
| 545 | |
| 546 for readlen in master_read_dict[tran]["ambig"]: | |
| 547 if readlen in final_offsets["fiveprime"]["offsets"]: | |
| 548 offset = final_offsets["fiveprime"]["offsets"][readlen] | |
| 549 else: | |
| 550 offset = 15 | |
| 551 for pos in master_read_dict[tran]["ambig"][readlen]: | |
| 552 if cds_start is None or cds_stop is None: | |
| 553 ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos] | |
| 554 else: | |
| 555 real_pos = pos+offset | |
| 556 if real_pos < cds_start: | |
| 557 ambig_five_total += master_read_dict[tran]["ambig"][readlen][pos] | |
| 558 elif real_pos >=cds_start and real_pos <= cds_stop: | |
| 559 ambig_cds_total += master_read_dict[tran]["ambig"][readlen][pos] | |
| 560 elif real_pos > cds_stop: | |
| 561 ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos] | |
| 562 | |
| 563 master_read_dict["ambiguous_all_totals"][tran] = five_total+cds_total+three_total+ambig_five_total+ambig_cds_total+ambig_three_total | |
| 564 master_read_dict["ambiguous_fiveprime_totals"][tran] = five_total+ambig_five_total | |
| 565 master_read_dict["ambiguous_cds_totals"][tran] = cds_total+ambig_cds_total | |
| 566 master_read_dict["ambiguous_threeprime_totals"][tran] = three_total+ambig_three_total | |
| 567 | |
| 568 print ("Writing out to sqlite file") | |
| 569 sqlite_db = SqliteDict(outputfile,autocommit=False) | |
| 570 for key in master_read_dict: | |
| 571 sqlite_db[key] = master_read_dict[key] | |
| 572 sqlite_db["description"] = desc | |
| 573 sqlite_db.commit() | |
| 574 sqlite_db.close() | |
| 575 | |
| 576 | |
| 577 if __name__ == "__main__": | |
| 578 if len(sys.argv) <= 2: | |
| 579 print ("Usage: python bam_to_sqlite.py <path_to_bam_file> <path_to_organism.sqlite> <file_description (optional)>") | |
| 580 sys.exit() | |
| 581 bam_filepath = sys.argv[1] | |
| 582 annotation_sqlite_filepath = sys.argv[2] | |
| 583 desc = sys.argv[3] | |
| 584 outputfile = sys.argv[4] | |
| 585 process_bam(bam_filepath,annotation_sqlite_filepath,outputfile) |
