comparison trips_bam_to_sqlite/bam_to_sqlite.py @ 9:0e88342d5794 draft

Uploaded
author jackcurragh
date Tue, 25 Oct 2022 08:10:50 +0000
parents 2c6f630c732f
children 3d2b1532a1b0
comparison
equal deleted inserted replaced
8:0fcffa7365b9 9:0e88342d5794
4 import os 4 import os
5 import time 5 import time
6 import sqlite3 6 import sqlite3
7 from sqlitedict import SqliteDict 7 from sqlitedict import SqliteDict
8 8
9
10 def tran_to_genome(tran, pos, transcriptome_info_dict): 9 def tran_to_genome(tran, pos, transcriptome_info_dict):
11 # print ("tran",list(transcriptome_info_dict)) 10 #print ("tran",list(transcriptome_info_dict))
12 traninfo = transcriptome_info_dict[tran] 11 traninfo = transcriptome_info_dict[tran]
13 chrom = traninfo["chrom"] 12 chrom = traninfo["chrom"]
14 strand = traninfo["strand"] 13 strand = traninfo["strand"]
15 exons = sorted(traninfo["exons"]) 14 exons = sorted(traninfo["exons"])
16 # print exons 15 #print exons
17 if strand == "+": 16 if strand == "+":
18 exon_start = 0 17 exon_start = 0
19 for tup in exons: 18 for tup in exons:
20 exon_start = tup[0] 19 exon_start = tup[0]
21 exonlen = tup[1] - tup[0] 20 exonlen = tup[1] - tup[0]
22 if pos > exonlen: 21 if pos > exonlen:
23 pos = (pos - exonlen) - 1 22 pos = (pos - exonlen)-1
24 else: 23 else:
25 break 24 break
26 genomic_pos = (exon_start + pos) - 1 25 genomic_pos = (exon_start+pos)-1
27 elif strand == "-": 26 elif strand == "-":
28 exon_start = 0 27 exon_start = 0
29 for tup in exons[::-1]: 28 for tup in exons[::-1]:
30 exon_start = tup[1] 29 exon_start = tup[1]
31 exonlen = tup[1] - tup[0] 30 exonlen = tup[1] - tup[0]
32 if pos > exonlen: 31 if pos > exonlen:
33 pos = (pos - exonlen) - 1 32 pos = (pos - exonlen)-1
34 else: 33 else:
35 break 34 break
36 genomic_pos = (exon_start - pos) + 1 35 genomic_pos = (exon_start-pos)+1
37 return (chrom, genomic_pos) 36 return (chrom, genomic_pos)
38 37
39 38
40 # Takes a dictionary with a readname as key and a list of lists as value, each sub list has consists of two elements a transcript and the position the read aligns to in the transcript 39 # Takes a dictionary with a readname as key and a list of lists as value, each sub list has consists of two elements a transcript and the position the read aligns to in the transcript
41 # This function will count the number of genes that the transcripts correspond to and if less than or equal to 3 will add the relevant value to transcript_counts_dict 40 # This function will count the number of genes that the transcripts correspond to and if less than or equal to 3 will add the relevant value to transcript_counts_dict
42 def processor( 41 def processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,readseq, unambig_read_length_dict):
43 process_chunk, 42 readlen = len(readseq)
44 master_read_dict, 43 ambiguously_mapped_reads = 0
45 transcriptome_info_dict, 44 #get the read name
46 master_dict, 45 read = list(process_chunk)[0]
47 readseq, 46
48 unambig_read_length_dict, 47 read_list = process_chunk[read] # a list of lists of all transcripts the read aligns to and the positions
49 ): 48 #used to store different genomic poistions
50 readlen = len(readseq) 49 genomic_positions = []
51 ambiguously_mapped_reads = 0 50
52 # get the read name 51 #This section is just to get the different genomic positions the read aligns to
53 read = list(process_chunk)[0] 52
54 53 for listname in process_chunk[read]:
55 read_list = process_chunk[ 54
56 read 55 tran = listname[0].replace("-","_").replace("(","").replace(")","")
57 ] # a list of lists of all transcripts the read aligns to and the positions 56
58 # used to store different genomic poistions 57 pos = int(listname[1])
59 genomic_positions = [] 58 genomic_pos = tran_to_genome(tran, pos, transcriptome_info_dict)
60 59 #print ("genomic pos",genomic_pos)
61 # This section is just to get the different genomic positions the read aligns to 60 if genomic_pos not in genomic_positions:
62 61 genomic_positions.append(genomic_pos)
63 for listname in process_chunk[read]: 62
64 63 #If the read maps unambiguously
65 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "") 64 if len(genomic_positions) == 1:
66 65 if readlen not in unambig_read_length_dict:
67 pos = int(listname[1]) 66 unambig_read_length_dict[readlen] = 0
68 genomic_pos = tran_to_genome(tran, pos, transcriptome_info_dict) 67 unambig_read_length_dict[readlen] += 1
69 # print ("genomic pos",genomic_pos) 68 #assume this read aligns to a noncoding position, if we find that it does align to a coding region change this to True
70 if genomic_pos not in genomic_positions: 69 coding=False
71 genomic_positions.append(genomic_pos) 70
72 71 # For each transcript this read alings to
73 # If the read maps unambiguously 72 for listname in process_chunk[read]:
74 if len(genomic_positions) == 1: 73 #get the transcript name
75 if readlen not in unambig_read_length_dict: 74 tran = listname[0].replace("-","_").replace("(","").replace(")","")
76 unambig_read_length_dict[readlen] = 0 75 #If we haven't come across this transcript already then add to master_read_dict
77 unambig_read_length_dict[readlen] += 1 76 if tran not in master_read_dict:
78 # assume this read aligns to a noncoding position, if we find that it does align to a coding region change this to True 77 master_read_dict[tran] = {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}}
79 coding = False 78 #get the raw unedited positon, and read tags
80 79 pos = int(listname[1])
81 # For each transcript this read alings to 80 read_tags = listname[2]
82 for listname in process_chunk[read]: 81 #If there is mismatches in this line, then modify the postion and readlen (if mismatches at start or end) and add mismatches to dictionary
83 # get the transcript name 82 nm_tag = 0
84 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "") 83
85 # If we haven't come across this transcript already then add to master_read_dict 84 for tag in read_tags:
86 if tran not in master_read_dict: 85 if tag[0] == "NM":
87 master_read_dict[tran] = { 86 nm_tag = int(tag[1])
88 "ambig": {}, 87 if nm_tag > 0:
89 "unambig": {}, 88 md_tag = ""
90 "mismatches": {}, 89 for tag in read_tags:
91 "seq": {}, 90 if tag[0] == "MD":
92 } 91 md_tag = tag[1]
93 # get the raw unedited positon, and read tags 92 pos_modifier, readlen_modifier,mismatches = get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq)
94 pos = int(listname[1]) 93 # Count the mismatches (we only do this for unambiguous)
95 read_tags = listname[2] 94 for mismatch in mismatches:
96 # If there is mismatches in this line, then modify the postion and readlen (if mismatches at start or end) and add mismatches to dictionary 95 #Ignore mismatches appearing in the first position (due to non templated addition)
97 nm_tag = 0 96 if mismatch != 0:
98 97 char = mismatches[mismatch]
99 for tag in read_tags: 98 mismatch_pos = pos + mismatch
100 if tag[0] == "NM": 99 if mismatch_pos not in master_read_dict[tran]["seq"]:
101 nm_tag = int(tag[1]) 100 master_read_dict[tran]["seq"][mismatch_pos] = {}
102 if nm_tag > 0: 101 if char not in master_read_dict[tran]["seq"][mismatch_pos]:
103 md_tag = "" 102 master_read_dict[tran]["seq"][mismatch_pos][char] = 0
104 for tag in read_tags: 103 master_read_dict[tran]["seq"][mismatch_pos][char] += 1
105 if tag[0] == "MD": 104 # apply the modifiers
106 md_tag = tag[1] 105 #pos = pos+pos_modifier
107 pos_modifier, readlen_modifier, mismatches = get_mismatch_pos( 106 #readlen = readlen - readlen_modifier
108 md_tag, pos, readlen, master_read_dict, tran, readseq 107
109 ) 108
110 # Count the mismatches (we only do this for unambiguous) 109 try:
111 for mismatch in mismatches: 110 cds_start = transcriptome_info_dict[tran]["cds_start"]
112 # Ignore mismatches appearing in the first position (due to non templated addition) 111 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
113 if mismatch != 0: 112
114 char = mismatches[mismatch] 113 if pos >= cds_start and pos <= cds_stop:
115 mismatch_pos = pos + mismatch 114 coding=True
116 if mismatch_pos not in master_read_dict[tran]["seq"]: 115 except:
117 master_read_dict[tran]["seq"][mismatch_pos] = {} 116 pass
118 if char not in master_read_dict[tran]["seq"][mismatch_pos]: 117
119 master_read_dict[tran]["seq"][mismatch_pos][char] = 0 118
120 master_read_dict[tran]["seq"][mismatch_pos][char] += 1 119 if readlen in master_read_dict[tran]["unambig"]:
121 # apply the modifiers 120 if pos in master_read_dict[tran]["unambig"][readlen]:
122 # pos = pos+pos_modifier 121 master_read_dict[tran]["unambig"][readlen][pos] += 1
123 # readlen = readlen - readlen_modifier 122 else:
124 123 master_read_dict[tran]["unambig"][readlen][pos] = 1
125 try: 124 else:
126 cds_start = transcriptome_info_dict[tran]["cds_start"] 125 master_read_dict[tran]["unambig"][readlen] = {pos:1}
127 cds_stop = transcriptome_info_dict[tran]["cds_stop"] 126
128 127 if coding == True:
129 if pos >= cds_start and pos <= cds_stop: 128 master_dict["unambiguous_coding_count"] += 1
130 coding = True 129 elif coding == False:
131 except: 130 master_dict["unambiguous_non_coding_count"] += 1
132 pass 131
133 132 else:
134 if readlen in master_read_dict[tran]["unambig"]: 133 ambiguously_mapped_reads += 1
135 if pos in master_read_dict[tran]["unambig"][readlen]: 134 for listname in process_chunk[read]:
136 master_read_dict[tran]["unambig"][readlen][pos] += 1 135 tran = listname[0].replace("-","_").replace("(","").replace(")","")
137 else: 136 if tran not in master_read_dict:
138 master_read_dict[tran]["unambig"][readlen][pos] = 1 137 master_read_dict[tran] = {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}}
139 else: 138 pos = int(listname[1])
140 master_read_dict[tran]["unambig"][readlen] = {pos: 1} 139 read_tags = listname[2]
141 140 nm_tag = 0
142 if coding == True: 141 for tag in read_tags:
143 master_dict["unambiguous_coding_count"] += 1 142 if tag[0] == "NM":
144 elif coding == False: 143 nm_tag = int(tag[1])
145 master_dict["unambiguous_non_coding_count"] += 1 144 if nm_tag > 0:
146 145 md_tag = ""
147 else: 146 for tag in read_tags:
148 ambiguously_mapped_reads += 1 147 if tag[0] == "MD":
149 for listname in process_chunk[read]: 148 md_tag = tag[1]
150 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "") 149 pos_modifier, readlen_modifier,mismatches = get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq)
151 if tran not in master_read_dict: 150 # apply the modifiers
152 master_read_dict[tran] = { 151 #pos = pos+pos_modifier
153 "ambig": {}, 152 #readlen = readlen - readlen_modifier
154 "unambig": {}, 153 if readlen in master_read_dict[tran]["ambig"]:
155 "mismatches": {}, 154 if pos in master_read_dict[tran]["ambig"][readlen]:
156 "seq": {}, 155 master_read_dict[tran]["ambig"][readlen][pos] += 1
157 } 156 else:
158 pos = int(listname[1]) 157 master_read_dict[tran]["ambig"][readlen][pos] = 1
159 read_tags = listname[2] 158 else:
160 nm_tag = 0 159 master_read_dict[tran]["ambig"][readlen] = {pos:1}
161 for tag in read_tags: 160 return ambiguously_mapped_reads
162 if tag[0] == "NM": 161
163 nm_tag = int(tag[1]) 162
164 if nm_tag > 0: 163 def get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq):
165 md_tag = "" 164 nucs = ["A","T","G","C"]
166 for tag in read_tags: 165 mismatches = {}
167 if tag[0] == "MD": 166 total_so_far = 0
168 md_tag = tag[1] 167 prev_char = ""
169 pos_modifier, readlen_modifier, mismatches = get_mismatch_pos( 168 for char in md_tag:
170 md_tag, pos, readlen, master_read_dict, tran, readseq 169 if char in nucs:
171 ) 170 if prev_char != "":
172 # apply the modifiers 171 total_so_far += int(prev_char)
173 # pos = pos+pos_modifier 172 prev_char = ""
174 # readlen = readlen - readlen_modifier 173 mismatches[total_so_far+len(mismatches)] = (readseq[total_so_far+len(mismatches)])
175 if readlen in master_read_dict[tran]["ambig"]: 174 else:
176 if pos in master_read_dict[tran]["ambig"][readlen]: 175 if char != "^" and char != "N":
177 master_read_dict[tran]["ambig"][readlen][pos] += 1 176 if prev_char == "":
178 else: 177 prev_char = char
179 master_read_dict[tran]["ambig"][readlen][pos] = 1 178 else:
180 else: 179 total_so_far += int(prev_char+char)
181 master_read_dict[tran]["ambig"][readlen] = {pos: 1} 180 prev_char = ""
182 return ambiguously_mapped_reads 181 readlen_modifier = 0
183 182 pos_modifier = 0
184 183 five_ok = False
185 def get_mismatch_pos(md_tag, pos, readlen, master_read_dict, tran, readseq): 184 three_ok = False
186 nucs = ["A", "T", "G", "C"] 185 while five_ok == False:
187 mismatches = {} 186 for i in range(0,readlen):
188 total_so_far = 0 187 if i in mismatches:
189 prev_char = "" 188 pos_modifier += 1
190 for char in md_tag: 189 readlen_modifier += 1
191 if char in nucs: 190 else:
192 if prev_char != "": 191 five_ok = True
193 total_so_far += int(prev_char) 192 break
194 prev_char = "" 193 five_ok = True
195 mismatches[total_so_far + len(mismatches)] = readseq[ 194
196 total_so_far + len(mismatches) 195
197 ] 196 while three_ok == False:
198 else: 197 for i in range(readlen-1,0,-1):
199 if char != "^" and char != "N": 198 if i in mismatches:
200 if prev_char == "": 199 readlen_modifier += 1
201 prev_char = char 200 else:
202 else: 201 three_ok = True
203 total_so_far += int(prev_char + char) 202 break
204 prev_char = "" 203 three_ok = True
205 readlen_modifier = 0 204
206 pos_modifier = 0 205
207 five_ok = False 206 return (pos_modifier, readlen_modifier, mismatches)
208 three_ok = False 207
209 while five_ok == False: 208
210 for i in range(0, readlen): 209
211 if i in mismatches: 210 def process_bam(bam_filepath, transcriptome_info_dict_path,outputfile):
212 pos_modifier += 1 211 desc = "NULL"
213 readlen_modifier += 1 212 start_time = time.time()
214 else: 213 study_dict ={}
215 five_ok = True 214 nuc_count_dict = {"mapped":{},"unmapped":{}}
216 break 215 dinuc_count_dict = {}
217 five_ok = True 216 threeprime_nuc_count_dict = {"mapped":{},"unmapped":{}}
218 217 read_length_dict = {}
219 while three_ok == False: 218 unambig_read_length_dict = {}
220 for i in range(readlen - 1, 0, -1): 219 unmapped_dict = {}
221 if i in mismatches: 220 master_dict = {"unambiguous_non_coding_count":0,"unambiguous_coding_count":0,"current_dir":os.getcwd()}
222 readlen_modifier += 1 221
223 else: 222 transcriptome_info_dict = {}
224 three_ok = True 223 connection = sqlite3.connect(transcriptome_info_dict_path)
225 break 224 cursor = connection.cursor()
226 three_ok = True 225 cursor.execute("SELECT transcript,cds_start,cds_stop,length,strand,chrom,tran_type from transcripts;")
227 226 result = cursor.fetchall()
228 return (pos_modifier, readlen_modifier, mismatches) 227 for row in result:
229 228 transcriptome_info_dict[str(row[0])] = {"cds_start":row[1],"cds_stop":row[2],"length":row[3],"strand":row[4],"chrom":row[5],"exons":[],"tran_type":row[6]}
230 229 #print list(transcriptome_info_dict)[:10]
231 def process_bam(bam_filepath, transcriptome_info_dict_path, outputfile, desc): 230
232 desc = desc 231 cursor.execute("SELECT * from exons;")
233 start_time = time.time() 232 result = cursor.fetchall()
234 study_dict = {} 233 for row in result:
235 nuc_count_dict = {"mapped": {}, "unmapped": {}} 234 transcriptome_info_dict[str(row[0])]["exons"].append((row[1],row[2]))
236 dinuc_count_dict = {} 235
237 threeprime_nuc_count_dict = {"mapped": {}, "unmapped": {}} 236 #it might be the case that there are no multimappers, so set this to 0 first to avoid an error, it will be overwritten later if there is multimappers
238 read_length_dict = {} 237 multimappers = 0
239 unambig_read_length_dict = {} 238 unmapped_reads = 0
240 unmapped_dict = {} 239 unambiguous_coding_count = 0
241 master_dict = { 240 unambiguous_non_coding_count = 0
242 "unambiguous_non_coding_count": 0, 241 trip_periodicity_reads = 0
243 "unambiguous_coding_count": 0, 242
244 "current_dir": os.getcwd(), 243 final_offsets = {"fiveprime":{"offsets":{}, "read_scores":{}}, "threeprime":{"offsets":{}, "read_scores":{}}}
245 } 244 master_read_dict = {}
246 245 prev_seq = ""
247 transcriptome_info_dict = {} 246 process_chunk = {"read_name":[["placeholder_tran","1","28"]]}
248 connection = sqlite3.connect(transcriptome_info_dict_path) 247 mapped_reads = 0
249 cursor = connection.cursor() 248 ambiguously_mapped_reads = 0
250 cursor.execute( 249 master_trip_dict = {"fiveprime":{}, "threeprime":{}}
251 "SELECT transcript,cds_start,cds_stop,length,strand,chrom,tran_type from transcripts;" 250 master_offset_dict = {"fiveprime":{}, "threeprime":{}}
252 ) 251 master_metagene_stop_dict = {"fiveprime":{}, "threeprime":{}}
253 result = cursor.fetchall() 252
254 for row in result: 253 pysam.set_verbosity(0)
255 transcriptome_info_dict[str(row[0])] = { 254 infile = pysam.Samfile(bam_filepath, "rb")
256 "cds_start": row[1], 255
257 "cds_stop": row[2], 256 header = infile.header["HD"]
258 "length": row[3], 257 unsorted = False
259 "strand": row[4], 258 if "SO" in header:
260 "chrom": row[5], 259 if header["SO"] != "queryname":
261 "exons": [], 260 unsorted = True
262 "tran_type": row[6], 261 else:
263 } 262 unsorted = True
264 # print list(transcriptome_info_dict)[:10] 263 if unsorted == True:
265 264 print ("ERROR: Bam file appears to be unsorted or not sorted by read name. To sort by read name use the command: samtools sort -n input.bam output.bam")
266 cursor.execute("SELECT * from exons;") 265 print (header,bam_filepath)
267 result = cursor.fetchall() 266 sys.exit()
268 for row in result: 267 total_bam_lines = 0
269 transcriptome_info_dict[str(row[0])]["exons"].append((row[1], row[2])) 268 all_ref_ids = infile.references
270 269
271 # it might be the case that there are no multimappers, so set this to 0 first to avoid an error, it will be overwritten later if there is multimappers 270 for read in infile.fetch(until_eof=True):
272 multimappers = 0 271 total_bam_lines += 1
273 unmapped_reads = 0 272 if not read.is_unmapped:
274 unambiguous_coding_count = 0 273 ref = read.reference_id
275 unambiguous_non_coding_count = 0 274 tran = (all_ref_ids[ref]).split(".")[0]
276 trip_periodicity_reads = 0 275 mapped_reads += 1
277 276 if mapped_reads%1000000 == 0:
278 final_offsets = { 277 print ("{} reads parsed at {}".format(mapped_reads,(time.time()-start_time)))
279 "fiveprime": {"offsets": {}, "read_scores": {}}, 278 pos = read.reference_start
280 "threeprime": {"offsets": {}, "read_scores": {}}, 279 readname = read.query_name
281 } 280 read_tags = read.tags
282 master_read_dict = {} 281 if readname == list(process_chunk)[0]:
283 prev_seq = "" 282 process_chunk[readname].append([tran,pos,read_tags])
284 process_chunk = {"read_name": [["placeholder_tran", "1", "28"]]} 283 #if the current read is different from previous reads send 'process_chunk' to the 'processor' function, then start 'process_chunk' over using current read
285 mapped_reads = 0 284 else:
286 ambiguously_mapped_reads = 0 285 if list(process_chunk)[0] != "read_name":
287 master_trip_dict = {"fiveprime": {}, "threeprime": {}} 286
288 master_offset_dict = {"fiveprime": {}, "threeprime": {}} 287 #At this point we work out readseq, we do this for multiple reasons, firstly so we don't count the sequence from a read multiple times, just because
289 master_metagene_stop_dict = {"fiveprime": {}, "threeprime": {}} 288 # it aligns multiple times and secondly we only call read.seq once (read.seq is computationally expensive)
290 289 seq = read.seq
291 os.system(f'samtools sort -n {bam_filepath} -o {bam_filepath}_n_sorted.bam') 290 readlen = len(seq)
292 pysam.set_verbosity(0) 291
293 infile = pysam.Samfile(f"{bam_filepath}_n_sorted.bam", "rb") 292 # Note if a read maps ambiguously it will still be counted toward the read length distribution (however it will only be counted once, not each time it maps)
294 header = infile.header["HD"] 293 if readlen not in read_length_dict:
295 unsorted = False 294 read_length_dict[readlen] = 0
296 if "SO" in header: 295 read_length_dict[readlen] += 1
297 if header["SO"] != "queryname": 296
298 unsorted = True 297 if readlen not in nuc_count_dict["mapped"]:
299 else: 298 nuc_count_dict["mapped"][readlen] = {}
300 unsorted = True 299 if readlen not in threeprime_nuc_count_dict["mapped"]:
301 if unsorted == True: 300 threeprime_nuc_count_dict["mapped"][readlen] = {}
302 print( 301 if readlen not in dinuc_count_dict:
303 "ERROR: Bam file appears to be unsorted or not sorted by read name. To sort by read name use the command: samtools sort -n input.bam output.bam" 302 dinuc_count_dict[readlen] = {"AA":0, "TA":0, "GA":0, "CA":0,
304 ) 303 "AT":0, "TT":0, "GT":0, "CT":0,
305 print(header, bam_filepath) 304 "AG":0, "TG":0, "GG":0, "CG":0,
306 sys.exit() 305 "AC":0, "TC":0, "GC":0, "CC":0}
307 total_bam_lines = 0 306
308 all_ref_ids = infile.references 307 for i in range(0,len(seq)):
309 308 if i not in nuc_count_dict["mapped"][readlen]:
310 for read in infile.fetch(until_eof=True): 309 nuc_count_dict["mapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0}
311 total_bam_lines += 1 310 nuc_count_dict["mapped"][readlen][i][seq[i]] += 1
312 if not read.is_unmapped: 311
313 ref = read.reference_id 312 for i in range(0,len(seq)):
314 tran = (all_ref_ids[ref]).split(".")[0] 313 try:
315 mapped_reads += 1 314 dinuc_count_dict[readlen][seq[i:i+2]] += 1
316 if mapped_reads % 1000000 == 0: 315 except:
317 print( 316 pass
318 "{} reads parsed at {}".format( 317
319 mapped_reads, (time.time() - start_time) 318 for i in range(len(seq),0,-1):
320 ) 319 dist = i-len(seq)
321 ) 320 if dist not in threeprime_nuc_count_dict["mapped"][readlen]:
322 pos = read.reference_start 321 threeprime_nuc_count_dict["mapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0}
323 readname = read.query_name 322 threeprime_nuc_count_dict["mapped"][readlen][dist][seq[dist]] += 1
324 read_tags = read.tags 323 ambiguously_mapped_reads += processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,prev_seq, unambig_read_length_dict)
325 if readname == list(process_chunk)[0]: 324 process_chunk = {readname:[[tran, pos, read_tags]]}
326 process_chunk[readname].append([tran, pos, read_tags]) 325 prev_seq = read.seq
327 # if the current read is different from previous reads send 'process_chunk' to the 'processor' function, then start 'process_chunk' over using current read 326 else:
328 else: 327 unmapped_reads += 1
329 if list(process_chunk)[0] != "read_name": 328
330 329 # Add this unmapped read to unmapped_dict so we can see what the most frequent unmapped read is.
331 # At this point we work out readseq, we do this for multiple reasons, firstly so we don't count the sequence from a read multiple times, just because 330 seq = read.seq
332 # it aligns multiple times and secondly we only call read.seq once (read.seq is computationally expensive) 331 readlen = len(seq)
333 seq = read.seq 332 if seq in unmapped_dict:
334 readlen = len(seq) 333 unmapped_dict[seq] += 1
335 334 else:
336 # Note if a read maps ambiguously it will still be counted toward the read length distribution (however it will only be counted once, not each time it maps) 335 unmapped_dict[seq] = 1
337 if readlen not in read_length_dict: 336
338 read_length_dict[readlen] = 0 337 # Populate the nuc_count_dict with this unmapped read
339 read_length_dict[readlen] += 1 338 if readlen not in nuc_count_dict["unmapped"]:
340 339 nuc_count_dict["unmapped"][readlen] = {}
341 if readlen not in nuc_count_dict["mapped"]: 340 for i in range(0,len(seq)):
342 nuc_count_dict["mapped"][readlen] = {} 341 if i not in nuc_count_dict["unmapped"][readlen]:
343 if readlen not in threeprime_nuc_count_dict["mapped"]: 342 nuc_count_dict["unmapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0}
344 threeprime_nuc_count_dict["mapped"][readlen] = {} 343 nuc_count_dict["unmapped"][readlen][i][seq[i]] += 1
345 if readlen not in dinuc_count_dict: 344
346 dinuc_count_dict[readlen] = { 345 if readlen not in threeprime_nuc_count_dict["unmapped"]:
347 "AA": 0, 346 threeprime_nuc_count_dict["unmapped"][readlen] = {}
348 "TA": 0, 347
349 "GA": 0, 348 for i in range(len(seq),0,-1):
350 "CA": 0, 349 dist = i-len(seq)
351 "AT": 0, 350 if dist not in threeprime_nuc_count_dict["unmapped"][readlen]:
352 "TT": 0, 351 threeprime_nuc_count_dict["unmapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0}
353 "GT": 0, 352 threeprime_nuc_count_dict["unmapped"][readlen][dist][seq[dist]] += 1
354 "CT": 0, 353
355 "AG": 0, 354 #add stats about mapped/unmapped reads to file dict which will be used for the final report
356 "TG": 0, 355 master_dict["total_bam_lines"] = total_bam_lines
357 "GG": 0, 356 master_dict["mapped_reads"] = mapped_reads
358 "CG": 0, 357 master_dict["unmapped_reads"] = unmapped_reads
359 "AC": 0, 358 master_dict["ambiguously_mapped_reads"] = ambiguously_mapped_reads
360 "TC": 0, 359
361 "GC": 0, 360 if "read_name" in master_read_dict:
362 "CC": 0, 361 del master_read_dict["read_name"]
363 } 362 print ("BAM file processed")
364 363 print ("Creating metagenes, triplet periodicity plots, etc.")
365 for i in range(0, len(seq)): 364
366 if i not in nuc_count_dict["mapped"][readlen]: 365 for tran in master_read_dict:
367 nuc_count_dict["mapped"][readlen][i] = { 366 try:
368 "A": 0, 367 cds_start = int(0 if transcriptome_info_dict[tran]["cds_start"] is None else transcriptome_info_dict[tran]["cds_start"])
369 "T": 0, 368 cds_stop = int(0 if transcriptome_info_dict[tran]["cds_stop"] is None else transcriptome_info_dict[tran]["cds_stop"])
370 "G": 0, 369 # print(tran, type(cds_start))
371 "C": 0, 370 except:
372 "N": 0, 371 print("Exception: ", tran)
373 } 372 continue
374 nuc_count_dict["mapped"][readlen][i][seq[i]] += 1 373
375 374 tranlen = transcriptome_info_dict[tran]["length"]
376 for i in range(0, len(seq)): 375 #Use this to discard transcripts with no 5' leader or 3' trailer
377 try: 376 if cds_start > 1 and cds_stop < tranlen and transcriptome_info_dict[tran]["tran_type"] == 1:
378 dinuc_count_dict[readlen][seq[i : i + 2]] += 1 377 for primetype in ["fiveprime", "threeprime"]:
379 except: 378 # Create the triplet periodicity and metainfo plots based on both the 5' and 3' ends of reads
380 pass 379 for readlength in master_read_dict[tran]["unambig"]:
381 380 #print "readlength", readlength
382 for i in range(len(seq), 0, -1): 381 # for each fiveprime postion for this readlength within this transcript
383 dist = i - len(seq) 382 for raw_pos in master_read_dict[tran]["unambig"][readlength]:
384 if dist not in threeprime_nuc_count_dict["mapped"][readlen]: 383 #print "raw pos", raw_pos
385 threeprime_nuc_count_dict["mapped"][readlen][dist] = { 384 trip_periodicity_reads += 1
386 "A": 0, 385 if primetype == "fiveprime":
387 "T": 0, 386 # get the five prime postion minus the cds start postion
388 "G": 0, 387 real_pos = raw_pos-cds_start
389 "C": 0, 388 rel_stop_pos = raw_pos-cds_stop
390 "N": 0, 389 elif primetype == "threeprime":
391 } 390 real_pos = (raw_pos+readlength)-cds_start
392 threeprime_nuc_count_dict["mapped"][readlen][dist][ 391 rel_stop_pos = (raw_pos+readlength)-cds_stop
393 seq[dist] 392 #get the readcount at the raw postion
394 ] += 1 393 readcount = master_read_dict[tran]["unambig"][readlength][raw_pos]
395 ambiguously_mapped_reads += processor( 394 #print "readcount", readcount
396 process_chunk, 395 frame = (real_pos%3)
397 master_read_dict, 396 if real_pos >= cds_start and real_pos <= cds_stop:
398 transcriptome_info_dict, 397 if readlength in master_trip_dict[primetype]:
399 master_dict, 398 master_trip_dict[primetype][readlength][str(frame)] += readcount
400 prev_seq, 399 else:
401 unambig_read_length_dict, 400 master_trip_dict[primetype][readlength]= {"0":0.0,"1":0.0,"2":0.0}
402 ) 401 master_trip_dict[primetype][readlength][str(frame)] += readcount
403 process_chunk = {readname: [[tran, pos, read_tags]]} 402 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict
404 prev_seq = read.seq 403 if real_pos > (-600) and real_pos < (601):
405 else: 404 if readlength in master_offset_dict[primetype]:
406 unmapped_reads += 1 405 if real_pos in master_offset_dict[primetype][readlength]:
407 406 #print "real pos in offset dict"
408 # Add this unmapped read to unmapped_dict so we can see what the most frequent unmapped read is. 407 master_offset_dict[primetype][readlength][real_pos] += readcount
409 seq = read.seq 408 else:
410 readlen = len(seq) 409 #print "real pos not in offset dict"
411 if seq in unmapped_dict: 410 master_offset_dict[primetype][readlength][real_pos] = readcount
412 unmapped_dict[seq] += 1 411 else:
413 else: 412 #initiliase with zero to avoid missing neighbours below
414 unmapped_dict[seq] = 1 413 #print "initialising with zeros"
415 414 master_offset_dict[primetype][readlength]= {}
416 # Populate the nuc_count_dict with this unmapped read 415 for i in range(-600,601):
417 if readlen not in nuc_count_dict["unmapped"]: 416 master_offset_dict[primetype][readlength][i] = 0
418 nuc_count_dict["unmapped"][readlen] = {} 417 master_offset_dict[primetype][readlength][real_pos] += readcount
419 for i in range(0, len(seq)): 418
420 if i not in nuc_count_dict["unmapped"][readlen]: 419 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict
421 nuc_count_dict["unmapped"][readlen][i] = { 420 if rel_stop_pos > (-600) and rel_stop_pos < (601):
422 "A": 0, 421 if readlength in master_metagene_stop_dict[primetype]:
423 "T": 0, 422 if rel_stop_pos in master_metagene_stop_dict[primetype][readlength]:
424 "G": 0, 423 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount
425 "C": 0, 424 else:
426 "N": 0, 425 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] = readcount
427 } 426 else:
428 nuc_count_dict["unmapped"][readlen][i][seq[i]] += 1 427 #initiliase with zero to avoid missing neighbours below
429 428 master_metagene_stop_dict[primetype][readlength] = {}
430 if readlen not in threeprime_nuc_count_dict["unmapped"]: 429 for i in range(-600,601):
431 threeprime_nuc_count_dict["unmapped"][readlen] = {} 430 master_metagene_stop_dict[primetype][readlength][i] = 0
432 431 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount
433 for i in range(len(seq), 0, -1): 432
434 dist = i - len(seq) 433 # master trip dict is now made up of readlengths with 3 frames and a count associated with each frame
435 if dist not in threeprime_nuc_count_dict["unmapped"][readlen]: 434 # create a 'score' for each readlength by putting the max frame count over the second highest frame count
436 threeprime_nuc_count_dict["unmapped"][readlen][dist] = { 435 for primetype in ["fiveprime", "threeprime"]:
437 "A": 0, 436 for subreadlength in master_trip_dict[primetype]:
438 "T": 0, 437 maxcount = 0
439 "G": 0, 438 secondmaxcount = 0
440 "C": 0, 439 for frame in master_trip_dict[primetype][subreadlength]:
441 "N": 0, 440 if master_trip_dict[primetype][subreadlength][frame] > maxcount:
442 } 441 maxcount = master_trip_dict[primetype][subreadlength][frame]
443 threeprime_nuc_count_dict["unmapped"][readlen][dist][seq[dist]] += 1 442 for frame in master_trip_dict[primetype][subreadlength]:
444 443 if master_trip_dict[primetype][subreadlength][frame] > secondmaxcount and master_trip_dict[primetype][subreadlength][frame] != maxcount:
445 # add stats about mapped/unmapped reads to file dict which will be used for the final report 444 secondmaxcount = master_trip_dict[primetype][subreadlength][frame]
446 master_dict["total_bam_lines"] = total_bam_lines 445 # a perfect score would be 0 meaning there is only a single peak, the worst score would be 1 meaning two highest peaks are the same height
447 master_dict["mapped_reads"] = mapped_reads 446 master_trip_dict[primetype][subreadlength]["score"] = float(secondmaxcount)/float(maxcount)
448 master_dict["unmapped_reads"] = unmapped_reads 447 #This part is to determine what offsets to give each read length
449 master_read_dict["unmapped_reads"] = unmapped_reads 448 print ("Calculating offsets")
450 master_dict["ambiguously_mapped_reads"] = ambiguously_mapped_reads 449 for primetype in ["fiveprime", "threeprime"]:
451 450 for readlen in master_offset_dict[primetype]:
452 if "read_name" in master_read_dict: 451 accepted_len = False
453 del master_read_dict["read_name"] 452 max_relative_pos = 0
454 print("BAM file processed") 453 max_relative_count = 0
455 print("Creating metagenes, triplet periodicity plots, etc.") 454 for relative_pos in master_offset_dict[primetype][readlen]:
456 for tran in master_read_dict: 455 # This line is to ensure we don't choose an offset greater than the readlength (in cases of a large peak far up/downstream)
457 try: 456 if abs(relative_pos) < 10 or abs(relative_pos) > (readlen-10):
458 cds_start = transcriptome_info_dict[tran]["cds_start"] 457 continue
459 cds_stop = transcriptome_info_dict[tran]["cds_stop"] 458 if master_offset_dict[primetype][readlen][relative_pos] > max_relative_count:
460 except: 459 max_relative_pos = relative_pos
461 continue 460 max_relative_count = master_offset_dict[primetype][readlen][relative_pos]
462 461 #print "for readlen {} the max_relative pos is {}".format(readlen, max_relative_pos)
463 tranlen = transcriptome_info_dict[tran]["length"] 462 if primetype == "fiveprime":
464 # Use this to discard transcripts with no 5' leader or 3' trailer 463 # -3 to get from p-site to a-site, +1 to account for 1 based co-ordinates, resulting in -2 overall
465 if ( 464 final_offsets[primetype]["offsets"][readlen] = abs(max_relative_pos-2)
466 cds_start > 1 465 elif primetype == "threeprime":
467 and cds_stop < tranlen 466 # +3 to get from p-site to a-site, -1 to account for 1 based co-ordinates, resulting in +2 overall
468 and transcriptome_info_dict[tran]["tran_type"] == 1 467 final_offsets[primetype]["offsets"][readlen] = (max_relative_pos*(-1))+2
469 ): 468 #If there are no reads in CDS regions for a specific length, it may not be present in master_trip_dict
470 for primetype in ["fiveprime", "threeprime"]: 469 if readlen in master_trip_dict[primetype]:
471 # Create the triplet periodicity and metainfo plots based on both the 5' and 3' ends of reads 470 final_offsets[primetype]["read_scores"][readlen] = master_trip_dict[primetype][readlen]["score"]
472 for readlength in master_read_dict[tran]["unambig"]: 471 else:
473 # print "readlength", readlength 472 final_offsets[primetype]["read_scores"][readlen] = 0.0
474 # for each fiveprime postion for this readlength within this transcript 473
475 for raw_pos in master_read_dict[tran]["unambig"][readlength]: 474
476 # print "raw pos", raw_pos 475 master_read_dict["unmapped_reads"] = unmapped_reads
477 trip_periodicity_reads += 1 476 master_read_dict["offsets"] = final_offsets
478 if primetype == "fiveprime": 477 master_read_dict["trip_periodicity"] = master_trip_dict
479 # get the five prime postion minus the cds start postion 478 master_read_dict["desc"] = "Null"
480 real_pos = raw_pos - cds_start 479 master_read_dict["mapped_reads"] = mapped_reads
481 rel_stop_pos = raw_pos - cds_stop 480 master_read_dict["nuc_counts"] = nuc_count_dict
482 elif primetype == "threeprime": 481 master_read_dict["dinuc_counts"] = dinuc_count_dict
483 real_pos = (raw_pos + readlength) - cds_start 482 master_read_dict["threeprime_nuc_counts"] = threeprime_nuc_count_dict
484 rel_stop_pos = (raw_pos + readlength) - cds_stop 483 master_read_dict["metagene_counts"] = master_offset_dict
485 # get the readcount at the raw postion 484 master_read_dict["stop_metagene_counts"] = master_metagene_stop_dict
486 readcount = master_read_dict[tran]["unambig"][readlength][ 485 master_read_dict["read_lengths"] = read_length_dict
487 raw_pos 486 master_read_dict["unambig_read_lengths"] = unambig_read_length_dict
488 ] 487 master_read_dict["coding_counts"] = master_dict["unambiguous_coding_count"]
489 # print "readcount", readcount 488 master_read_dict["noncoding_counts"] = master_dict["unambiguous_non_coding_count"]
490 frame = real_pos % 3 489 master_read_dict["ambiguous_counts"] = master_dict["ambiguously_mapped_reads"]
491 if real_pos >= cds_start and real_pos <= cds_stop: 490 master_read_dict["frequent_unmapped_reads"] = (sorted(unmapped_dict.items(), key=operator.itemgetter(1)))[-2000:]
492 if readlength in master_trip_dict[primetype]: 491 master_read_dict["cutadapt_removed"] = 0
493 master_trip_dict[primetype][readlength][ 492 master_read_dict["rrna_removed"] = 0
494 str(frame) 493 #If no reads are removed by minus m there won't be an entry in the log file, so initiliase with 0 first and change if there is a line
495 ] += readcount 494 master_read_dict["removed_minus_m"] = 0
496 else: 495 master_dict["removed_minus_m"] = 0
497 master_trip_dict[primetype][readlength] = { 496 # We work out the total counts for 5', cds 3' for differential translation here, would be better to do thisn in processor but need the offsets
498 "0": 0.0, 497 master_read_dict["unambiguous_all_totals"] = {}
499 "1": 0.0, 498 master_read_dict["unambiguous_fiveprime_totals"] = {}
500 "2": 0.0, 499 master_read_dict["unambiguous_cds_totals"] = {}
501 } 500 master_read_dict["unambiguous_threeprime_totals"] = {}
502 master_trip_dict[primetype][readlength][ 501
503 str(frame) 502 master_read_dict["ambiguous_all_totals"] = {}
504 ] += readcount 503 master_read_dict["ambiguous_fiveprime_totals"] = {}
505 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict 504 master_read_dict["ambiguous_cds_totals"] = {}
506 if real_pos > (-600) and real_pos < (601): 505 master_read_dict["ambiguous_threeprime_totals"] = {}
507 if readlength in master_offset_dict[primetype]: 506 print ("calculating transcript counts")
508 if ( 507 for tran in master_read_dict:
509 real_pos 508 if tran in transcriptome_info_dict:
510 in master_offset_dict[primetype][readlength] 509 five_total = 0
511 ): 510 cds_total = 0
512 # print "real pos in offset dict" 511 three_total = 0
513 master_offset_dict[primetype][readlength][ 512
514 real_pos 513 ambig_five_total = 0
515 ] += readcount 514 ambig_cds_total = 0
516 else: 515 ambig_three_total = 0
517 # print "real pos not in offset dict" 516
518 master_offset_dict[primetype][readlength][ 517 cds_start = transcriptome_info_dict[tran]["cds_start"]
519 real_pos 518 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
520 ] = readcount 519
521 else: 520 for readlen in master_read_dict[tran]["unambig"]:
522 # initiliase with zero to avoid missing neighbours below 521 if readlen in final_offsets["fiveprime"]["offsets"]:
523 # print "initialising with zeros" 522 offset = final_offsets["fiveprime"]["offsets"][readlen]
524 master_offset_dict[primetype][readlength] = {} 523 else:
525 for i in range(-600, 601): 524 offset = 15
526 master_offset_dict[primetype][readlength][i] = 0 525 for pos in master_read_dict[tran]["unambig"][readlen]:
527 master_offset_dict[primetype][readlength][ 526 real_pos = pos+offset
528 real_pos 527 if cds_start is None or cds_stop is None:
529 ] += readcount 528 three_total += master_read_dict[tran]["unambig"][readlen][pos]
530 529 else:
531 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict 530 if real_pos <cds_start:
532 if rel_stop_pos > (-600) and rel_stop_pos < (601): 531 five_total += master_read_dict[tran]["unambig"][readlen][pos]
533 if readlength in master_metagene_stop_dict[primetype]: 532 elif real_pos >=cds_start and real_pos <= cds_stop:
534 if ( 533 cds_total += master_read_dict[tran]["unambig"][readlen][pos]
535 rel_stop_pos 534 elif real_pos > cds_stop:
536 in master_metagene_stop_dict[primetype][readlength] 535 three_total += master_read_dict[tran]["unambig"][readlen][pos]
537 ): 536 master_read_dict["unambiguous_all_totals"][tran] = five_total+cds_total+three_total
538 master_metagene_stop_dict[primetype][readlength][ 537 master_read_dict["unambiguous_fiveprime_totals"][tran] = five_total
539 rel_stop_pos 538 master_read_dict["unambiguous_cds_totals"][tran] = cds_total
540 ] += readcount 539 master_read_dict["unambiguous_threeprime_totals"][tran] = three_total
541 else: 540
542 master_metagene_stop_dict[primetype][readlength][ 541 for readlen in master_read_dict[tran]["ambig"]:
543 rel_stop_pos 542 if readlen in final_offsets["fiveprime"]["offsets"]:
544 ] = readcount 543 offset = final_offsets["fiveprime"]["offsets"][readlen]
545 else: 544 else:
546 # initiliase with zero to avoid missing neighbours below 545 offset = 15
547 master_metagene_stop_dict[primetype][readlength] = {} 546 for pos in master_read_dict[tran]["ambig"][readlen]:
548 for i in range(-600, 601): 547 if cds_start is None or cds_stop is None:
549 master_metagene_stop_dict[primetype][readlength][ 548 ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos]
550 i 549 else:
551 ] = 0 550 real_pos = pos+offset
552 master_metagene_stop_dict[primetype][readlength][ 551 if real_pos < cds_start:
553 rel_stop_pos 552 ambig_five_total += master_read_dict[tran]["ambig"][readlen][pos]
554 ] += readcount 553 elif real_pos >=cds_start and real_pos <= cds_stop:
555 554 ambig_cds_total += master_read_dict[tran]["ambig"][readlen][pos]
556 # master trip dict is now made up of readlengths with 3 frames and a count associated with each frame 555 elif real_pos > cds_stop:
557 # create a 'score' for each readlength by putting the max frame count over the second highest frame count 556 ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos]
558 for primetype in ["fiveprime", "threeprime"]: 557
559 for subreadlength in master_trip_dict[primetype]: 558 master_read_dict["ambiguous_all_totals"][tran] = five_total+cds_total+three_total+ambig_five_total+ambig_cds_total+ambig_three_total
560 maxcount = 0 559 master_read_dict["ambiguous_fiveprime_totals"][tran] = five_total+ambig_five_total
561 secondmaxcount = 0 560 master_read_dict["ambiguous_cds_totals"][tran] = cds_total+ambig_cds_total
562 for frame in master_trip_dict[primetype][subreadlength]: 561 master_read_dict["ambiguous_threeprime_totals"][tran] = three_total+ambig_three_total
563 if master_trip_dict[primetype][subreadlength][frame] > maxcount: 562
564 maxcount = master_trip_dict[primetype][subreadlength][frame] 563 print ("Writing out to sqlite file")
565 for frame in master_trip_dict[primetype][subreadlength]: 564 sqlite_db = SqliteDict(outputfile,autocommit=False)
566 if ( 565 for key in master_read_dict:
567 master_trip_dict[primetype][subreadlength][frame] > secondmaxcount 566 sqlite_db[key] = master_read_dict[key]
568 and master_trip_dict[primetype][subreadlength][frame] != maxcount 567 sqlite_db["description"] = desc
569 ): 568 sqlite_db.commit()
570 secondmaxcount = master_trip_dict[primetype][subreadlength][frame] 569 sqlite_db.close()
571 # a perfect score would be 0 meaning there is only a single peak, the worst score would be 1 meaning two highest peaks are the same height
572 master_trip_dict[primetype][subreadlength]["score"] = float(
573 secondmaxcount
574 ) / float(maxcount)
575 # This part is to determine what offsets to give each read length
576 print("Calculating offsets")
577 for primetype in ["fiveprime", "threeprime"]:
578 for readlen in master_offset_dict[primetype]:
579 accepted_len = False
580 max_relative_pos = 0
581 max_relative_count = 0
582 for relative_pos in master_offset_dict[primetype][readlen]:
583 # This line is to ensure we don't choose an offset greater than the readlength (in cases of a large peak far up/downstream)
584 if abs(relative_pos) < 10 or abs(relative_pos) > (readlen - 10):
585 continue
586 if (
587 master_offset_dict[primetype][readlen][relative_pos]
588 > max_relative_count
589 ):
590 max_relative_pos = relative_pos
591 max_relative_count = master_offset_dict[primetype][readlen][
592 relative_pos
593 ]
594 # print "for readlen {} the max_relative pos is {}".format(readlen, max_relative_pos)
595 if primetype == "fiveprime":
596 # -3 to get from p-site to a-site, +1 to account for 1 based co-ordinates, resulting in -2 overall
597 final_offsets[primetype]["offsets"][readlen] = abs(max_relative_pos - 2)
598 elif primetype == "threeprime":
599 # +3 to get from p-site to a-site, -1 to account for 1 based co-ordinates, resulting in +2 overall
600 final_offsets[primetype]["offsets"][readlen] = (
601 max_relative_pos * (-1)
602 ) + 2
603 # If there are no reads in CDS regions for a specific length, it may not be present in master_trip_dict
604 if readlen in master_trip_dict[primetype]:
605 final_offsets[primetype]["read_scores"][readlen] = master_trip_dict[
606 primetype
607 ][readlen]["score"]
608 else:
609 final_offsets[primetype]["read_scores"][readlen] = 0.0
610 master_read_dict["offsets"] = final_offsets
611 master_read_dict["trip_periodicity"] = master_trip_dict
612 master_read_dict["desc"] = "Null"
613 master_read_dict["mapped_reads"] = mapped_reads
614 master_read_dict["nuc_counts"] = nuc_count_dict
615 master_read_dict["dinuc_counts"] = dinuc_count_dict
616 master_read_dict["threeprime_nuc_counts"] = threeprime_nuc_count_dict
617 master_read_dict["metagene_counts"] = master_offset_dict
618 master_read_dict["stop_metagene_counts"] = master_metagene_stop_dict
619 master_read_dict["read_lengths"] = read_length_dict
620 master_read_dict["unambig_read_lengths"] = unambig_read_length_dict
621 master_read_dict["coding_counts"] = master_dict["unambiguous_coding_count"]
622 master_read_dict["noncoding_counts"] = master_dict["unambiguous_non_coding_count"]
623 master_read_dict["ambiguous_counts"] = master_dict["ambiguously_mapped_reads"]
624 master_read_dict["frequent_unmapped_reads"] = (
625 sorted(unmapped_dict.items(), key=operator.itemgetter(1))
626 )[-2000:]
627 master_read_dict["cutadapt_removed"] = 0
628 master_read_dict["rrna_removed"] = 0
629 # If no reads are removed by minus m there won't be an entry in the log file, so initiliase with 0 first and change if there is a line
630 master_read_dict["removed_minus_m"] = 0
631 master_dict["removed_minus_m"] = 0
632 # We work out the total counts for 5', cds 3' for differential translation here, would be better to do thisn in processor but need the offsets
633 master_read_dict["unambiguous_all_totals"] = {}
634 master_read_dict["unambiguous_fiveprime_totals"] = {}
635 master_read_dict["unambiguous_cds_totals"] = {}
636 master_read_dict["unambiguous_threeprime_totals"] = {}
637
638 master_read_dict["ambiguous_all_totals"] = {}
639 master_read_dict["ambiguous_fiveprime_totals"] = {}
640 master_read_dict["ambiguous_cds_totals"] = {}
641 master_read_dict["ambiguous_threeprime_totals"] = {}
642 print("calculating transcript counts")
643 for tran in master_read_dict:
644 if tran in transcriptome_info_dict:
645 five_total = 0
646 cds_total = 0
647 three_total = 0
648
649 ambig_five_total = 0
650 ambig_cds_total = 0
651 ambig_three_total = 0
652
653 cds_start = transcriptome_info_dict[tran]["cds_start"]
654 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
655 for readlen in master_read_dict[tran]["unambig"]:
656 if readlen in final_offsets["fiveprime"]["offsets"]:
657 offset = final_offsets["fiveprime"]["offsets"][readlen]
658 else:
659 offset = 15
660 for pos in master_read_dict[tran]["unambig"][readlen]:
661 real_pos = pos + offset
662 if real_pos < cds_start:
663 five_total += master_read_dict[tran]["unambig"][readlen][pos]
664 elif real_pos >= cds_start and real_pos <= cds_stop:
665 cds_total += master_read_dict[tran]["unambig"][readlen][pos]
666 elif real_pos > cds_stop:
667 three_total += master_read_dict[tran]["unambig"][readlen][pos]
668 master_read_dict["unambiguous_all_totals"][tran] = (
669 five_total + cds_total + three_total
670 )
671 master_read_dict["unambiguous_fiveprime_totals"][tran] = five_total
672 master_read_dict["unambiguous_cds_totals"][tran] = cds_total
673 master_read_dict["unambiguous_threeprime_totals"][tran] = three_total
674
675 for readlen in master_read_dict[tran]["ambig"]:
676 if readlen in final_offsets["fiveprime"]["offsets"]:
677 offset = final_offsets["fiveprime"]["offsets"][readlen]
678 else:
679 offset = 15
680 for pos in master_read_dict[tran]["ambig"][readlen]:
681 real_pos = pos + offset
682 if real_pos < cds_start:
683 ambig_five_total += master_read_dict[tran]["ambig"][readlen][
684 pos
685 ]
686 elif real_pos >= cds_start and real_pos <= cds_stop:
687 ambig_cds_total += master_read_dict[tran]["ambig"][readlen][pos]
688 elif real_pos > cds_stop:
689 ambig_three_total += master_read_dict[tran]["ambig"][readlen][
690 pos
691 ]
692
693 master_read_dict["ambiguous_all_totals"][tran] = (
694 five_total
695 + cds_total
696 + three_total
697 + ambig_five_total
698 + ambig_cds_total
699 + ambig_three_total
700 )
701 master_read_dict["ambiguous_fiveprime_totals"][tran] = (
702 five_total + ambig_five_total
703 )
704 master_read_dict["ambiguous_cds_totals"][tran] = cds_total + ambig_cds_total
705 master_read_dict["ambiguous_threeprime_totals"][tran] = (
706 three_total + ambig_three_total
707 )
708 print("Writing out to sqlite file")
709 sqlite_db = SqliteDict(outputfile, autocommit=False)
710 for key in master_read_dict:
711 sqlite_db[key] = master_read_dict[key]
712 sqlite_db["description"] = desc
713 sqlite_db.commit()
714 sqlite_db.close()
715 570
716 571
717 if __name__ == "__main__": 572 if __name__ == "__main__":
718 if len(sys.argv) <= 2: 573 if len(sys.argv) <= 2:
719 print( 574 print ("Usage: python bam_to_sqlite.py <path_to_bam_file> <path_to_organism.sqlite> <file_description (optional)>")
720 "Usage: python bam_to_sqlite.py <path_to_bam_file> <path_to_organism.sqlite> <file_description (optional)>" 575 sys.exit()
721 ) 576 bam_filepath = sys.argv[1]
722 sys.exit() 577 annotation_sqlite_filepath = sys.argv[2]
723 bam_filepath = sys.argv[1] 578 #try:
724 annotation_sqlite_filepath = sys.argv[2] 579 # desc = sys.argv[3]
725 try: 580 #except:
726 desc = sys.argv[3] 581 # desc = bam_filepath.split("/")[-1]
727 except: 582 outputfile = bam_filepath+"v2.sqlite"
728 desc = bam_filepath.split("/")[-1] 583 process_bam(bam_filepath,annotation_sqlite_filepath,outputfile)
729
730 outputfile = sys.argv[4]
731 process_bam(bam_filepath, annotation_sqlite_filepath, outputfile, desc)