Mercurial > repos > jackcurragh > trips_viz_bam_to_sqlite
comparison trips_bam_to_sqlite/bam_to_sqlite.py @ 9:0e88342d5794 draft
Uploaded
author | jackcurragh |
---|---|
date | Tue, 25 Oct 2022 08:10:50 +0000 |
parents | 2c6f630c732f |
children | 3d2b1532a1b0 |
comparison
equal
deleted
inserted
replaced
8:0fcffa7365b9 | 9:0e88342d5794 |
---|---|
4 import os | 4 import os |
5 import time | 5 import time |
6 import sqlite3 | 6 import sqlite3 |
7 from sqlitedict import SqliteDict | 7 from sqlitedict import SqliteDict |
8 | 8 |
9 | |
10 def tran_to_genome(tran, pos, transcriptome_info_dict): | 9 def tran_to_genome(tran, pos, transcriptome_info_dict): |
11 # print ("tran",list(transcriptome_info_dict)) | 10 #print ("tran",list(transcriptome_info_dict)) |
12 traninfo = transcriptome_info_dict[tran] | 11 traninfo = transcriptome_info_dict[tran] |
13 chrom = traninfo["chrom"] | 12 chrom = traninfo["chrom"] |
14 strand = traninfo["strand"] | 13 strand = traninfo["strand"] |
15 exons = sorted(traninfo["exons"]) | 14 exons = sorted(traninfo["exons"]) |
16 # print exons | 15 #print exons |
17 if strand == "+": | 16 if strand == "+": |
18 exon_start = 0 | 17 exon_start = 0 |
19 for tup in exons: | 18 for tup in exons: |
20 exon_start = tup[0] | 19 exon_start = tup[0] |
21 exonlen = tup[1] - tup[0] | 20 exonlen = tup[1] - tup[0] |
22 if pos > exonlen: | 21 if pos > exonlen: |
23 pos = (pos - exonlen) - 1 | 22 pos = (pos - exonlen)-1 |
24 else: | 23 else: |
25 break | 24 break |
26 genomic_pos = (exon_start + pos) - 1 | 25 genomic_pos = (exon_start+pos)-1 |
27 elif strand == "-": | 26 elif strand == "-": |
28 exon_start = 0 | 27 exon_start = 0 |
29 for tup in exons[::-1]: | 28 for tup in exons[::-1]: |
30 exon_start = tup[1] | 29 exon_start = tup[1] |
31 exonlen = tup[1] - tup[0] | 30 exonlen = tup[1] - tup[0] |
32 if pos > exonlen: | 31 if pos > exonlen: |
33 pos = (pos - exonlen) - 1 | 32 pos = (pos - exonlen)-1 |
34 else: | 33 else: |
35 break | 34 break |
36 genomic_pos = (exon_start - pos) + 1 | 35 genomic_pos = (exon_start-pos)+1 |
37 return (chrom, genomic_pos) | 36 return (chrom, genomic_pos) |
38 | 37 |
39 | 38 |
40 # Takes a dictionary with a readname as key and a list of lists as value, each sub list has consists of two elements a transcript and the position the read aligns to in the transcript | 39 # Takes a dictionary with a readname as key and a list of lists as value, each sub list has consists of two elements a transcript and the position the read aligns to in the transcript |
41 # This function will count the number of genes that the transcripts correspond to and if less than or equal to 3 will add the relevant value to transcript_counts_dict | 40 # This function will count the number of genes that the transcripts correspond to and if less than or equal to 3 will add the relevant value to transcript_counts_dict |
42 def processor( | 41 def processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,readseq, unambig_read_length_dict): |
43 process_chunk, | 42 readlen = len(readseq) |
44 master_read_dict, | 43 ambiguously_mapped_reads = 0 |
45 transcriptome_info_dict, | 44 #get the read name |
46 master_dict, | 45 read = list(process_chunk)[0] |
47 readseq, | 46 |
48 unambig_read_length_dict, | 47 read_list = process_chunk[read] # a list of lists of all transcripts the read aligns to and the positions |
49 ): | 48 #used to store different genomic poistions |
50 readlen = len(readseq) | 49 genomic_positions = [] |
51 ambiguously_mapped_reads = 0 | 50 |
52 # get the read name | 51 #This section is just to get the different genomic positions the read aligns to |
53 read = list(process_chunk)[0] | 52 |
54 | 53 for listname in process_chunk[read]: |
55 read_list = process_chunk[ | 54 |
56 read | 55 tran = listname[0].replace("-","_").replace("(","").replace(")","") |
57 ] # a list of lists of all transcripts the read aligns to and the positions | 56 |
58 # used to store different genomic poistions | 57 pos = int(listname[1]) |
59 genomic_positions = [] | 58 genomic_pos = tran_to_genome(tran, pos, transcriptome_info_dict) |
60 | 59 #print ("genomic pos",genomic_pos) |
61 # This section is just to get the different genomic positions the read aligns to | 60 if genomic_pos not in genomic_positions: |
62 | 61 genomic_positions.append(genomic_pos) |
63 for listname in process_chunk[read]: | 62 |
64 | 63 #If the read maps unambiguously |
65 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "") | 64 if len(genomic_positions) == 1: |
66 | 65 if readlen not in unambig_read_length_dict: |
67 pos = int(listname[1]) | 66 unambig_read_length_dict[readlen] = 0 |
68 genomic_pos = tran_to_genome(tran, pos, transcriptome_info_dict) | 67 unambig_read_length_dict[readlen] += 1 |
69 # print ("genomic pos",genomic_pos) | 68 #assume this read aligns to a noncoding position, if we find that it does align to a coding region change this to True |
70 if genomic_pos not in genomic_positions: | 69 coding=False |
71 genomic_positions.append(genomic_pos) | 70 |
72 | 71 # For each transcript this read alings to |
73 # If the read maps unambiguously | 72 for listname in process_chunk[read]: |
74 if len(genomic_positions) == 1: | 73 #get the transcript name |
75 if readlen not in unambig_read_length_dict: | 74 tran = listname[0].replace("-","_").replace("(","").replace(")","") |
76 unambig_read_length_dict[readlen] = 0 | 75 #If we haven't come across this transcript already then add to master_read_dict |
77 unambig_read_length_dict[readlen] += 1 | 76 if tran not in master_read_dict: |
78 # assume this read aligns to a noncoding position, if we find that it does align to a coding region change this to True | 77 master_read_dict[tran] = {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}} |
79 coding = False | 78 #get the raw unedited positon, and read tags |
80 | 79 pos = int(listname[1]) |
81 # For each transcript this read alings to | 80 read_tags = listname[2] |
82 for listname in process_chunk[read]: | 81 #If there is mismatches in this line, then modify the postion and readlen (if mismatches at start or end) and add mismatches to dictionary |
83 # get the transcript name | 82 nm_tag = 0 |
84 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "") | 83 |
85 # If we haven't come across this transcript already then add to master_read_dict | 84 for tag in read_tags: |
86 if tran not in master_read_dict: | 85 if tag[0] == "NM": |
87 master_read_dict[tran] = { | 86 nm_tag = int(tag[1]) |
88 "ambig": {}, | 87 if nm_tag > 0: |
89 "unambig": {}, | 88 md_tag = "" |
90 "mismatches": {}, | 89 for tag in read_tags: |
91 "seq": {}, | 90 if tag[0] == "MD": |
92 } | 91 md_tag = tag[1] |
93 # get the raw unedited positon, and read tags | 92 pos_modifier, readlen_modifier,mismatches = get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq) |
94 pos = int(listname[1]) | 93 # Count the mismatches (we only do this for unambiguous) |
95 read_tags = listname[2] | 94 for mismatch in mismatches: |
96 # If there is mismatches in this line, then modify the postion and readlen (if mismatches at start or end) and add mismatches to dictionary | 95 #Ignore mismatches appearing in the first position (due to non templated addition) |
97 nm_tag = 0 | 96 if mismatch != 0: |
98 | 97 char = mismatches[mismatch] |
99 for tag in read_tags: | 98 mismatch_pos = pos + mismatch |
100 if tag[0] == "NM": | 99 if mismatch_pos not in master_read_dict[tran]["seq"]: |
101 nm_tag = int(tag[1]) | 100 master_read_dict[tran]["seq"][mismatch_pos] = {} |
102 if nm_tag > 0: | 101 if char not in master_read_dict[tran]["seq"][mismatch_pos]: |
103 md_tag = "" | 102 master_read_dict[tran]["seq"][mismatch_pos][char] = 0 |
104 for tag in read_tags: | 103 master_read_dict[tran]["seq"][mismatch_pos][char] += 1 |
105 if tag[0] == "MD": | 104 # apply the modifiers |
106 md_tag = tag[1] | 105 #pos = pos+pos_modifier |
107 pos_modifier, readlen_modifier, mismatches = get_mismatch_pos( | 106 #readlen = readlen - readlen_modifier |
108 md_tag, pos, readlen, master_read_dict, tran, readseq | 107 |
109 ) | 108 |
110 # Count the mismatches (we only do this for unambiguous) | 109 try: |
111 for mismatch in mismatches: | 110 cds_start = transcriptome_info_dict[tran]["cds_start"] |
112 # Ignore mismatches appearing in the first position (due to non templated addition) | 111 cds_stop = transcriptome_info_dict[tran]["cds_stop"] |
113 if mismatch != 0: | 112 |
114 char = mismatches[mismatch] | 113 if pos >= cds_start and pos <= cds_stop: |
115 mismatch_pos = pos + mismatch | 114 coding=True |
116 if mismatch_pos not in master_read_dict[tran]["seq"]: | 115 except: |
117 master_read_dict[tran]["seq"][mismatch_pos] = {} | 116 pass |
118 if char not in master_read_dict[tran]["seq"][mismatch_pos]: | 117 |
119 master_read_dict[tran]["seq"][mismatch_pos][char] = 0 | 118 |
120 master_read_dict[tran]["seq"][mismatch_pos][char] += 1 | 119 if readlen in master_read_dict[tran]["unambig"]: |
121 # apply the modifiers | 120 if pos in master_read_dict[tran]["unambig"][readlen]: |
122 # pos = pos+pos_modifier | 121 master_read_dict[tran]["unambig"][readlen][pos] += 1 |
123 # readlen = readlen - readlen_modifier | 122 else: |
124 | 123 master_read_dict[tran]["unambig"][readlen][pos] = 1 |
125 try: | 124 else: |
126 cds_start = transcriptome_info_dict[tran]["cds_start"] | 125 master_read_dict[tran]["unambig"][readlen] = {pos:1} |
127 cds_stop = transcriptome_info_dict[tran]["cds_stop"] | 126 |
128 | 127 if coding == True: |
129 if pos >= cds_start and pos <= cds_stop: | 128 master_dict["unambiguous_coding_count"] += 1 |
130 coding = True | 129 elif coding == False: |
131 except: | 130 master_dict["unambiguous_non_coding_count"] += 1 |
132 pass | 131 |
133 | 132 else: |
134 if readlen in master_read_dict[tran]["unambig"]: | 133 ambiguously_mapped_reads += 1 |
135 if pos in master_read_dict[tran]["unambig"][readlen]: | 134 for listname in process_chunk[read]: |
136 master_read_dict[tran]["unambig"][readlen][pos] += 1 | 135 tran = listname[0].replace("-","_").replace("(","").replace(")","") |
137 else: | 136 if tran not in master_read_dict: |
138 master_read_dict[tran]["unambig"][readlen][pos] = 1 | 137 master_read_dict[tran] = {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}} |
139 else: | 138 pos = int(listname[1]) |
140 master_read_dict[tran]["unambig"][readlen] = {pos: 1} | 139 read_tags = listname[2] |
141 | 140 nm_tag = 0 |
142 if coding == True: | 141 for tag in read_tags: |
143 master_dict["unambiguous_coding_count"] += 1 | 142 if tag[0] == "NM": |
144 elif coding == False: | 143 nm_tag = int(tag[1]) |
145 master_dict["unambiguous_non_coding_count"] += 1 | 144 if nm_tag > 0: |
146 | 145 md_tag = "" |
147 else: | 146 for tag in read_tags: |
148 ambiguously_mapped_reads += 1 | 147 if tag[0] == "MD": |
149 for listname in process_chunk[read]: | 148 md_tag = tag[1] |
150 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "") | 149 pos_modifier, readlen_modifier,mismatches = get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq) |
151 if tran not in master_read_dict: | 150 # apply the modifiers |
152 master_read_dict[tran] = { | 151 #pos = pos+pos_modifier |
153 "ambig": {}, | 152 #readlen = readlen - readlen_modifier |
154 "unambig": {}, | 153 if readlen in master_read_dict[tran]["ambig"]: |
155 "mismatches": {}, | 154 if pos in master_read_dict[tran]["ambig"][readlen]: |
156 "seq": {}, | 155 master_read_dict[tran]["ambig"][readlen][pos] += 1 |
157 } | 156 else: |
158 pos = int(listname[1]) | 157 master_read_dict[tran]["ambig"][readlen][pos] = 1 |
159 read_tags = listname[2] | 158 else: |
160 nm_tag = 0 | 159 master_read_dict[tran]["ambig"][readlen] = {pos:1} |
161 for tag in read_tags: | 160 return ambiguously_mapped_reads |
162 if tag[0] == "NM": | 161 |
163 nm_tag = int(tag[1]) | 162 |
164 if nm_tag > 0: | 163 def get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq): |
165 md_tag = "" | 164 nucs = ["A","T","G","C"] |
166 for tag in read_tags: | 165 mismatches = {} |
167 if tag[0] == "MD": | 166 total_so_far = 0 |
168 md_tag = tag[1] | 167 prev_char = "" |
169 pos_modifier, readlen_modifier, mismatches = get_mismatch_pos( | 168 for char in md_tag: |
170 md_tag, pos, readlen, master_read_dict, tran, readseq | 169 if char in nucs: |
171 ) | 170 if prev_char != "": |
172 # apply the modifiers | 171 total_so_far += int(prev_char) |
173 # pos = pos+pos_modifier | 172 prev_char = "" |
174 # readlen = readlen - readlen_modifier | 173 mismatches[total_so_far+len(mismatches)] = (readseq[total_so_far+len(mismatches)]) |
175 if readlen in master_read_dict[tran]["ambig"]: | 174 else: |
176 if pos in master_read_dict[tran]["ambig"][readlen]: | 175 if char != "^" and char != "N": |
177 master_read_dict[tran]["ambig"][readlen][pos] += 1 | 176 if prev_char == "": |
178 else: | 177 prev_char = char |
179 master_read_dict[tran]["ambig"][readlen][pos] = 1 | 178 else: |
180 else: | 179 total_so_far += int(prev_char+char) |
181 master_read_dict[tran]["ambig"][readlen] = {pos: 1} | 180 prev_char = "" |
182 return ambiguously_mapped_reads | 181 readlen_modifier = 0 |
183 | 182 pos_modifier = 0 |
184 | 183 five_ok = False |
185 def get_mismatch_pos(md_tag, pos, readlen, master_read_dict, tran, readseq): | 184 three_ok = False |
186 nucs = ["A", "T", "G", "C"] | 185 while five_ok == False: |
187 mismatches = {} | 186 for i in range(0,readlen): |
188 total_so_far = 0 | 187 if i in mismatches: |
189 prev_char = "" | 188 pos_modifier += 1 |
190 for char in md_tag: | 189 readlen_modifier += 1 |
191 if char in nucs: | 190 else: |
192 if prev_char != "": | 191 five_ok = True |
193 total_so_far += int(prev_char) | 192 break |
194 prev_char = "" | 193 five_ok = True |
195 mismatches[total_so_far + len(mismatches)] = readseq[ | 194 |
196 total_so_far + len(mismatches) | 195 |
197 ] | 196 while three_ok == False: |
198 else: | 197 for i in range(readlen-1,0,-1): |
199 if char != "^" and char != "N": | 198 if i in mismatches: |
200 if prev_char == "": | 199 readlen_modifier += 1 |
201 prev_char = char | 200 else: |
202 else: | 201 three_ok = True |
203 total_so_far += int(prev_char + char) | 202 break |
204 prev_char = "" | 203 three_ok = True |
205 readlen_modifier = 0 | 204 |
206 pos_modifier = 0 | 205 |
207 five_ok = False | 206 return (pos_modifier, readlen_modifier, mismatches) |
208 three_ok = False | 207 |
209 while five_ok == False: | 208 |
210 for i in range(0, readlen): | 209 |
211 if i in mismatches: | 210 def process_bam(bam_filepath, transcriptome_info_dict_path,outputfile): |
212 pos_modifier += 1 | 211 desc = "NULL" |
213 readlen_modifier += 1 | 212 start_time = time.time() |
214 else: | 213 study_dict ={} |
215 five_ok = True | 214 nuc_count_dict = {"mapped":{},"unmapped":{}} |
216 break | 215 dinuc_count_dict = {} |
217 five_ok = True | 216 threeprime_nuc_count_dict = {"mapped":{},"unmapped":{}} |
218 | 217 read_length_dict = {} |
219 while three_ok == False: | 218 unambig_read_length_dict = {} |
220 for i in range(readlen - 1, 0, -1): | 219 unmapped_dict = {} |
221 if i in mismatches: | 220 master_dict = {"unambiguous_non_coding_count":0,"unambiguous_coding_count":0,"current_dir":os.getcwd()} |
222 readlen_modifier += 1 | 221 |
223 else: | 222 transcriptome_info_dict = {} |
224 three_ok = True | 223 connection = sqlite3.connect(transcriptome_info_dict_path) |
225 break | 224 cursor = connection.cursor() |
226 three_ok = True | 225 cursor.execute("SELECT transcript,cds_start,cds_stop,length,strand,chrom,tran_type from transcripts;") |
227 | 226 result = cursor.fetchall() |
228 return (pos_modifier, readlen_modifier, mismatches) | 227 for row in result: |
229 | 228 transcriptome_info_dict[str(row[0])] = {"cds_start":row[1],"cds_stop":row[2],"length":row[3],"strand":row[4],"chrom":row[5],"exons":[],"tran_type":row[6]} |
230 | 229 #print list(transcriptome_info_dict)[:10] |
231 def process_bam(bam_filepath, transcriptome_info_dict_path, outputfile, desc): | 230 |
232 desc = desc | 231 cursor.execute("SELECT * from exons;") |
233 start_time = time.time() | 232 result = cursor.fetchall() |
234 study_dict = {} | 233 for row in result: |
235 nuc_count_dict = {"mapped": {}, "unmapped": {}} | 234 transcriptome_info_dict[str(row[0])]["exons"].append((row[1],row[2])) |
236 dinuc_count_dict = {} | 235 |
237 threeprime_nuc_count_dict = {"mapped": {}, "unmapped": {}} | 236 #it might be the case that there are no multimappers, so set this to 0 first to avoid an error, it will be overwritten later if there is multimappers |
238 read_length_dict = {} | 237 multimappers = 0 |
239 unambig_read_length_dict = {} | 238 unmapped_reads = 0 |
240 unmapped_dict = {} | 239 unambiguous_coding_count = 0 |
241 master_dict = { | 240 unambiguous_non_coding_count = 0 |
242 "unambiguous_non_coding_count": 0, | 241 trip_periodicity_reads = 0 |
243 "unambiguous_coding_count": 0, | 242 |
244 "current_dir": os.getcwd(), | 243 final_offsets = {"fiveprime":{"offsets":{}, "read_scores":{}}, "threeprime":{"offsets":{}, "read_scores":{}}} |
245 } | 244 master_read_dict = {} |
246 | 245 prev_seq = "" |
247 transcriptome_info_dict = {} | 246 process_chunk = {"read_name":[["placeholder_tran","1","28"]]} |
248 connection = sqlite3.connect(transcriptome_info_dict_path) | 247 mapped_reads = 0 |
249 cursor = connection.cursor() | 248 ambiguously_mapped_reads = 0 |
250 cursor.execute( | 249 master_trip_dict = {"fiveprime":{}, "threeprime":{}} |
251 "SELECT transcript,cds_start,cds_stop,length,strand,chrom,tran_type from transcripts;" | 250 master_offset_dict = {"fiveprime":{}, "threeprime":{}} |
252 ) | 251 master_metagene_stop_dict = {"fiveprime":{}, "threeprime":{}} |
253 result = cursor.fetchall() | 252 |
254 for row in result: | 253 pysam.set_verbosity(0) |
255 transcriptome_info_dict[str(row[0])] = { | 254 infile = pysam.Samfile(bam_filepath, "rb") |
256 "cds_start": row[1], | 255 |
257 "cds_stop": row[2], | 256 header = infile.header["HD"] |
258 "length": row[3], | 257 unsorted = False |
259 "strand": row[4], | 258 if "SO" in header: |
260 "chrom": row[5], | 259 if header["SO"] != "queryname": |
261 "exons": [], | 260 unsorted = True |
262 "tran_type": row[6], | 261 else: |
263 } | 262 unsorted = True |
264 # print list(transcriptome_info_dict)[:10] | 263 if unsorted == True: |
265 | 264 print ("ERROR: Bam file appears to be unsorted or not sorted by read name. To sort by read name use the command: samtools sort -n input.bam output.bam") |
266 cursor.execute("SELECT * from exons;") | 265 print (header,bam_filepath) |
267 result = cursor.fetchall() | 266 sys.exit() |
268 for row in result: | 267 total_bam_lines = 0 |
269 transcriptome_info_dict[str(row[0])]["exons"].append((row[1], row[2])) | 268 all_ref_ids = infile.references |
270 | 269 |
271 # it might be the case that there are no multimappers, so set this to 0 first to avoid an error, it will be overwritten later if there is multimappers | 270 for read in infile.fetch(until_eof=True): |
272 multimappers = 0 | 271 total_bam_lines += 1 |
273 unmapped_reads = 0 | 272 if not read.is_unmapped: |
274 unambiguous_coding_count = 0 | 273 ref = read.reference_id |
275 unambiguous_non_coding_count = 0 | 274 tran = (all_ref_ids[ref]).split(".")[0] |
276 trip_periodicity_reads = 0 | 275 mapped_reads += 1 |
277 | 276 if mapped_reads%1000000 == 0: |
278 final_offsets = { | 277 print ("{} reads parsed at {}".format(mapped_reads,(time.time()-start_time))) |
279 "fiveprime": {"offsets": {}, "read_scores": {}}, | 278 pos = read.reference_start |
280 "threeprime": {"offsets": {}, "read_scores": {}}, | 279 readname = read.query_name |
281 } | 280 read_tags = read.tags |
282 master_read_dict = {} | 281 if readname == list(process_chunk)[0]: |
283 prev_seq = "" | 282 process_chunk[readname].append([tran,pos,read_tags]) |
284 process_chunk = {"read_name": [["placeholder_tran", "1", "28"]]} | 283 #if the current read is different from previous reads send 'process_chunk' to the 'processor' function, then start 'process_chunk' over using current read |
285 mapped_reads = 0 | 284 else: |
286 ambiguously_mapped_reads = 0 | 285 if list(process_chunk)[0] != "read_name": |
287 master_trip_dict = {"fiveprime": {}, "threeprime": {}} | 286 |
288 master_offset_dict = {"fiveprime": {}, "threeprime": {}} | 287 #At this point we work out readseq, we do this for multiple reasons, firstly so we don't count the sequence from a read multiple times, just because |
289 master_metagene_stop_dict = {"fiveprime": {}, "threeprime": {}} | 288 # it aligns multiple times and secondly we only call read.seq once (read.seq is computationally expensive) |
290 | 289 seq = read.seq |
291 os.system(f'samtools sort -n {bam_filepath} -o {bam_filepath}_n_sorted.bam') | 290 readlen = len(seq) |
292 pysam.set_verbosity(0) | 291 |
293 infile = pysam.Samfile(f"{bam_filepath}_n_sorted.bam", "rb") | 292 # Note if a read maps ambiguously it will still be counted toward the read length distribution (however it will only be counted once, not each time it maps) |
294 header = infile.header["HD"] | 293 if readlen not in read_length_dict: |
295 unsorted = False | 294 read_length_dict[readlen] = 0 |
296 if "SO" in header: | 295 read_length_dict[readlen] += 1 |
297 if header["SO"] != "queryname": | 296 |
298 unsorted = True | 297 if readlen not in nuc_count_dict["mapped"]: |
299 else: | 298 nuc_count_dict["mapped"][readlen] = {} |
300 unsorted = True | 299 if readlen not in threeprime_nuc_count_dict["mapped"]: |
301 if unsorted == True: | 300 threeprime_nuc_count_dict["mapped"][readlen] = {} |
302 print( | 301 if readlen not in dinuc_count_dict: |
303 "ERROR: Bam file appears to be unsorted or not sorted by read name. To sort by read name use the command: samtools sort -n input.bam output.bam" | 302 dinuc_count_dict[readlen] = {"AA":0, "TA":0, "GA":0, "CA":0, |
304 ) | 303 "AT":0, "TT":0, "GT":0, "CT":0, |
305 print(header, bam_filepath) | 304 "AG":0, "TG":0, "GG":0, "CG":0, |
306 sys.exit() | 305 "AC":0, "TC":0, "GC":0, "CC":0} |
307 total_bam_lines = 0 | 306 |
308 all_ref_ids = infile.references | 307 for i in range(0,len(seq)): |
309 | 308 if i not in nuc_count_dict["mapped"][readlen]: |
310 for read in infile.fetch(until_eof=True): | 309 nuc_count_dict["mapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0} |
311 total_bam_lines += 1 | 310 nuc_count_dict["mapped"][readlen][i][seq[i]] += 1 |
312 if not read.is_unmapped: | 311 |
313 ref = read.reference_id | 312 for i in range(0,len(seq)): |
314 tran = (all_ref_ids[ref]).split(".")[0] | 313 try: |
315 mapped_reads += 1 | 314 dinuc_count_dict[readlen][seq[i:i+2]] += 1 |
316 if mapped_reads % 1000000 == 0: | 315 except: |
317 print( | 316 pass |
318 "{} reads parsed at {}".format( | 317 |
319 mapped_reads, (time.time() - start_time) | 318 for i in range(len(seq),0,-1): |
320 ) | 319 dist = i-len(seq) |
321 ) | 320 if dist not in threeprime_nuc_count_dict["mapped"][readlen]: |
322 pos = read.reference_start | 321 threeprime_nuc_count_dict["mapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0} |
323 readname = read.query_name | 322 threeprime_nuc_count_dict["mapped"][readlen][dist][seq[dist]] += 1 |
324 read_tags = read.tags | 323 ambiguously_mapped_reads += processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,prev_seq, unambig_read_length_dict) |
325 if readname == list(process_chunk)[0]: | 324 process_chunk = {readname:[[tran, pos, read_tags]]} |
326 process_chunk[readname].append([tran, pos, read_tags]) | 325 prev_seq = read.seq |
327 # if the current read is different from previous reads send 'process_chunk' to the 'processor' function, then start 'process_chunk' over using current read | 326 else: |
328 else: | 327 unmapped_reads += 1 |
329 if list(process_chunk)[0] != "read_name": | 328 |
330 | 329 # Add this unmapped read to unmapped_dict so we can see what the most frequent unmapped read is. |
331 # At this point we work out readseq, we do this for multiple reasons, firstly so we don't count the sequence from a read multiple times, just because | 330 seq = read.seq |
332 # it aligns multiple times and secondly we only call read.seq once (read.seq is computationally expensive) | 331 readlen = len(seq) |
333 seq = read.seq | 332 if seq in unmapped_dict: |
334 readlen = len(seq) | 333 unmapped_dict[seq] += 1 |
335 | 334 else: |
336 # Note if a read maps ambiguously it will still be counted toward the read length distribution (however it will only be counted once, not each time it maps) | 335 unmapped_dict[seq] = 1 |
337 if readlen not in read_length_dict: | 336 |
338 read_length_dict[readlen] = 0 | 337 # Populate the nuc_count_dict with this unmapped read |
339 read_length_dict[readlen] += 1 | 338 if readlen not in nuc_count_dict["unmapped"]: |
340 | 339 nuc_count_dict["unmapped"][readlen] = {} |
341 if readlen not in nuc_count_dict["mapped"]: | 340 for i in range(0,len(seq)): |
342 nuc_count_dict["mapped"][readlen] = {} | 341 if i not in nuc_count_dict["unmapped"][readlen]: |
343 if readlen not in threeprime_nuc_count_dict["mapped"]: | 342 nuc_count_dict["unmapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0} |
344 threeprime_nuc_count_dict["mapped"][readlen] = {} | 343 nuc_count_dict["unmapped"][readlen][i][seq[i]] += 1 |
345 if readlen not in dinuc_count_dict: | 344 |
346 dinuc_count_dict[readlen] = { | 345 if readlen not in threeprime_nuc_count_dict["unmapped"]: |
347 "AA": 0, | 346 threeprime_nuc_count_dict["unmapped"][readlen] = {} |
348 "TA": 0, | 347 |
349 "GA": 0, | 348 for i in range(len(seq),0,-1): |
350 "CA": 0, | 349 dist = i-len(seq) |
351 "AT": 0, | 350 if dist not in threeprime_nuc_count_dict["unmapped"][readlen]: |
352 "TT": 0, | 351 threeprime_nuc_count_dict["unmapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0} |
353 "GT": 0, | 352 threeprime_nuc_count_dict["unmapped"][readlen][dist][seq[dist]] += 1 |
354 "CT": 0, | 353 |
355 "AG": 0, | 354 #add stats about mapped/unmapped reads to file dict which will be used for the final report |
356 "TG": 0, | 355 master_dict["total_bam_lines"] = total_bam_lines |
357 "GG": 0, | 356 master_dict["mapped_reads"] = mapped_reads |
358 "CG": 0, | 357 master_dict["unmapped_reads"] = unmapped_reads |
359 "AC": 0, | 358 master_dict["ambiguously_mapped_reads"] = ambiguously_mapped_reads |
360 "TC": 0, | 359 |
361 "GC": 0, | 360 if "read_name" in master_read_dict: |
362 "CC": 0, | 361 del master_read_dict["read_name"] |
363 } | 362 print ("BAM file processed") |
364 | 363 print ("Creating metagenes, triplet periodicity plots, etc.") |
365 for i in range(0, len(seq)): | 364 |
366 if i not in nuc_count_dict["mapped"][readlen]: | 365 for tran in master_read_dict: |
367 nuc_count_dict["mapped"][readlen][i] = { | 366 try: |
368 "A": 0, | 367 cds_start = int(0 if transcriptome_info_dict[tran]["cds_start"] is None else transcriptome_info_dict[tran]["cds_start"]) |
369 "T": 0, | 368 cds_stop = int(0 if transcriptome_info_dict[tran]["cds_stop"] is None else transcriptome_info_dict[tran]["cds_stop"]) |
370 "G": 0, | 369 # print(tran, type(cds_start)) |
371 "C": 0, | 370 except: |
372 "N": 0, | 371 print("Exception: ", tran) |
373 } | 372 continue |
374 nuc_count_dict["mapped"][readlen][i][seq[i]] += 1 | 373 |
375 | 374 tranlen = transcriptome_info_dict[tran]["length"] |
376 for i in range(0, len(seq)): | 375 #Use this to discard transcripts with no 5' leader or 3' trailer |
377 try: | 376 if cds_start > 1 and cds_stop < tranlen and transcriptome_info_dict[tran]["tran_type"] == 1: |
378 dinuc_count_dict[readlen][seq[i : i + 2]] += 1 | 377 for primetype in ["fiveprime", "threeprime"]: |
379 except: | 378 # Create the triplet periodicity and metainfo plots based on both the 5' and 3' ends of reads |
380 pass | 379 for readlength in master_read_dict[tran]["unambig"]: |
381 | 380 #print "readlength", readlength |
382 for i in range(len(seq), 0, -1): | 381 # for each fiveprime postion for this readlength within this transcript |
383 dist = i - len(seq) | 382 for raw_pos in master_read_dict[tran]["unambig"][readlength]: |
384 if dist not in threeprime_nuc_count_dict["mapped"][readlen]: | 383 #print "raw pos", raw_pos |
385 threeprime_nuc_count_dict["mapped"][readlen][dist] = { | 384 trip_periodicity_reads += 1 |
386 "A": 0, | 385 if primetype == "fiveprime": |
387 "T": 0, | 386 # get the five prime postion minus the cds start postion |
388 "G": 0, | 387 real_pos = raw_pos-cds_start |
389 "C": 0, | 388 rel_stop_pos = raw_pos-cds_stop |
390 "N": 0, | 389 elif primetype == "threeprime": |
391 } | 390 real_pos = (raw_pos+readlength)-cds_start |
392 threeprime_nuc_count_dict["mapped"][readlen][dist][ | 391 rel_stop_pos = (raw_pos+readlength)-cds_stop |
393 seq[dist] | 392 #get the readcount at the raw postion |
394 ] += 1 | 393 readcount = master_read_dict[tran]["unambig"][readlength][raw_pos] |
395 ambiguously_mapped_reads += processor( | 394 #print "readcount", readcount |
396 process_chunk, | 395 frame = (real_pos%3) |
397 master_read_dict, | 396 if real_pos >= cds_start and real_pos <= cds_stop: |
398 transcriptome_info_dict, | 397 if readlength in master_trip_dict[primetype]: |
399 master_dict, | 398 master_trip_dict[primetype][readlength][str(frame)] += readcount |
400 prev_seq, | 399 else: |
401 unambig_read_length_dict, | 400 master_trip_dict[primetype][readlength]= {"0":0.0,"1":0.0,"2":0.0} |
402 ) | 401 master_trip_dict[primetype][readlength][str(frame)] += readcount |
403 process_chunk = {readname: [[tran, pos, read_tags]]} | 402 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict |
404 prev_seq = read.seq | 403 if real_pos > (-600) and real_pos < (601): |
405 else: | 404 if readlength in master_offset_dict[primetype]: |
406 unmapped_reads += 1 | 405 if real_pos in master_offset_dict[primetype][readlength]: |
407 | 406 #print "real pos in offset dict" |
408 # Add this unmapped read to unmapped_dict so we can see what the most frequent unmapped read is. | 407 master_offset_dict[primetype][readlength][real_pos] += readcount |
409 seq = read.seq | 408 else: |
410 readlen = len(seq) | 409 #print "real pos not in offset dict" |
411 if seq in unmapped_dict: | 410 master_offset_dict[primetype][readlength][real_pos] = readcount |
412 unmapped_dict[seq] += 1 | 411 else: |
413 else: | 412 #initiliase with zero to avoid missing neighbours below |
414 unmapped_dict[seq] = 1 | 413 #print "initialising with zeros" |
415 | 414 master_offset_dict[primetype][readlength]= {} |
416 # Populate the nuc_count_dict with this unmapped read | 415 for i in range(-600,601): |
417 if readlen not in nuc_count_dict["unmapped"]: | 416 master_offset_dict[primetype][readlength][i] = 0 |
418 nuc_count_dict["unmapped"][readlen] = {} | 417 master_offset_dict[primetype][readlength][real_pos] += readcount |
419 for i in range(0, len(seq)): | 418 |
420 if i not in nuc_count_dict["unmapped"][readlen]: | 419 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict |
421 nuc_count_dict["unmapped"][readlen][i] = { | 420 if rel_stop_pos > (-600) and rel_stop_pos < (601): |
422 "A": 0, | 421 if readlength in master_metagene_stop_dict[primetype]: |
423 "T": 0, | 422 if rel_stop_pos in master_metagene_stop_dict[primetype][readlength]: |
424 "G": 0, | 423 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount |
425 "C": 0, | 424 else: |
426 "N": 0, | 425 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] = readcount |
427 } | 426 else: |
428 nuc_count_dict["unmapped"][readlen][i][seq[i]] += 1 | 427 #initiliase with zero to avoid missing neighbours below |
429 | 428 master_metagene_stop_dict[primetype][readlength] = {} |
430 if readlen not in threeprime_nuc_count_dict["unmapped"]: | 429 for i in range(-600,601): |
431 threeprime_nuc_count_dict["unmapped"][readlen] = {} | 430 master_metagene_stop_dict[primetype][readlength][i] = 0 |
432 | 431 master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount |
433 for i in range(len(seq), 0, -1): | 432 |
434 dist = i - len(seq) | 433 # master trip dict is now made up of readlengths with 3 frames and a count associated with each frame |
435 if dist not in threeprime_nuc_count_dict["unmapped"][readlen]: | 434 # create a 'score' for each readlength by putting the max frame count over the second highest frame count |
436 threeprime_nuc_count_dict["unmapped"][readlen][dist] = { | 435 for primetype in ["fiveprime", "threeprime"]: |
437 "A": 0, | 436 for subreadlength in master_trip_dict[primetype]: |
438 "T": 0, | 437 maxcount = 0 |
439 "G": 0, | 438 secondmaxcount = 0 |
440 "C": 0, | 439 for frame in master_trip_dict[primetype][subreadlength]: |
441 "N": 0, | 440 if master_trip_dict[primetype][subreadlength][frame] > maxcount: |
442 } | 441 maxcount = master_trip_dict[primetype][subreadlength][frame] |
443 threeprime_nuc_count_dict["unmapped"][readlen][dist][seq[dist]] += 1 | 442 for frame in master_trip_dict[primetype][subreadlength]: |
444 | 443 if master_trip_dict[primetype][subreadlength][frame] > secondmaxcount and master_trip_dict[primetype][subreadlength][frame] != maxcount: |
445 # add stats about mapped/unmapped reads to file dict which will be used for the final report | 444 secondmaxcount = master_trip_dict[primetype][subreadlength][frame] |
446 master_dict["total_bam_lines"] = total_bam_lines | 445 # a perfect score would be 0 meaning there is only a single peak, the worst score would be 1 meaning two highest peaks are the same height |
447 master_dict["mapped_reads"] = mapped_reads | 446 master_trip_dict[primetype][subreadlength]["score"] = float(secondmaxcount)/float(maxcount) |
448 master_dict["unmapped_reads"] = unmapped_reads | 447 #This part is to determine what offsets to give each read length |
449 master_read_dict["unmapped_reads"] = unmapped_reads | 448 print ("Calculating offsets") |
450 master_dict["ambiguously_mapped_reads"] = ambiguously_mapped_reads | 449 for primetype in ["fiveprime", "threeprime"]: |
451 | 450 for readlen in master_offset_dict[primetype]: |
452 if "read_name" in master_read_dict: | 451 accepted_len = False |
453 del master_read_dict["read_name"] | 452 max_relative_pos = 0 |
454 print("BAM file processed") | 453 max_relative_count = 0 |
455 print("Creating metagenes, triplet periodicity plots, etc.") | 454 for relative_pos in master_offset_dict[primetype][readlen]: |
456 for tran in master_read_dict: | 455 # This line is to ensure we don't choose an offset greater than the readlength (in cases of a large peak far up/downstream) |
457 try: | 456 if abs(relative_pos) < 10 or abs(relative_pos) > (readlen-10): |
458 cds_start = transcriptome_info_dict[tran]["cds_start"] | 457 continue |
459 cds_stop = transcriptome_info_dict[tran]["cds_stop"] | 458 if master_offset_dict[primetype][readlen][relative_pos] > max_relative_count: |
460 except: | 459 max_relative_pos = relative_pos |
461 continue | 460 max_relative_count = master_offset_dict[primetype][readlen][relative_pos] |
462 | 461 #print "for readlen {} the max_relative pos is {}".format(readlen, max_relative_pos) |
463 tranlen = transcriptome_info_dict[tran]["length"] | 462 if primetype == "fiveprime": |
464 # Use this to discard transcripts with no 5' leader or 3' trailer | 463 # -3 to get from p-site to a-site, +1 to account for 1 based co-ordinates, resulting in -2 overall |
465 if ( | 464 final_offsets[primetype]["offsets"][readlen] = abs(max_relative_pos-2) |
466 cds_start > 1 | 465 elif primetype == "threeprime": |
467 and cds_stop < tranlen | 466 # +3 to get from p-site to a-site, -1 to account for 1 based co-ordinates, resulting in +2 overall |
468 and transcriptome_info_dict[tran]["tran_type"] == 1 | 467 final_offsets[primetype]["offsets"][readlen] = (max_relative_pos*(-1))+2 |
469 ): | 468 #If there are no reads in CDS regions for a specific length, it may not be present in master_trip_dict |
470 for primetype in ["fiveprime", "threeprime"]: | 469 if readlen in master_trip_dict[primetype]: |
471 # Create the triplet periodicity and metainfo plots based on both the 5' and 3' ends of reads | 470 final_offsets[primetype]["read_scores"][readlen] = master_trip_dict[primetype][readlen]["score"] |
472 for readlength in master_read_dict[tran]["unambig"]: | 471 else: |
473 # print "readlength", readlength | 472 final_offsets[primetype]["read_scores"][readlen] = 0.0 |
474 # for each fiveprime postion for this readlength within this transcript | 473 |
475 for raw_pos in master_read_dict[tran]["unambig"][readlength]: | 474 |
476 # print "raw pos", raw_pos | 475 master_read_dict["unmapped_reads"] = unmapped_reads |
477 trip_periodicity_reads += 1 | 476 master_read_dict["offsets"] = final_offsets |
478 if primetype == "fiveprime": | 477 master_read_dict["trip_periodicity"] = master_trip_dict |
479 # get the five prime postion minus the cds start postion | 478 master_read_dict["desc"] = "Null" |
480 real_pos = raw_pos - cds_start | 479 master_read_dict["mapped_reads"] = mapped_reads |
481 rel_stop_pos = raw_pos - cds_stop | 480 master_read_dict["nuc_counts"] = nuc_count_dict |
482 elif primetype == "threeprime": | 481 master_read_dict["dinuc_counts"] = dinuc_count_dict |
483 real_pos = (raw_pos + readlength) - cds_start | 482 master_read_dict["threeprime_nuc_counts"] = threeprime_nuc_count_dict |
484 rel_stop_pos = (raw_pos + readlength) - cds_stop | 483 master_read_dict["metagene_counts"] = master_offset_dict |
485 # get the readcount at the raw postion | 484 master_read_dict["stop_metagene_counts"] = master_metagene_stop_dict |
486 readcount = master_read_dict[tran]["unambig"][readlength][ | 485 master_read_dict["read_lengths"] = read_length_dict |
487 raw_pos | 486 master_read_dict["unambig_read_lengths"] = unambig_read_length_dict |
488 ] | 487 master_read_dict["coding_counts"] = master_dict["unambiguous_coding_count"] |
489 # print "readcount", readcount | 488 master_read_dict["noncoding_counts"] = master_dict["unambiguous_non_coding_count"] |
490 frame = real_pos % 3 | 489 master_read_dict["ambiguous_counts"] = master_dict["ambiguously_mapped_reads"] |
491 if real_pos >= cds_start and real_pos <= cds_stop: | 490 master_read_dict["frequent_unmapped_reads"] = (sorted(unmapped_dict.items(), key=operator.itemgetter(1)))[-2000:] |
492 if readlength in master_trip_dict[primetype]: | 491 master_read_dict["cutadapt_removed"] = 0 |
493 master_trip_dict[primetype][readlength][ | 492 master_read_dict["rrna_removed"] = 0 |
494 str(frame) | 493 #If no reads are removed by minus m there won't be an entry in the log file, so initiliase with 0 first and change if there is a line |
495 ] += readcount | 494 master_read_dict["removed_minus_m"] = 0 |
496 else: | 495 master_dict["removed_minus_m"] = 0 |
497 master_trip_dict[primetype][readlength] = { | 496 # We work out the total counts for 5', cds 3' for differential translation here, would be better to do thisn in processor but need the offsets |
498 "0": 0.0, | 497 master_read_dict["unambiguous_all_totals"] = {} |
499 "1": 0.0, | 498 master_read_dict["unambiguous_fiveprime_totals"] = {} |
500 "2": 0.0, | 499 master_read_dict["unambiguous_cds_totals"] = {} |
501 } | 500 master_read_dict["unambiguous_threeprime_totals"] = {} |
502 master_trip_dict[primetype][readlength][ | 501 |
503 str(frame) | 502 master_read_dict["ambiguous_all_totals"] = {} |
504 ] += readcount | 503 master_read_dict["ambiguous_fiveprime_totals"] = {} |
505 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict | 504 master_read_dict["ambiguous_cds_totals"] = {} |
506 if real_pos > (-600) and real_pos < (601): | 505 master_read_dict["ambiguous_threeprime_totals"] = {} |
507 if readlength in master_offset_dict[primetype]: | 506 print ("calculating transcript counts") |
508 if ( | 507 for tran in master_read_dict: |
509 real_pos | 508 if tran in transcriptome_info_dict: |
510 in master_offset_dict[primetype][readlength] | 509 five_total = 0 |
511 ): | 510 cds_total = 0 |
512 # print "real pos in offset dict" | 511 three_total = 0 |
513 master_offset_dict[primetype][readlength][ | 512 |
514 real_pos | 513 ambig_five_total = 0 |
515 ] += readcount | 514 ambig_cds_total = 0 |
516 else: | 515 ambig_three_total = 0 |
517 # print "real pos not in offset dict" | 516 |
518 master_offset_dict[primetype][readlength][ | 517 cds_start = transcriptome_info_dict[tran]["cds_start"] |
519 real_pos | 518 cds_stop = transcriptome_info_dict[tran]["cds_stop"] |
520 ] = readcount | 519 |
521 else: | 520 for readlen in master_read_dict[tran]["unambig"]: |
522 # initiliase with zero to avoid missing neighbours below | 521 if readlen in final_offsets["fiveprime"]["offsets"]: |
523 # print "initialising with zeros" | 522 offset = final_offsets["fiveprime"]["offsets"][readlen] |
524 master_offset_dict[primetype][readlength] = {} | 523 else: |
525 for i in range(-600, 601): | 524 offset = 15 |
526 master_offset_dict[primetype][readlength][i] = 0 | 525 for pos in master_read_dict[tran]["unambig"][readlen]: |
527 master_offset_dict[primetype][readlength][ | 526 real_pos = pos+offset |
528 real_pos | 527 if cds_start is None or cds_stop is None: |
529 ] += readcount | 528 three_total += master_read_dict[tran]["unambig"][readlen][pos] |
530 | 529 else: |
531 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict | 530 if real_pos <cds_start: |
532 if rel_stop_pos > (-600) and rel_stop_pos < (601): | 531 five_total += master_read_dict[tran]["unambig"][readlen][pos] |
533 if readlength in master_metagene_stop_dict[primetype]: | 532 elif real_pos >=cds_start and real_pos <= cds_stop: |
534 if ( | 533 cds_total += master_read_dict[tran]["unambig"][readlen][pos] |
535 rel_stop_pos | 534 elif real_pos > cds_stop: |
536 in master_metagene_stop_dict[primetype][readlength] | 535 three_total += master_read_dict[tran]["unambig"][readlen][pos] |
537 ): | 536 master_read_dict["unambiguous_all_totals"][tran] = five_total+cds_total+three_total |
538 master_metagene_stop_dict[primetype][readlength][ | 537 master_read_dict["unambiguous_fiveprime_totals"][tran] = five_total |
539 rel_stop_pos | 538 master_read_dict["unambiguous_cds_totals"][tran] = cds_total |
540 ] += readcount | 539 master_read_dict["unambiguous_threeprime_totals"][tran] = three_total |
541 else: | 540 |
542 master_metagene_stop_dict[primetype][readlength][ | 541 for readlen in master_read_dict[tran]["ambig"]: |
543 rel_stop_pos | 542 if readlen in final_offsets["fiveprime"]["offsets"]: |
544 ] = readcount | 543 offset = final_offsets["fiveprime"]["offsets"][readlen] |
545 else: | 544 else: |
546 # initiliase with zero to avoid missing neighbours below | 545 offset = 15 |
547 master_metagene_stop_dict[primetype][readlength] = {} | 546 for pos in master_read_dict[tran]["ambig"][readlen]: |
548 for i in range(-600, 601): | 547 if cds_start is None or cds_stop is None: |
549 master_metagene_stop_dict[primetype][readlength][ | 548 ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos] |
550 i | 549 else: |
551 ] = 0 | 550 real_pos = pos+offset |
552 master_metagene_stop_dict[primetype][readlength][ | 551 if real_pos < cds_start: |
553 rel_stop_pos | 552 ambig_five_total += master_read_dict[tran]["ambig"][readlen][pos] |
554 ] += readcount | 553 elif real_pos >=cds_start and real_pos <= cds_stop: |
555 | 554 ambig_cds_total += master_read_dict[tran]["ambig"][readlen][pos] |
556 # master trip dict is now made up of readlengths with 3 frames and a count associated with each frame | 555 elif real_pos > cds_stop: |
557 # create a 'score' for each readlength by putting the max frame count over the second highest frame count | 556 ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos] |
558 for primetype in ["fiveprime", "threeprime"]: | 557 |
559 for subreadlength in master_trip_dict[primetype]: | 558 master_read_dict["ambiguous_all_totals"][tran] = five_total+cds_total+three_total+ambig_five_total+ambig_cds_total+ambig_three_total |
560 maxcount = 0 | 559 master_read_dict["ambiguous_fiveprime_totals"][tran] = five_total+ambig_five_total |
561 secondmaxcount = 0 | 560 master_read_dict["ambiguous_cds_totals"][tran] = cds_total+ambig_cds_total |
562 for frame in master_trip_dict[primetype][subreadlength]: | 561 master_read_dict["ambiguous_threeprime_totals"][tran] = three_total+ambig_three_total |
563 if master_trip_dict[primetype][subreadlength][frame] > maxcount: | 562 |
564 maxcount = master_trip_dict[primetype][subreadlength][frame] | 563 print ("Writing out to sqlite file") |
565 for frame in master_trip_dict[primetype][subreadlength]: | 564 sqlite_db = SqliteDict(outputfile,autocommit=False) |
566 if ( | 565 for key in master_read_dict: |
567 master_trip_dict[primetype][subreadlength][frame] > secondmaxcount | 566 sqlite_db[key] = master_read_dict[key] |
568 and master_trip_dict[primetype][subreadlength][frame] != maxcount | 567 sqlite_db["description"] = desc |
569 ): | 568 sqlite_db.commit() |
570 secondmaxcount = master_trip_dict[primetype][subreadlength][frame] | 569 sqlite_db.close() |
571 # a perfect score would be 0 meaning there is only a single peak, the worst score would be 1 meaning two highest peaks are the same height | |
572 master_trip_dict[primetype][subreadlength]["score"] = float( | |
573 secondmaxcount | |
574 ) / float(maxcount) | |
575 # This part is to determine what offsets to give each read length | |
576 print("Calculating offsets") | |
577 for primetype in ["fiveprime", "threeprime"]: | |
578 for readlen in master_offset_dict[primetype]: | |
579 accepted_len = False | |
580 max_relative_pos = 0 | |
581 max_relative_count = 0 | |
582 for relative_pos in master_offset_dict[primetype][readlen]: | |
583 # This line is to ensure we don't choose an offset greater than the readlength (in cases of a large peak far up/downstream) | |
584 if abs(relative_pos) < 10 or abs(relative_pos) > (readlen - 10): | |
585 continue | |
586 if ( | |
587 master_offset_dict[primetype][readlen][relative_pos] | |
588 > max_relative_count | |
589 ): | |
590 max_relative_pos = relative_pos | |
591 max_relative_count = master_offset_dict[primetype][readlen][ | |
592 relative_pos | |
593 ] | |
594 # print "for readlen {} the max_relative pos is {}".format(readlen, max_relative_pos) | |
595 if primetype == "fiveprime": | |
596 # -3 to get from p-site to a-site, +1 to account for 1 based co-ordinates, resulting in -2 overall | |
597 final_offsets[primetype]["offsets"][readlen] = abs(max_relative_pos - 2) | |
598 elif primetype == "threeprime": | |
599 # +3 to get from p-site to a-site, -1 to account for 1 based co-ordinates, resulting in +2 overall | |
600 final_offsets[primetype]["offsets"][readlen] = ( | |
601 max_relative_pos * (-1) | |
602 ) + 2 | |
603 # If there are no reads in CDS regions for a specific length, it may not be present in master_trip_dict | |
604 if readlen in master_trip_dict[primetype]: | |
605 final_offsets[primetype]["read_scores"][readlen] = master_trip_dict[ | |
606 primetype | |
607 ][readlen]["score"] | |
608 else: | |
609 final_offsets[primetype]["read_scores"][readlen] = 0.0 | |
610 master_read_dict["offsets"] = final_offsets | |
611 master_read_dict["trip_periodicity"] = master_trip_dict | |
612 master_read_dict["desc"] = "Null" | |
613 master_read_dict["mapped_reads"] = mapped_reads | |
614 master_read_dict["nuc_counts"] = nuc_count_dict | |
615 master_read_dict["dinuc_counts"] = dinuc_count_dict | |
616 master_read_dict["threeprime_nuc_counts"] = threeprime_nuc_count_dict | |
617 master_read_dict["metagene_counts"] = master_offset_dict | |
618 master_read_dict["stop_metagene_counts"] = master_metagene_stop_dict | |
619 master_read_dict["read_lengths"] = read_length_dict | |
620 master_read_dict["unambig_read_lengths"] = unambig_read_length_dict | |
621 master_read_dict["coding_counts"] = master_dict["unambiguous_coding_count"] | |
622 master_read_dict["noncoding_counts"] = master_dict["unambiguous_non_coding_count"] | |
623 master_read_dict["ambiguous_counts"] = master_dict["ambiguously_mapped_reads"] | |
624 master_read_dict["frequent_unmapped_reads"] = ( | |
625 sorted(unmapped_dict.items(), key=operator.itemgetter(1)) | |
626 )[-2000:] | |
627 master_read_dict["cutadapt_removed"] = 0 | |
628 master_read_dict["rrna_removed"] = 0 | |
629 # If no reads are removed by minus m there won't be an entry in the log file, so initiliase with 0 first and change if there is a line | |
630 master_read_dict["removed_minus_m"] = 0 | |
631 master_dict["removed_minus_m"] = 0 | |
632 # We work out the total counts for 5', cds 3' for differential translation here, would be better to do thisn in processor but need the offsets | |
633 master_read_dict["unambiguous_all_totals"] = {} | |
634 master_read_dict["unambiguous_fiveprime_totals"] = {} | |
635 master_read_dict["unambiguous_cds_totals"] = {} | |
636 master_read_dict["unambiguous_threeprime_totals"] = {} | |
637 | |
638 master_read_dict["ambiguous_all_totals"] = {} | |
639 master_read_dict["ambiguous_fiveprime_totals"] = {} | |
640 master_read_dict["ambiguous_cds_totals"] = {} | |
641 master_read_dict["ambiguous_threeprime_totals"] = {} | |
642 print("calculating transcript counts") | |
643 for tran in master_read_dict: | |
644 if tran in transcriptome_info_dict: | |
645 five_total = 0 | |
646 cds_total = 0 | |
647 three_total = 0 | |
648 | |
649 ambig_five_total = 0 | |
650 ambig_cds_total = 0 | |
651 ambig_three_total = 0 | |
652 | |
653 cds_start = transcriptome_info_dict[tran]["cds_start"] | |
654 cds_stop = transcriptome_info_dict[tran]["cds_stop"] | |
655 for readlen in master_read_dict[tran]["unambig"]: | |
656 if readlen in final_offsets["fiveprime"]["offsets"]: | |
657 offset = final_offsets["fiveprime"]["offsets"][readlen] | |
658 else: | |
659 offset = 15 | |
660 for pos in master_read_dict[tran]["unambig"][readlen]: | |
661 real_pos = pos + offset | |
662 if real_pos < cds_start: | |
663 five_total += master_read_dict[tran]["unambig"][readlen][pos] | |
664 elif real_pos >= cds_start and real_pos <= cds_stop: | |
665 cds_total += master_read_dict[tran]["unambig"][readlen][pos] | |
666 elif real_pos > cds_stop: | |
667 three_total += master_read_dict[tran]["unambig"][readlen][pos] | |
668 master_read_dict["unambiguous_all_totals"][tran] = ( | |
669 five_total + cds_total + three_total | |
670 ) | |
671 master_read_dict["unambiguous_fiveprime_totals"][tran] = five_total | |
672 master_read_dict["unambiguous_cds_totals"][tran] = cds_total | |
673 master_read_dict["unambiguous_threeprime_totals"][tran] = three_total | |
674 | |
675 for readlen in master_read_dict[tran]["ambig"]: | |
676 if readlen in final_offsets["fiveprime"]["offsets"]: | |
677 offset = final_offsets["fiveprime"]["offsets"][readlen] | |
678 else: | |
679 offset = 15 | |
680 for pos in master_read_dict[tran]["ambig"][readlen]: | |
681 real_pos = pos + offset | |
682 if real_pos < cds_start: | |
683 ambig_five_total += master_read_dict[tran]["ambig"][readlen][ | |
684 pos | |
685 ] | |
686 elif real_pos >= cds_start and real_pos <= cds_stop: | |
687 ambig_cds_total += master_read_dict[tran]["ambig"][readlen][pos] | |
688 elif real_pos > cds_stop: | |
689 ambig_three_total += master_read_dict[tran]["ambig"][readlen][ | |
690 pos | |
691 ] | |
692 | |
693 master_read_dict["ambiguous_all_totals"][tran] = ( | |
694 five_total | |
695 + cds_total | |
696 + three_total | |
697 + ambig_five_total | |
698 + ambig_cds_total | |
699 + ambig_three_total | |
700 ) | |
701 master_read_dict["ambiguous_fiveprime_totals"][tran] = ( | |
702 five_total + ambig_five_total | |
703 ) | |
704 master_read_dict["ambiguous_cds_totals"][tran] = cds_total + ambig_cds_total | |
705 master_read_dict["ambiguous_threeprime_totals"][tran] = ( | |
706 three_total + ambig_three_total | |
707 ) | |
708 print("Writing out to sqlite file") | |
709 sqlite_db = SqliteDict(outputfile, autocommit=False) | |
710 for key in master_read_dict: | |
711 sqlite_db[key] = master_read_dict[key] | |
712 sqlite_db["description"] = desc | |
713 sqlite_db.commit() | |
714 sqlite_db.close() | |
715 | 570 |
716 | 571 |
717 if __name__ == "__main__": | 572 if __name__ == "__main__": |
718 if len(sys.argv) <= 2: | 573 if len(sys.argv) <= 2: |
719 print( | 574 print ("Usage: python bam_to_sqlite.py <path_to_bam_file> <path_to_organism.sqlite> <file_description (optional)>") |
720 "Usage: python bam_to_sqlite.py <path_to_bam_file> <path_to_organism.sqlite> <file_description (optional)>" | 575 sys.exit() |
721 ) | 576 bam_filepath = sys.argv[1] |
722 sys.exit() | 577 annotation_sqlite_filepath = sys.argv[2] |
723 bam_filepath = sys.argv[1] | 578 #try: |
724 annotation_sqlite_filepath = sys.argv[2] | 579 # desc = sys.argv[3] |
725 try: | 580 #except: |
726 desc = sys.argv[3] | 581 # desc = bam_filepath.split("/")[-1] |
727 except: | 582 outputfile = bam_filepath+"v2.sqlite" |
728 desc = bam_filepath.split("/")[-1] | 583 process_bam(bam_filepath,annotation_sqlite_filepath,outputfile) |
729 | |
730 outputfile = sys.argv[4] | |
731 process_bam(bam_filepath, annotation_sqlite_filepath, outputfile, desc) |