annotate trips_bam_to_sqlite/bam_to_sqlite.py @ 2:c8d8675697c6 draft

Uploaded
author jackcurragh
date Wed, 20 Apr 2022 15:18:00 +0000
parents 3ac12b611d7f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
1 import sys
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
2 import pysam
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
3 import operator
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
4 import os
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
5 import time
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
6 import sqlite3
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
7 from sqlitedict import SqliteDict
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
8
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
9
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
10 def tran_to_genome(tran, pos, transcriptome_info_dict):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
11 # print ("tran",list(transcriptome_info_dict))
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
12 traninfo = transcriptome_info_dict[tran]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
13 chrom = traninfo["chrom"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
14 strand = traninfo["strand"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
15 exons = sorted(traninfo["exons"])
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
16 # print exons
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
17 if strand == "+":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
18 exon_start = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
19 for tup in exons:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
20 exon_start = tup[0]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
21 exonlen = tup[1] - tup[0]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
22 if pos > exonlen:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
23 pos = (pos - exonlen) - 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
24 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
25 break
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
26 genomic_pos = (exon_start + pos) - 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
27 elif strand == "-":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
28 exon_start = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
29 for tup in exons[::-1]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
30 exon_start = tup[1]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
31 exonlen = tup[1] - tup[0]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
32 if pos > exonlen:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
33 pos = (pos - exonlen) - 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
34 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
35 break
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
36 genomic_pos = (exon_start - pos) + 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
37 return (chrom, genomic_pos)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
38
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
39
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
40 # Takes a dictionary with a readname as key and a list of lists as value, each sub list has consists of two elements a transcript and the position the read aligns to in the transcript
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
41 # This function will count the number of genes that the transcripts correspond to and if less than or equal to 3 will add the relevant value to transcript_counts_dict
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
42 def processor(
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
43 process_chunk,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
44 master_read_dict,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
45 transcriptome_info_dict,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
46 master_dict,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
47 readseq,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
48 unambig_read_length_dict,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
49 ):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
50 readlen = len(readseq)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
51 ambiguously_mapped_reads = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
52 # get the read name
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
53 read = list(process_chunk)[0]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
54
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
55 read_list = process_chunk[
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
56 read
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
57 ] # a list of lists of all transcripts the read aligns to and the positions
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
58 # used to store different genomic poistions
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
59 genomic_positions = []
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
60
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
61 # This section is just to get the different genomic positions the read aligns to
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
62
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
63 for listname in process_chunk[read]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
64
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
65 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "")
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
66
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
67 pos = int(listname[1])
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
68 genomic_pos = tran_to_genome(tran, pos, transcriptome_info_dict)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
69 # print ("genomic pos",genomic_pos)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
70 if genomic_pos not in genomic_positions:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
71 genomic_positions.append(genomic_pos)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
72
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
73 # If the read maps unambiguously
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
74 if len(genomic_positions) == 1:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
75 if readlen not in unambig_read_length_dict:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
76 unambig_read_length_dict[readlen] = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
77 unambig_read_length_dict[readlen] += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
78 # assume this read aligns to a noncoding position, if we find that it does align to a coding region change this to True
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
79 coding = False
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
80
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
81 # For each transcript this read alings to
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
82 for listname in process_chunk[read]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
83 # get the transcript name
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
84 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "")
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
85 # If we haven't come across this transcript already then add to master_read_dict
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
86 if tran not in master_read_dict:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
87 master_read_dict[tran] = {
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
88 "ambig": {},
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
89 "unambig": {},
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
90 "mismatches": {},
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
91 "seq": {},
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
92 }
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
93 # get the raw unedited positon, and read tags
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
94 pos = int(listname[1])
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
95 read_tags = listname[2]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
96 # If there is mismatches in this line, then modify the postion and readlen (if mismatches at start or end) and add mismatches to dictionary
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
97 nm_tag = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
98
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
99 for tag in read_tags:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
100 if tag[0] == "NM":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
101 nm_tag = int(tag[1])
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
102 if nm_tag > 0:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
103 md_tag = ""
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
104 for tag in read_tags:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
105 if tag[0] == "MD":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
106 md_tag = tag[1]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
107 pos_modifier, readlen_modifier, mismatches = get_mismatch_pos(
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
108 md_tag, pos, readlen, master_read_dict, tran, readseq
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
109 )
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
110 # Count the mismatches (we only do this for unambiguous)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
111 for mismatch in mismatches:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
112 # Ignore mismatches appearing in the first position (due to non templated addition)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
113 if mismatch != 0:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
114 char = mismatches[mismatch]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
115 mismatch_pos = pos + mismatch
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
116 if mismatch_pos not in master_read_dict[tran]["seq"]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
117 master_read_dict[tran]["seq"][mismatch_pos] = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
118 if char not in master_read_dict[tran]["seq"][mismatch_pos]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
119 master_read_dict[tran]["seq"][mismatch_pos][char] = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
120 master_read_dict[tran]["seq"][mismatch_pos][char] += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
121 # apply the modifiers
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
122 # pos = pos+pos_modifier
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
123 # readlen = readlen - readlen_modifier
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
124
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
125 try:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
126 cds_start = transcriptome_info_dict[tran]["cds_start"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
127 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
128
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
129 if pos >= cds_start and pos <= cds_stop:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
130 coding = True
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
131 except:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
132 pass
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
133
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
134 if readlen in master_read_dict[tran]["unambig"]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
135 if pos in master_read_dict[tran]["unambig"][readlen]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
136 master_read_dict[tran]["unambig"][readlen][pos] += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
137 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
138 master_read_dict[tran]["unambig"][readlen][pos] = 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
139 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
140 master_read_dict[tran]["unambig"][readlen] = {pos: 1}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
141
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
142 if coding == True:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
143 master_dict["unambiguous_coding_count"] += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
144 elif coding == False:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
145 master_dict["unambiguous_non_coding_count"] += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
146
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
147 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
148 ambiguously_mapped_reads += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
149 for listname in process_chunk[read]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
150 tran = listname[0].replace("-", "_").replace("(", "").replace(")", "")
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
151 if tran not in master_read_dict:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
152 master_read_dict[tran] = {
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
153 "ambig": {},
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
154 "unambig": {},
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
155 "mismatches": {},
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
156 "seq": {},
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
157 }
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
158 pos = int(listname[1])
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
159 read_tags = listname[2]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
160 nm_tag = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
161 for tag in read_tags:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
162 if tag[0] == "NM":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
163 nm_tag = int(tag[1])
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
164 if nm_tag > 0:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
165 md_tag = ""
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
166 for tag in read_tags:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
167 if tag[0] == "MD":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
168 md_tag = tag[1]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
169 pos_modifier, readlen_modifier, mismatches = get_mismatch_pos(
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
170 md_tag, pos, readlen, master_read_dict, tran, readseq
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
171 )
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
172 # apply the modifiers
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
173 # pos = pos+pos_modifier
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
174 # readlen = readlen - readlen_modifier
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
175 if readlen in master_read_dict[tran]["ambig"]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
176 if pos in master_read_dict[tran]["ambig"][readlen]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
177 master_read_dict[tran]["ambig"][readlen][pos] += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
178 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
179 master_read_dict[tran]["ambig"][readlen][pos] = 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
180 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
181 master_read_dict[tran]["ambig"][readlen] = {pos: 1}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
182 return ambiguously_mapped_reads
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
183
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
184
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
185 def get_mismatch_pos(md_tag, pos, readlen, master_read_dict, tran, readseq):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
186 nucs = ["A", "T", "G", "C"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
187 mismatches = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
188 total_so_far = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
189 prev_char = ""
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
190 for char in md_tag:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
191 if char in nucs:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
192 if prev_char != "":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
193 total_so_far += int(prev_char)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
194 prev_char = ""
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
195 mismatches[total_so_far + len(mismatches)] = readseq[
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
196 total_so_far + len(mismatches)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
197 ]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
198 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
199 if char != "^" and char != "N":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
200 if prev_char == "":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
201 prev_char = char
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
202 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
203 total_so_far += int(prev_char + char)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
204 prev_char = ""
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
205 readlen_modifier = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
206 pos_modifier = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
207 five_ok = False
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
208 three_ok = False
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
209 while five_ok == False:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
210 for i in range(0, readlen):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
211 if i in mismatches:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
212 pos_modifier += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
213 readlen_modifier += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
214 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
215 five_ok = True
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
216 break
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
217 five_ok = True
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
218
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
219 while three_ok == False:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
220 for i in range(readlen - 1, 0, -1):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
221 if i in mismatches:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
222 readlen_modifier += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
223 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
224 three_ok = True
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
225 break
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
226 three_ok = True
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
227
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
228 return (pos_modifier, readlen_modifier, mismatches)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
229
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
230
2
c8d8675697c6 Uploaded
jackcurragh
parents: 1
diff changeset
231 def process_bam(bam_filepath, transcriptome_info_dict_path, outputfile, desc):
c8d8675697c6 Uploaded
jackcurragh
parents: 1
diff changeset
232 desc = desc
0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
233 start_time = time.time()
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
234 study_dict = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
235 nuc_count_dict = {"mapped": {}, "unmapped": {}}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
236 dinuc_count_dict = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
237 threeprime_nuc_count_dict = {"mapped": {}, "unmapped": {}}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
238 read_length_dict = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
239 unambig_read_length_dict = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
240 unmapped_dict = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
241 master_dict = {
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
242 "unambiguous_non_coding_count": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
243 "unambiguous_coding_count": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
244 "current_dir": os.getcwd(),
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
245 }
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
246
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
247 transcriptome_info_dict = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
248 connection = sqlite3.connect(transcriptome_info_dict_path)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
249 cursor = connection.cursor()
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
250 cursor.execute(
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
251 "SELECT transcript,cds_start,cds_stop,length,strand,chrom,tran_type from transcripts;"
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
252 )
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
253 result = cursor.fetchall()
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
254 for row in result:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
255 transcriptome_info_dict[str(row[0])] = {
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
256 "cds_start": row[1],
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
257 "cds_stop": row[2],
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
258 "length": row[3],
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
259 "strand": row[4],
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
260 "chrom": row[5],
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
261 "exons": [],
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
262 "tran_type": row[6],
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
263 }
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
264 # print list(transcriptome_info_dict)[:10]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
265
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
266 cursor.execute("SELECT * from exons;")
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
267 result = cursor.fetchall()
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
268 for row in result:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
269 transcriptome_info_dict[str(row[0])]["exons"].append((row[1], row[2]))
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
270
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
271 # it might be the case that there are no multimappers, so set this to 0 first to avoid an error, it will be overwritten later if there is multimappers
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
272 multimappers = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
273 unmapped_reads = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
274 unambiguous_coding_count = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
275 unambiguous_non_coding_count = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
276 trip_periodicity_reads = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
277
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
278 final_offsets = {
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
279 "fiveprime": {"offsets": {}, "read_scores": {}},
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
280 "threeprime": {"offsets": {}, "read_scores": {}},
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
281 }
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
282 master_read_dict = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
283 prev_seq = ""
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
284 process_chunk = {"read_name": [["placeholder_tran", "1", "28"]]}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
285 mapped_reads = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
286 ambiguously_mapped_reads = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
287 master_trip_dict = {"fiveprime": {}, "threeprime": {}}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
288 master_offset_dict = {"fiveprime": {}, "threeprime": {}}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
289 master_metagene_stop_dict = {"fiveprime": {}, "threeprime": {}}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
290
1
3ac12b611d7f Uploaded
jackcurragh
parents: 0
diff changeset
291 os.system(f'samtools sort -n {bam_filepath} -o {bam_filepath}_n_sorted.bam')
2
c8d8675697c6 Uploaded
jackcurragh
parents: 1
diff changeset
292 pysam.set_verbosity(0)
1
3ac12b611d7f Uploaded
jackcurragh
parents: 0
diff changeset
293 infile = pysam.Samfile(f"{bam_filepath}_n_sorted.bam", "rb")
0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
294 header = infile.header["HD"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
295 unsorted = False
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
296 if "SO" in header:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
297 if header["SO"] != "queryname":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
298 unsorted = True
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
299 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
300 unsorted = True
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
301 if unsorted == True:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
302 print(
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
303 "ERROR: Bam file appears to be unsorted or not sorted by read name. To sort by read name use the command: samtools sort -n input.bam output.bam"
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
304 )
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
305 print(header, bam_filepath)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
306 sys.exit()
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
307 total_bam_lines = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
308 all_ref_ids = infile.references
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
309
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
310 for read in infile.fetch(until_eof=True):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
311 total_bam_lines += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
312 if not read.is_unmapped:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
313 ref = read.reference_id
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
314 tran = (all_ref_ids[ref]).split(".")[0]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
315 mapped_reads += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
316 if mapped_reads % 1000000 == 0:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
317 print(
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
318 "{} reads parsed at {}".format(
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
319 mapped_reads, (time.time() - start_time)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
320 )
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
321 )
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
322 pos = read.reference_start
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
323 readname = read.query_name
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
324 read_tags = read.tags
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
325 if readname == list(process_chunk)[0]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
326 process_chunk[readname].append([tran, pos, read_tags])
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
327 # if the current read is different from previous reads send 'process_chunk' to the 'processor' function, then start 'process_chunk' over using current read
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
328 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
329 if list(process_chunk)[0] != "read_name":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
330
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
331 # At this point we work out readseq, we do this for multiple reasons, firstly so we don't count the sequence from a read multiple times, just because
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
332 # it aligns multiple times and secondly we only call read.seq once (read.seq is computationally expensive)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
333 seq = read.seq
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
334 readlen = len(seq)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
335
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
336 # Note if a read maps ambiguously it will still be counted toward the read length distribution (however it will only be counted once, not each time it maps)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
337 if readlen not in read_length_dict:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
338 read_length_dict[readlen] = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
339 read_length_dict[readlen] += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
340
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
341 if readlen not in nuc_count_dict["mapped"]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
342 nuc_count_dict["mapped"][readlen] = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
343 if readlen not in threeprime_nuc_count_dict["mapped"]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
344 threeprime_nuc_count_dict["mapped"][readlen] = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
345 if readlen not in dinuc_count_dict:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
346 dinuc_count_dict[readlen] = {
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
347 "AA": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
348 "TA": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
349 "GA": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
350 "CA": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
351 "AT": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
352 "TT": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
353 "GT": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
354 "CT": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
355 "AG": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
356 "TG": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
357 "GG": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
358 "CG": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
359 "AC": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
360 "TC": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
361 "GC": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
362 "CC": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
363 }
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
364
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
365 for i in range(0, len(seq)):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
366 if i not in nuc_count_dict["mapped"][readlen]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
367 nuc_count_dict["mapped"][readlen][i] = {
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
368 "A": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
369 "T": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
370 "G": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
371 "C": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
372 "N": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
373 }
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
374 nuc_count_dict["mapped"][readlen][i][seq[i]] += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
375
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
376 for i in range(0, len(seq)):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
377 try:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
378 dinuc_count_dict[readlen][seq[i : i + 2]] += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
379 except:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
380 pass
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
381
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
382 for i in range(len(seq), 0, -1):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
383 dist = i - len(seq)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
384 if dist not in threeprime_nuc_count_dict["mapped"][readlen]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
385 threeprime_nuc_count_dict["mapped"][readlen][dist] = {
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
386 "A": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
387 "T": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
388 "G": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
389 "C": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
390 "N": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
391 }
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
392 threeprime_nuc_count_dict["mapped"][readlen][dist][
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
393 seq[dist]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
394 ] += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
395 ambiguously_mapped_reads += processor(
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
396 process_chunk,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
397 master_read_dict,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
398 transcriptome_info_dict,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
399 master_dict,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
400 prev_seq,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
401 unambig_read_length_dict,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
402 )
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
403 process_chunk = {readname: [[tran, pos, read_tags]]}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
404 prev_seq = read.seq
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
405 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
406 unmapped_reads += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
407
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
408 # Add this unmapped read to unmapped_dict so we can see what the most frequent unmapped read is.
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
409 seq = read.seq
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
410 readlen = len(seq)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
411 if seq in unmapped_dict:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
412 unmapped_dict[seq] += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
413 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
414 unmapped_dict[seq] = 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
415
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
416 # Populate the nuc_count_dict with this unmapped read
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
417 if readlen not in nuc_count_dict["unmapped"]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
418 nuc_count_dict["unmapped"][readlen] = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
419 for i in range(0, len(seq)):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
420 if i not in nuc_count_dict["unmapped"][readlen]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
421 nuc_count_dict["unmapped"][readlen][i] = {
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
422 "A": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
423 "T": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
424 "G": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
425 "C": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
426 "N": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
427 }
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
428 nuc_count_dict["unmapped"][readlen][i][seq[i]] += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
429
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
430 if readlen not in threeprime_nuc_count_dict["unmapped"]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
431 threeprime_nuc_count_dict["unmapped"][readlen] = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
432
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
433 for i in range(len(seq), 0, -1):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
434 dist = i - len(seq)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
435 if dist not in threeprime_nuc_count_dict["unmapped"][readlen]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
436 threeprime_nuc_count_dict["unmapped"][readlen][dist] = {
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
437 "A": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
438 "T": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
439 "G": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
440 "C": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
441 "N": 0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
442 }
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
443 threeprime_nuc_count_dict["unmapped"][readlen][dist][seq[dist]] += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
444
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
445 # add stats about mapped/unmapped reads to file dict which will be used for the final report
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
446 master_dict["total_bam_lines"] = total_bam_lines
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
447 master_dict["mapped_reads"] = mapped_reads
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
448 master_dict["unmapped_reads"] = unmapped_reads
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
449 master_read_dict["unmapped_reads"] = unmapped_reads
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
450 master_dict["ambiguously_mapped_reads"] = ambiguously_mapped_reads
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
451
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
452 if "read_name" in master_read_dict:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
453 del master_read_dict["read_name"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
454 print("BAM file processed")
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
455 print("Creating metagenes, triplet periodicity plots, etc.")
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
456 for tran in master_read_dict:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
457 try:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
458 cds_start = transcriptome_info_dict[tran]["cds_start"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
459 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
460 except:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
461 continue
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
462
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
463 tranlen = transcriptome_info_dict[tran]["length"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
464 # Use this to discard transcripts with no 5' leader or 3' trailer
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
465 if (
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
466 cds_start > 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
467 and cds_stop < tranlen
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
468 and transcriptome_info_dict[tran]["tran_type"] == 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
469 ):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
470 for primetype in ["fiveprime", "threeprime"]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
471 # Create the triplet periodicity and metainfo plots based on both the 5' and 3' ends of reads
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
472 for readlength in master_read_dict[tran]["unambig"]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
473 # print "readlength", readlength
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
474 # for each fiveprime postion for this readlength within this transcript
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
475 for raw_pos in master_read_dict[tran]["unambig"][readlength]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
476 # print "raw pos", raw_pos
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
477 trip_periodicity_reads += 1
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
478 if primetype == "fiveprime":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
479 # get the five prime postion minus the cds start postion
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
480 real_pos = raw_pos - cds_start
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
481 rel_stop_pos = raw_pos - cds_stop
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
482 elif primetype == "threeprime":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
483 real_pos = (raw_pos + readlength) - cds_start
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
484 rel_stop_pos = (raw_pos + readlength) - cds_stop
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
485 # get the readcount at the raw postion
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
486 readcount = master_read_dict[tran]["unambig"][readlength][
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
487 raw_pos
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
488 ]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
489 # print "readcount", readcount
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
490 frame = real_pos % 3
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
491 if real_pos >= cds_start and real_pos <= cds_stop:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
492 if readlength in master_trip_dict[primetype]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
493 master_trip_dict[primetype][readlength][
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
494 str(frame)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
495 ] += readcount
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
496 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
497 master_trip_dict[primetype][readlength] = {
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
498 "0": 0.0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
499 "1": 0.0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
500 "2": 0.0,
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
501 }
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
502 master_trip_dict[primetype][readlength][
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
503 str(frame)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
504 ] += readcount
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
505 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
506 if real_pos > (-600) and real_pos < (601):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
507 if readlength in master_offset_dict[primetype]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
508 if (
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
509 real_pos
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
510 in master_offset_dict[primetype][readlength]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
511 ):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
512 # print "real pos in offset dict"
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
513 master_offset_dict[primetype][readlength][
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
514 real_pos
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
515 ] += readcount
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
516 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
517 # print "real pos not in offset dict"
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
518 master_offset_dict[primetype][readlength][
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
519 real_pos
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
520 ] = readcount
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
521 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
522 # initiliase with zero to avoid missing neighbours below
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
523 # print "initialising with zeros"
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
524 master_offset_dict[primetype][readlength] = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
525 for i in range(-600, 601):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
526 master_offset_dict[primetype][readlength][i] = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
527 master_offset_dict[primetype][readlength][
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
528 real_pos
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
529 ] += readcount
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
530
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
531 # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
532 if rel_stop_pos > (-600) and rel_stop_pos < (601):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
533 if readlength in master_metagene_stop_dict[primetype]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
534 if (
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
535 rel_stop_pos
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
536 in master_metagene_stop_dict[primetype][readlength]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
537 ):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
538 master_metagene_stop_dict[primetype][readlength][
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
539 rel_stop_pos
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
540 ] += readcount
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
541 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
542 master_metagene_stop_dict[primetype][readlength][
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
543 rel_stop_pos
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
544 ] = readcount
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
545 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
546 # initiliase with zero to avoid missing neighbours below
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
547 master_metagene_stop_dict[primetype][readlength] = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
548 for i in range(-600, 601):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
549 master_metagene_stop_dict[primetype][readlength][
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
550 i
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
551 ] = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
552 master_metagene_stop_dict[primetype][readlength][
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
553 rel_stop_pos
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
554 ] += readcount
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
555
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
556 # master trip dict is now made up of readlengths with 3 frames and a count associated with each frame
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
557 # create a 'score' for each readlength by putting the max frame count over the second highest frame count
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
558 for primetype in ["fiveprime", "threeprime"]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
559 for subreadlength in master_trip_dict[primetype]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
560 maxcount = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
561 secondmaxcount = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
562 for frame in master_trip_dict[primetype][subreadlength]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
563 if master_trip_dict[primetype][subreadlength][frame] > maxcount:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
564 maxcount = master_trip_dict[primetype][subreadlength][frame]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
565 for frame in master_trip_dict[primetype][subreadlength]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
566 if (
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
567 master_trip_dict[primetype][subreadlength][frame] > secondmaxcount
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
568 and master_trip_dict[primetype][subreadlength][frame] != maxcount
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
569 ):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
570 secondmaxcount = master_trip_dict[primetype][subreadlength][frame]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
571 # a perfect score would be 0 meaning there is only a single peak, the worst score would be 1 meaning two highest peaks are the same height
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
572 master_trip_dict[primetype][subreadlength]["score"] = float(
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
573 secondmaxcount
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
574 ) / float(maxcount)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
575 # This part is to determine what offsets to give each read length
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
576 print("Calculating offsets")
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
577 for primetype in ["fiveprime", "threeprime"]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
578 for readlen in master_offset_dict[primetype]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
579 accepted_len = False
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
580 max_relative_pos = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
581 max_relative_count = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
582 for relative_pos in master_offset_dict[primetype][readlen]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
583 # This line is to ensure we don't choose an offset greater than the readlength (in cases of a large peak far up/downstream)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
584 if abs(relative_pos) < 10 or abs(relative_pos) > (readlen - 10):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
585 continue
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
586 if (
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
587 master_offset_dict[primetype][readlen][relative_pos]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
588 > max_relative_count
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
589 ):
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
590 max_relative_pos = relative_pos
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
591 max_relative_count = master_offset_dict[primetype][readlen][
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
592 relative_pos
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
593 ]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
594 # print "for readlen {} the max_relative pos is {}".format(readlen, max_relative_pos)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
595 if primetype == "fiveprime":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
596 # -3 to get from p-site to a-site, +1 to account for 1 based co-ordinates, resulting in -2 overall
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
597 final_offsets[primetype]["offsets"][readlen] = abs(max_relative_pos - 2)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
598 elif primetype == "threeprime":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
599 # +3 to get from p-site to a-site, -1 to account for 1 based co-ordinates, resulting in +2 overall
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
600 final_offsets[primetype]["offsets"][readlen] = (
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
601 max_relative_pos * (-1)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
602 ) + 2
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
603 # If there are no reads in CDS regions for a specific length, it may not be present in master_trip_dict
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
604 if readlen in master_trip_dict[primetype]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
605 final_offsets[primetype]["read_scores"][readlen] = master_trip_dict[
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
606 primetype
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
607 ][readlen]["score"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
608 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
609 final_offsets[primetype]["read_scores"][readlen] = 0.0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
610 master_read_dict["offsets"] = final_offsets
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
611 master_read_dict["trip_periodicity"] = master_trip_dict
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
612 master_read_dict["desc"] = "Null"
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
613 master_read_dict["mapped_reads"] = mapped_reads
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
614 master_read_dict["nuc_counts"] = nuc_count_dict
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
615 master_read_dict["dinuc_counts"] = dinuc_count_dict
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
616 master_read_dict["threeprime_nuc_counts"] = threeprime_nuc_count_dict
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
617 master_read_dict["metagene_counts"] = master_offset_dict
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
618 master_read_dict["stop_metagene_counts"] = master_metagene_stop_dict
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
619 master_read_dict["read_lengths"] = read_length_dict
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
620 master_read_dict["unambig_read_lengths"] = unambig_read_length_dict
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
621 master_read_dict["coding_counts"] = master_dict["unambiguous_coding_count"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
622 master_read_dict["noncoding_counts"] = master_dict["unambiguous_non_coding_count"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
623 master_read_dict["ambiguous_counts"] = master_dict["ambiguously_mapped_reads"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
624 master_read_dict["frequent_unmapped_reads"] = (
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
625 sorted(unmapped_dict.items(), key=operator.itemgetter(1))
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
626 )[-2000:]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
627 master_read_dict["cutadapt_removed"] = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
628 master_read_dict["rrna_removed"] = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
629 # If no reads are removed by minus m there won't be an entry in the log file, so initiliase with 0 first and change if there is a line
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
630 master_read_dict["removed_minus_m"] = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
631 master_dict["removed_minus_m"] = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
632 # We work out the total counts for 5', cds 3' for differential translation here, would be better to do thisn in processor but need the offsets
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
633 master_read_dict["unambiguous_all_totals"] = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
634 master_read_dict["unambiguous_fiveprime_totals"] = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
635 master_read_dict["unambiguous_cds_totals"] = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
636 master_read_dict["unambiguous_threeprime_totals"] = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
637
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
638 master_read_dict["ambiguous_all_totals"] = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
639 master_read_dict["ambiguous_fiveprime_totals"] = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
640 master_read_dict["ambiguous_cds_totals"] = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
641 master_read_dict["ambiguous_threeprime_totals"] = {}
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
642 print("calculating transcript counts")
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
643 for tran in master_read_dict:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
644 if tran in transcriptome_info_dict:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
645 five_total = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
646 cds_total = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
647 three_total = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
648
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
649 ambig_five_total = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
650 ambig_cds_total = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
651 ambig_three_total = 0
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
652
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
653 cds_start = transcriptome_info_dict[tran]["cds_start"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
654 cds_stop = transcriptome_info_dict[tran]["cds_stop"]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
655 for readlen in master_read_dict[tran]["unambig"]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
656 if readlen in final_offsets["fiveprime"]["offsets"]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
657 offset = final_offsets["fiveprime"]["offsets"][readlen]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
658 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
659 offset = 15
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
660 for pos in master_read_dict[tran]["unambig"][readlen]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
661 real_pos = pos + offset
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
662 if real_pos < cds_start:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
663 five_total += master_read_dict[tran]["unambig"][readlen][pos]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
664 elif real_pos >= cds_start and real_pos <= cds_stop:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
665 cds_total += master_read_dict[tran]["unambig"][readlen][pos]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
666 elif real_pos > cds_stop:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
667 three_total += master_read_dict[tran]["unambig"][readlen][pos]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
668 master_read_dict["unambiguous_all_totals"][tran] = (
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
669 five_total + cds_total + three_total
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
670 )
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
671 master_read_dict["unambiguous_fiveprime_totals"][tran] = five_total
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
672 master_read_dict["unambiguous_cds_totals"][tran] = cds_total
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
673 master_read_dict["unambiguous_threeprime_totals"][tran] = three_total
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
674
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
675 for readlen in master_read_dict[tran]["ambig"]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
676 if readlen in final_offsets["fiveprime"]["offsets"]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
677 offset = final_offsets["fiveprime"]["offsets"][readlen]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
678 else:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
679 offset = 15
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
680 for pos in master_read_dict[tran]["ambig"][readlen]:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
681 real_pos = pos + offset
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
682 if real_pos < cds_start:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
683 ambig_five_total += master_read_dict[tran]["ambig"][readlen][
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
684 pos
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
685 ]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
686 elif real_pos >= cds_start and real_pos <= cds_stop:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
687 ambig_cds_total += master_read_dict[tran]["ambig"][readlen][pos]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
688 elif real_pos > cds_stop:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
689 ambig_three_total += master_read_dict[tran]["ambig"][readlen][
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
690 pos
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
691 ]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
692
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
693 master_read_dict["ambiguous_all_totals"][tran] = (
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
694 five_total
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
695 + cds_total
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
696 + three_total
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
697 + ambig_five_total
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
698 + ambig_cds_total
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
699 + ambig_three_total
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
700 )
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
701 master_read_dict["ambiguous_fiveprime_totals"][tran] = (
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
702 five_total + ambig_five_total
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
703 )
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
704 master_read_dict["ambiguous_cds_totals"][tran] = cds_total + ambig_cds_total
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
705 master_read_dict["ambiguous_threeprime_totals"][tran] = (
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
706 three_total + ambig_three_total
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
707 )
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
708 print("Writing out to sqlite file")
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
709 sqlite_db = SqliteDict(outputfile, autocommit=False)
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
710 for key in master_read_dict:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
711 sqlite_db[key] = master_read_dict[key]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
712 sqlite_db["description"] = desc
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
713 sqlite_db.commit()
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
714 sqlite_db.close()
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
715
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
716
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
717 if __name__ == "__main__":
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
718 if len(sys.argv) <= 2:
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
719 print(
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
720 "Usage: python bam_to_sqlite.py <path_to_bam_file> <path_to_organism.sqlite> <file_description (optional)>"
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
721 )
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
722 sys.exit()
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
723 bam_filepath = sys.argv[1]
fef356fa1802 Uploaded
jackcurragh
parents:
diff changeset
724 annotation_sqlite_filepath = sys.argv[2]
2
c8d8675697c6 Uploaded
jackcurragh
parents: 1
diff changeset
725 try:
c8d8675697c6 Uploaded
jackcurragh
parents: 1
diff changeset
726 desc = sys.argv[3]
c8d8675697c6 Uploaded
jackcurragh
parents: 1
diff changeset
727 except:
c8d8675697c6 Uploaded
jackcurragh
parents: 1
diff changeset
728 desc = bam_filepath.split("/")[-1]
c8d8675697c6 Uploaded
jackcurragh
parents: 1
diff changeset
729
c8d8675697c6 Uploaded
jackcurragh
parents: 1
diff changeset
730 outputfile = sys.argv[4]
c8d8675697c6 Uploaded
jackcurragh
parents: 1
diff changeset
731 process_bam(bam_filepath, annotation_sqlite_filepath, outputfile, desc)