# HG changeset patch # User jackcurragh # Date 1667478358 0 # Node ID df4bb52b226d5d3a9253a0614d6fa337df157108 # Parent 02874b1b20155e7167df86d1e8cb94afef669411 Uploaded diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite/bam_to_sqlite.py --- a/trips_bam_to_sqlite/bam_to_sqlite.py Wed Nov 02 15:23:28 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,585 +0,0 @@ -import sys -import pysam -import operator -import os -import time -import sqlite3 -from sqlitedict import SqliteDict - -def tran_to_genome(tran, pos, transcriptome_info_dict): - #print ("tran",list(transcriptome_info_dict)) - traninfo = transcriptome_info_dict[tran] - chrom = traninfo["chrom"] - strand = traninfo["strand"] - exons = sorted(traninfo["exons"]) - #print exons - if strand == "+": - exon_start = 0 - for tup in exons: - exon_start = tup[0] - exonlen = tup[1] - tup[0] - if pos > exonlen: - pos = (pos - exonlen)-1 - else: - break - genomic_pos = (exon_start+pos)-1 - elif strand == "-": - exon_start = 0 - for tup in exons[::-1]: - exon_start = tup[1] - exonlen = tup[1] - tup[0] - if pos > exonlen: - pos = (pos - exonlen)-1 - else: - break - genomic_pos = (exon_start-pos)+1 - return (chrom, genomic_pos) - - -# Takes a dictionary with a readname as key and a list of lists as value, each sub list has consists of two elements a transcript and the position the read aligns to in the transcript -# This function will count the number of genes that the transcripts correspond to and if less than or equal to 3 will add the relevant value to transcript_counts_dict -def processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,readseq, unambig_read_length_dict): - readlen = len(readseq) - ambiguously_mapped_reads = 0 - #get the read name - read = list(process_chunk)[0] - - read_list = process_chunk[read] # a list of lists of all transcripts the read aligns to and the positions - #used to store different genomic poistions - genomic_positions = [] - - #This section is just to get the different genomic positions the read aligns to - - for listname in process_chunk[read]: - - tran = listname[0].replace("-","_").replace("(","").replace(")","") - - pos = int(listname[1]) - genomic_pos = tran_to_genome(tran, pos, transcriptome_info_dict) - #print ("genomic pos",genomic_pos) - if genomic_pos not in genomic_positions: - genomic_positions.append(genomic_pos) - - #If the read maps unambiguously - if len(genomic_positions) == 1: - if readlen not in unambig_read_length_dict: - unambig_read_length_dict[readlen] = 0 - unambig_read_length_dict[readlen] += 1 - #assume this read aligns to a noncoding position, if we find that it does align to a coding region change this to True - coding=False - - # For each transcript this read alings to - for listname in process_chunk[read]: - #get the transcript name - tran = listname[0].replace("-","_").replace("(","").replace(")","") - #If we haven't come across this transcript already then add to master_read_dict - if tran not in master_read_dict: - master_read_dict[tran] = {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}} - #get the raw unedited positon, and read tags - pos = int(listname[1]) - read_tags = listname[2] - #If there is mismatches in this line, then modify the postion and readlen (if mismatches at start or end) and add mismatches to dictionary - nm_tag = 0 - - for tag in read_tags: - if tag[0] == "NM": - nm_tag = int(tag[1]) - if nm_tag > 0: - md_tag = "" - for tag in read_tags: - if tag[0] == "MD": - md_tag = tag[1] - pos_modifier, readlen_modifier,mismatches = get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq) - # Count the mismatches (we only do this for unambiguous) - for mismatch in mismatches: - #Ignore mismatches appearing in the first position (due to non templated addition) - if mismatch != 0: - char = mismatches[mismatch] - mismatch_pos = pos + mismatch - if mismatch_pos not in master_read_dict[tran]["seq"]: - master_read_dict[tran]["seq"][mismatch_pos] = {} - if char not in master_read_dict[tran]["seq"][mismatch_pos]: - master_read_dict[tran]["seq"][mismatch_pos][char] = 0 - master_read_dict[tran]["seq"][mismatch_pos][char] += 1 - # apply the modifiers - #pos = pos+pos_modifier - #readlen = readlen - readlen_modifier - - - try: - cds_start = transcriptome_info_dict[tran]["cds_start"] - cds_stop = transcriptome_info_dict[tran]["cds_stop"] - - if pos >= cds_start and pos <= cds_stop: - coding=True - except: - pass - - - if readlen in master_read_dict[tran]["unambig"]: - if pos in master_read_dict[tran]["unambig"][readlen]: - master_read_dict[tran]["unambig"][readlen][pos] += 1 - else: - master_read_dict[tran]["unambig"][readlen][pos] = 1 - else: - master_read_dict[tran]["unambig"][readlen] = {pos:1} - - if coding == True: - master_dict["unambiguous_coding_count"] += 1 - elif coding == False: - master_dict["unambiguous_non_coding_count"] += 1 - - else: - ambiguously_mapped_reads += 1 - for listname in process_chunk[read]: - tran = listname[0].replace("-","_").replace("(","").replace(")","") - if tran not in master_read_dict: - master_read_dict[tran] = {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}} - pos = int(listname[1]) - read_tags = listname[2] - nm_tag = 0 - for tag in read_tags: - if tag[0] == "NM": - nm_tag = int(tag[1]) - if nm_tag > 0: - md_tag = "" - for tag in read_tags: - if tag[0] == "MD": - md_tag = tag[1] - pos_modifier, readlen_modifier,mismatches = get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq) - # apply the modifiers - #pos = pos+pos_modifier - #readlen = readlen - readlen_modifier - if readlen in master_read_dict[tran]["ambig"]: - if pos in master_read_dict[tran]["ambig"][readlen]: - master_read_dict[tran]["ambig"][readlen][pos] += 1 - else: - master_read_dict[tran]["ambig"][readlen][pos] = 1 - else: - master_read_dict[tran]["ambig"][readlen] = {pos:1} - return ambiguously_mapped_reads - - -def get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq): - nucs = ["A","T","G","C"] - mismatches = {} - total_so_far = 0 - prev_char = "" - for char in md_tag: - if char in nucs: - if prev_char != "": - total_so_far += int(prev_char) - prev_char = "" - mismatches[total_so_far+len(mismatches)] = (readseq[total_so_far+len(mismatches)]) - else: - if char != "^" and char != "N": - if prev_char == "": - prev_char = char - else: - total_so_far += int(prev_char+char) - prev_char = "" - readlen_modifier = 0 - pos_modifier = 0 - five_ok = False - three_ok = False - while five_ok == False: - for i in range(0,readlen): - if i in mismatches: - pos_modifier += 1 - readlen_modifier += 1 - else: - five_ok = True - break - five_ok = True - - - while three_ok == False: - for i in range(readlen-1,0,-1): - if i in mismatches: - readlen_modifier += 1 - else: - three_ok = True - break - three_ok = True - - - return (pos_modifier, readlen_modifier, mismatches) - - - -def process_bam(bam_filepath, transcriptome_info_dict_path,outputfile): - desc = "NULL" - start_time = time.time() - study_dict ={} - nuc_count_dict = {"mapped":{},"unmapped":{}} - dinuc_count_dict = {} - threeprime_nuc_count_dict = {"mapped":{},"unmapped":{}} - read_length_dict = {} - unambig_read_length_dict = {} - unmapped_dict = {} - master_dict = {"unambiguous_non_coding_count":0,"unambiguous_coding_count":0,"current_dir":os.getcwd()} - - transcriptome_info_dict = {} - connection = sqlite3.connect(transcriptome_info_dict_path) - cursor = connection.cursor() - cursor.execute("SELECT transcript,cds_start,cds_stop,length,strand,chrom,tran_type from transcripts;") - result = cursor.fetchall() - for row in result: - transcriptome_info_dict[str(row[0])] = {"cds_start":row[1],"cds_stop":row[2],"length":row[3],"strand":row[4],"chrom":row[5],"exons":[],"tran_type":row[6]} - #print list(transcriptome_info_dict)[:10] - - cursor.execute("SELECT * from exons;") - result = cursor.fetchall() - for row in result: - transcriptome_info_dict[str(row[0])]["exons"].append((row[1],row[2])) - - #it might be the case that there are no multimappers, so set this to 0 first to avoid an error, it will be overwritten later if there is multimappers - multimappers = 0 - unmapped_reads = 0 - unambiguous_coding_count = 0 - unambiguous_non_coding_count = 0 - trip_periodicity_reads = 0 - - final_offsets = {"fiveprime":{"offsets":{}, "read_scores":{}}, "threeprime":{"offsets":{}, "read_scores":{}}} - master_read_dict = {} - prev_seq = "" - process_chunk = {"read_name":[["placeholder_tran","1","28"]]} - mapped_reads = 0 - ambiguously_mapped_reads = 0 - master_trip_dict = {"fiveprime":{}, "threeprime":{}} - master_offset_dict = {"fiveprime":{}, "threeprime":{}} - master_metagene_stop_dict = {"fiveprime":{}, "threeprime":{}} - - - os.system(f'samtools sort -n {bam_filepath} -o {bam_filepath}_n_sorted.bam 2> /dev/null') - - pysam.set_verbosity(0) - infile = pysam.Samfile(f"{bam_filepath}_n_sorted.bam", "rb") - header = infile.header["HD"] - - unsorted = False - if "SO" in header: - print("Sorting order: "+header["SO"]) - if header["SO"] != "queryname": - print("Sorting order is not queryname") - unsorted = True - else: - unsorted = True - if unsorted == True: - print ("ERROR: Bam file appears to be unsorted or not sorted by read name. To sort by read name use the command: samtools sort -n input.bam output.bam") - print (header,bam_filepath) - sys.exit() - total_bam_lines = 0 - all_ref_ids = infile.references - - for read in infile.fetch(until_eof=True): - total_bam_lines += 1 - if not read.is_unmapped: - ref = read.reference_id - tran = (all_ref_ids[ref]).split(".")[0] - mapped_reads += 1 - if mapped_reads%1000000 == 0: - print ("{} reads parsed at {}".format(mapped_reads,(time.time()-start_time))) - pos = read.reference_start - readname = read.query_name - read_tags = read.tags - if readname == list(process_chunk)[0]: - process_chunk[readname].append([tran,pos,read_tags]) - #if the current read is different from previous reads send 'process_chunk' to the 'processor' function, then start 'process_chunk' over using current read - else: - if list(process_chunk)[0] != "read_name": - - #At this point we work out readseq, we do this for multiple reasons, firstly so we don't count the sequence from a read multiple times, just because - # it aligns multiple times and secondly we only call read.seq once (read.seq is computationally expensive) - seq = read.seq - readlen = len(seq) - - # Note if a read maps ambiguously it will still be counted toward the read length distribution (however it will only be counted once, not each time it maps) - if readlen not in read_length_dict: - read_length_dict[readlen] = 0 - read_length_dict[readlen] += 1 - - if readlen not in nuc_count_dict["mapped"]: - nuc_count_dict["mapped"][readlen] = {} - if readlen not in threeprime_nuc_count_dict["mapped"]: - threeprime_nuc_count_dict["mapped"][readlen] = {} - if readlen not in dinuc_count_dict: - dinuc_count_dict[readlen] = {"AA":0, "TA":0, "GA":0, "CA":0, - "AT":0, "TT":0, "GT":0, "CT":0, - "AG":0, "TG":0, "GG":0, "CG":0, - "AC":0, "TC":0, "GC":0, "CC":0} - - for i in range(0,len(seq)): - if i not in nuc_count_dict["mapped"][readlen]: - nuc_count_dict["mapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0} - nuc_count_dict["mapped"][readlen][i][seq[i]] += 1 - - for i in range(0,len(seq)): - try: - dinuc_count_dict[readlen][seq[i:i+2]] += 1 - except: - pass - - for i in range(len(seq),0,-1): - dist = i-len(seq) - if dist not in threeprime_nuc_count_dict["mapped"][readlen]: - threeprime_nuc_count_dict["mapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0} - threeprime_nuc_count_dict["mapped"][readlen][dist][seq[dist]] += 1 - ambiguously_mapped_reads += processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,prev_seq, unambig_read_length_dict) - process_chunk = {readname:[[tran, pos, read_tags]]} - prev_seq = read.seq - else: - unmapped_reads += 1 - - # Add this unmapped read to unmapped_dict so we can see what the most frequent unmapped read is. - seq = read.seq - readlen = len(seq) - if seq in unmapped_dict: - unmapped_dict[seq] += 1 - else: - unmapped_dict[seq] = 1 - - # Populate the nuc_count_dict with this unmapped read - if readlen not in nuc_count_dict["unmapped"]: - nuc_count_dict["unmapped"][readlen] = {} - for i in range(0,len(seq)): - if i not in nuc_count_dict["unmapped"][readlen]: - nuc_count_dict["unmapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0} - nuc_count_dict["unmapped"][readlen][i][seq[i]] += 1 - - if readlen not in threeprime_nuc_count_dict["unmapped"]: - threeprime_nuc_count_dict["unmapped"][readlen] = {} - - for i in range(len(seq),0,-1): - dist = i-len(seq) - if dist not in threeprime_nuc_count_dict["unmapped"][readlen]: - threeprime_nuc_count_dict["unmapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0} - threeprime_nuc_count_dict["unmapped"][readlen][dist][seq[dist]] += 1 - - #add stats about mapped/unmapped reads to file dict which will be used for the final report - master_dict["total_bam_lines"] = total_bam_lines - master_dict["mapped_reads"] = mapped_reads - master_dict["unmapped_reads"] = unmapped_reads - master_dict["ambiguously_mapped_reads"] = ambiguously_mapped_reads - - if "read_name" in master_read_dict: - del master_read_dict["read_name"] - print ("BAM file processed") - print ("Creating metagenes, triplet periodicity plots, etc.") - - for tran in master_read_dict: - try: - cds_start = int(0 if transcriptome_info_dict[tran]["cds_start"] is None else transcriptome_info_dict[tran]["cds_start"]) - cds_stop = int(0 if transcriptome_info_dict[tran]["cds_stop"] is None else transcriptome_info_dict[tran]["cds_stop"]) - # print(tran, type(cds_start)) - except: - print("Exception: ", tran) - continue - - tranlen = transcriptome_info_dict[tran]["length"] - #Use this to discard transcripts with no 5' leader or 3' trailer - if cds_start > 1 and cds_stop < tranlen and transcriptome_info_dict[tran]["tran_type"] == 1: - for primetype in ["fiveprime", "threeprime"]: - # Create the triplet periodicity and metainfo plots based on both the 5' and 3' ends of reads - for readlength in master_read_dict[tran]["unambig"]: - #print "readlength", readlength - # for each fiveprime postion for this readlength within this transcript - for raw_pos in master_read_dict[tran]["unambig"][readlength]: - #print "raw pos", raw_pos - trip_periodicity_reads += 1 - if primetype == "fiveprime": - # get the five prime postion minus the cds start postion - real_pos = raw_pos-cds_start - rel_stop_pos = raw_pos-cds_stop - elif primetype == "threeprime": - real_pos = (raw_pos+readlength)-cds_start - rel_stop_pos = (raw_pos+readlength)-cds_stop - #get the readcount at the raw postion - readcount = master_read_dict[tran]["unambig"][readlength][raw_pos] - #print "readcount", readcount - frame = (real_pos%3) - if real_pos >= cds_start and real_pos <= cds_stop: - if readlength in master_trip_dict[primetype]: - master_trip_dict[primetype][readlength][str(frame)] += readcount - else: - master_trip_dict[primetype][readlength]= {"0":0.0,"1":0.0,"2":0.0} - master_trip_dict[primetype][readlength][str(frame)] += readcount - # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict - if real_pos > (-600) and real_pos < (601): - if readlength in master_offset_dict[primetype]: - if real_pos in master_offset_dict[primetype][readlength]: - #print "real pos in offset dict" - master_offset_dict[primetype][readlength][real_pos] += readcount - else: - #print "real pos not in offset dict" - master_offset_dict[primetype][readlength][real_pos] = readcount - else: - #initiliase with zero to avoid missing neighbours below - #print "initialising with zeros" - master_offset_dict[primetype][readlength]= {} - for i in range(-600,601): - master_offset_dict[primetype][readlength][i] = 0 - master_offset_dict[primetype][readlength][real_pos] += readcount - - # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict - if rel_stop_pos > (-600) and rel_stop_pos < (601): - if readlength in master_metagene_stop_dict[primetype]: - if rel_stop_pos in master_metagene_stop_dict[primetype][readlength]: - master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount - else: - master_metagene_stop_dict[primetype][readlength][rel_stop_pos] = readcount - else: - #initiliase with zero to avoid missing neighbours below - master_metagene_stop_dict[primetype][readlength] = {} - for i in range(-600,601): - master_metagene_stop_dict[primetype][readlength][i] = 0 - master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount - - # master trip dict is now made up of readlengths with 3 frames and a count associated with each frame - # create a 'score' for each readlength by putting the max frame count over the second highest frame count - for primetype in ["fiveprime", "threeprime"]: - for subreadlength in master_trip_dict[primetype]: - maxcount = 0 - secondmaxcount = 0 - for frame in master_trip_dict[primetype][subreadlength]: - if master_trip_dict[primetype][subreadlength][frame] > maxcount: - maxcount = master_trip_dict[primetype][subreadlength][frame] - for frame in master_trip_dict[primetype][subreadlength]: - if master_trip_dict[primetype][subreadlength][frame] > secondmaxcount and master_trip_dict[primetype][subreadlength][frame] != maxcount: - secondmaxcount = master_trip_dict[primetype][subreadlength][frame] - # a perfect score would be 0 meaning there is only a single peak, the worst score would be 1 meaning two highest peaks are the same height - master_trip_dict[primetype][subreadlength]["score"] = float(secondmaxcount)/float(maxcount) - #This part is to determine what offsets to give each read length - print ("Calculating offsets") - for primetype in ["fiveprime", "threeprime"]: - for readlen in master_offset_dict[primetype]: - accepted_len = False - max_relative_pos = 0 - max_relative_count = 0 - for relative_pos in master_offset_dict[primetype][readlen]: - # This line is to ensure we don't choose an offset greater than the readlength (in cases of a large peak far up/downstream) - if abs(relative_pos) < 10 or abs(relative_pos) > (readlen-10): - continue - if master_offset_dict[primetype][readlen][relative_pos] > max_relative_count: - max_relative_pos = relative_pos - max_relative_count = master_offset_dict[primetype][readlen][relative_pos] - #print "for readlen {} the max_relative pos is {}".format(readlen, max_relative_pos) - if primetype == "fiveprime": - # -3 to get from p-site to a-site, +1 to account for 1 based co-ordinates, resulting in -2 overall - final_offsets[primetype]["offsets"][readlen] = abs(max_relative_pos-2) - elif primetype == "threeprime": - # +3 to get from p-site to a-site, -1 to account for 1 based co-ordinates, resulting in +2 overall - final_offsets[primetype]["offsets"][readlen] = (max_relative_pos*(-1))+2 - #If there are no reads in CDS regions for a specific length, it may not be present in master_trip_dict - if readlen in master_trip_dict[primetype]: - final_offsets[primetype]["read_scores"][readlen] = master_trip_dict[primetype][readlen]["score"] - else: - final_offsets[primetype]["read_scores"][readlen] = 0.0 - - - master_read_dict["unmapped_reads"] = unmapped_reads - master_read_dict["offsets"] = final_offsets - master_read_dict["trip_periodicity"] = master_trip_dict - master_read_dict["desc"] = "Null" - master_read_dict["mapped_reads"] = mapped_reads - master_read_dict["nuc_counts"] = nuc_count_dict - master_read_dict["dinuc_counts"] = dinuc_count_dict - master_read_dict["threeprime_nuc_counts"] = threeprime_nuc_count_dict - master_read_dict["metagene_counts"] = master_offset_dict - master_read_dict["stop_metagene_counts"] = master_metagene_stop_dict - master_read_dict["read_lengths"] = read_length_dict - master_read_dict["unambig_read_lengths"] = unambig_read_length_dict - master_read_dict["coding_counts"] = master_dict["unambiguous_coding_count"] - master_read_dict["noncoding_counts"] = master_dict["unambiguous_non_coding_count"] - master_read_dict["ambiguous_counts"] = master_dict["ambiguously_mapped_reads"] - master_read_dict["frequent_unmapped_reads"] = (sorted(unmapped_dict.items(), key=operator.itemgetter(1)))[-2000:] - master_read_dict["cutadapt_removed"] = 0 - master_read_dict["rrna_removed"] = 0 - #If no reads are removed by minus m there won't be an entry in the log file, so initiliase with 0 first and change if there is a line - master_read_dict["removed_minus_m"] = 0 - master_dict["removed_minus_m"] = 0 - # We work out the total counts for 5', cds 3' for differential translation here, would be better to do thisn in processor but need the offsets - master_read_dict["unambiguous_all_totals"] = {} - master_read_dict["unambiguous_fiveprime_totals"] = {} - master_read_dict["unambiguous_cds_totals"] = {} - master_read_dict["unambiguous_threeprime_totals"] = {} - - master_read_dict["ambiguous_all_totals"] = {} - master_read_dict["ambiguous_fiveprime_totals"] = {} - master_read_dict["ambiguous_cds_totals"] = {} - master_read_dict["ambiguous_threeprime_totals"] = {} - print ("calculating transcript counts") - for tran in master_read_dict: - if tran in transcriptome_info_dict: - five_total = 0 - cds_total = 0 - three_total = 0 - - ambig_five_total = 0 - ambig_cds_total = 0 - ambig_three_total = 0 - - cds_start = transcriptome_info_dict[tran]["cds_start"] - cds_stop = transcriptome_info_dict[tran]["cds_stop"] - - for readlen in master_read_dict[tran]["unambig"]: - if readlen in final_offsets["fiveprime"]["offsets"]: - offset = final_offsets["fiveprime"]["offsets"][readlen] - else: - offset = 15 - for pos in master_read_dict[tran]["unambig"][readlen]: - real_pos = pos+offset - if cds_start is None or cds_stop is None: - three_total += master_read_dict[tran]["unambig"][readlen][pos] - else: - if real_pos =cds_start and real_pos <= cds_stop: - cds_total += master_read_dict[tran]["unambig"][readlen][pos] - elif real_pos > cds_stop: - three_total += master_read_dict[tran]["unambig"][readlen][pos] - master_read_dict["unambiguous_all_totals"][tran] = five_total+cds_total+three_total - master_read_dict["unambiguous_fiveprime_totals"][tran] = five_total - master_read_dict["unambiguous_cds_totals"][tran] = cds_total - master_read_dict["unambiguous_threeprime_totals"][tran] = three_total - - for readlen in master_read_dict[tran]["ambig"]: - if readlen in final_offsets["fiveprime"]["offsets"]: - offset = final_offsets["fiveprime"]["offsets"][readlen] - else: - offset = 15 - for pos in master_read_dict[tran]["ambig"][readlen]: - if cds_start is None or cds_stop is None: - ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos] - else: - real_pos = pos+offset - if real_pos < cds_start: - ambig_five_total += master_read_dict[tran]["ambig"][readlen][pos] - elif real_pos >=cds_start and real_pos <= cds_stop: - ambig_cds_total += master_read_dict[tran]["ambig"][readlen][pos] - elif real_pos > cds_stop: - ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos] - - master_read_dict["ambiguous_all_totals"][tran] = five_total+cds_total+three_total+ambig_five_total+ambig_cds_total+ambig_three_total - master_read_dict["ambiguous_fiveprime_totals"][tran] = five_total+ambig_five_total - master_read_dict["ambiguous_cds_totals"][tran] = cds_total+ambig_cds_total - master_read_dict["ambiguous_threeprime_totals"][tran] = three_total+ambig_three_total - - print ("Writing out to sqlite file") - sqlite_db = SqliteDict(outputfile,autocommit=False) - for key in master_read_dict: - sqlite_db[key] = master_read_dict[key] - sqlite_db["description"] = desc - sqlite_db.commit() - sqlite_db.close() - - -if __name__ == "__main__": - if len(sys.argv) <= 2: - print ("Usage: python bam_to_sqlite.py ") - sys.exit() - bam_filepath = sys.argv[1] - annotation_sqlite_filepath = sys.argv[2] - desc = sys.argv[3] - outputfile = sys.argv[4] - process_bam(bam_filepath,annotation_sqlite_filepath,outputfile) diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite/test-data/test.bam Binary file trips_bam_to_sqlite/test-data/test.bam has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite/test-data/test.bamv2.sqlite Binary file trips_bam_to_sqlite/test-data/test.bamv2.sqlite has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite/test-data/test_n_sorted.bam Binary file trips_bam_to_sqlite/test-data/test_n_sorted.bam has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite/test-data/test_n_sorted.bam_n_sorted.bam Binary file trips_bam_to_sqlite/test-data/test_n_sorted.bam_n_sorted.bam has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite/test-data/test_n_sorted.bamv2.sqlite Binary file trips_bam_to_sqlite/test-data/test_n_sorted.bamv2.sqlite has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite/test-data/test_org.sqlite Binary file trips_bam_to_sqlite/test-data/test_org.sqlite has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite/test-data/test_organism.sqlite diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite/test-data/test_sorted.bam Binary file trips_bam_to_sqlite/test-data/test_sorted.bam has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite/test-data/test_sorted.bamv2.sqlite Binary file trips_bam_to_sqlite/test-data/test_sorted.bamv2.sqlite has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite/trips_bam_to_sqlite.xml --- a/trips_bam_to_sqlite/trips_bam_to_sqlite.xml Wed Nov 02 15:23:28 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ - - Convert BAM file to SQLITE for Trips-Viz - - pysam - sqlitedict - sqlite - samtools - - - - - - - - - - - - - - - - - - - - - **What it does** - - Process your transcriptome read alignments for TRIPS-Viz - - Prerequisites: - - name-sorted bam file (samtools sort -n) - - TRIPS-Viz annotation file in SQLITE format. - - - diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite_builtin/bam_to_sqlite.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trips_bam_to_sqlite_builtin/bam_to_sqlite.py Thu Nov 03 12:25:58 2022 +0000 @@ -0,0 +1,585 @@ +import sys +import pysam +import operator +import os +import time +import sqlite3 +from sqlitedict import SqliteDict + +def tran_to_genome(tran, pos, transcriptome_info_dict): + #print ("tran",list(transcriptome_info_dict)) + traninfo = transcriptome_info_dict[tran] + chrom = traninfo["chrom"] + strand = traninfo["strand"] + exons = sorted(traninfo["exons"]) + #print exons + if strand == "+": + exon_start = 0 + for tup in exons: + exon_start = tup[0] + exonlen = tup[1] - tup[0] + if pos > exonlen: + pos = (pos - exonlen)-1 + else: + break + genomic_pos = (exon_start+pos)-1 + elif strand == "-": + exon_start = 0 + for tup in exons[::-1]: + exon_start = tup[1] + exonlen = tup[1] - tup[0] + if pos > exonlen: + pos = (pos - exonlen)-1 + else: + break + genomic_pos = (exon_start-pos)+1 + return (chrom, genomic_pos) + + +# Takes a dictionary with a readname as key and a list of lists as value, each sub list has consists of two elements a transcript and the position the read aligns to in the transcript +# This function will count the number of genes that the transcripts correspond to and if less than or equal to 3 will add the relevant value to transcript_counts_dict +def processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,readseq, unambig_read_length_dict): + readlen = len(readseq) + ambiguously_mapped_reads = 0 + #get the read name + read = list(process_chunk)[0] + + read_list = process_chunk[read] # a list of lists of all transcripts the read aligns to and the positions + #used to store different genomic poistions + genomic_positions = [] + + #This section is just to get the different genomic positions the read aligns to + + for listname in process_chunk[read]: + + tran = listname[0].replace("-","_").replace("(","").replace(")","") + + pos = int(listname[1]) + genomic_pos = tran_to_genome(tran, pos, transcriptome_info_dict) + #print ("genomic pos",genomic_pos) + if genomic_pos not in genomic_positions: + genomic_positions.append(genomic_pos) + + #If the read maps unambiguously + if len(genomic_positions) == 1: + if readlen not in unambig_read_length_dict: + unambig_read_length_dict[readlen] = 0 + unambig_read_length_dict[readlen] += 1 + #assume this read aligns to a noncoding position, if we find that it does align to a coding region change this to True + coding=False + + # For each transcript this read alings to + for listname in process_chunk[read]: + #get the transcript name + tran = listname[0].replace("-","_").replace("(","").replace(")","") + #If we haven't come across this transcript already then add to master_read_dict + if tran not in master_read_dict: + master_read_dict[tran] = {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}} + #get the raw unedited positon, and read tags + pos = int(listname[1]) + read_tags = listname[2] + #If there is mismatches in this line, then modify the postion and readlen (if mismatches at start or end) and add mismatches to dictionary + nm_tag = 0 + + for tag in read_tags: + if tag[0] == "NM": + nm_tag = int(tag[1]) + if nm_tag > 0: + md_tag = "" + for tag in read_tags: + if tag[0] == "MD": + md_tag = tag[1] + pos_modifier, readlen_modifier,mismatches = get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq) + # Count the mismatches (we only do this for unambiguous) + for mismatch in mismatches: + #Ignore mismatches appearing in the first position (due to non templated addition) + if mismatch != 0: + char = mismatches[mismatch] + mismatch_pos = pos + mismatch + if mismatch_pos not in master_read_dict[tran]["seq"]: + master_read_dict[tran]["seq"][mismatch_pos] = {} + if char not in master_read_dict[tran]["seq"][mismatch_pos]: + master_read_dict[tran]["seq"][mismatch_pos][char] = 0 + master_read_dict[tran]["seq"][mismatch_pos][char] += 1 + # apply the modifiers + #pos = pos+pos_modifier + #readlen = readlen - readlen_modifier + + + try: + cds_start = transcriptome_info_dict[tran]["cds_start"] + cds_stop = transcriptome_info_dict[tran]["cds_stop"] + + if pos >= cds_start and pos <= cds_stop: + coding=True + except: + pass + + + if readlen in master_read_dict[tran]["unambig"]: + if pos in master_read_dict[tran]["unambig"][readlen]: + master_read_dict[tran]["unambig"][readlen][pos] += 1 + else: + master_read_dict[tran]["unambig"][readlen][pos] = 1 + else: + master_read_dict[tran]["unambig"][readlen] = {pos:1} + + if coding == True: + master_dict["unambiguous_coding_count"] += 1 + elif coding == False: + master_dict["unambiguous_non_coding_count"] += 1 + + else: + ambiguously_mapped_reads += 1 + for listname in process_chunk[read]: + tran = listname[0].replace("-","_").replace("(","").replace(")","") + if tran not in master_read_dict: + master_read_dict[tran] = {"ambig":{}, "unambig":{}, "mismatches":{}, "seq":{}} + pos = int(listname[1]) + read_tags = listname[2] + nm_tag = 0 + for tag in read_tags: + if tag[0] == "NM": + nm_tag = int(tag[1]) + if nm_tag > 0: + md_tag = "" + for tag in read_tags: + if tag[0] == "MD": + md_tag = tag[1] + pos_modifier, readlen_modifier,mismatches = get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq) + # apply the modifiers + #pos = pos+pos_modifier + #readlen = readlen - readlen_modifier + if readlen in master_read_dict[tran]["ambig"]: + if pos in master_read_dict[tran]["ambig"][readlen]: + master_read_dict[tran]["ambig"][readlen][pos] += 1 + else: + master_read_dict[tran]["ambig"][readlen][pos] = 1 + else: + master_read_dict[tran]["ambig"][readlen] = {pos:1} + return ambiguously_mapped_reads + + +def get_mismatch_pos(md_tag,pos,readlen,master_read_dict,tran,readseq): + nucs = ["A","T","G","C"] + mismatches = {} + total_so_far = 0 + prev_char = "" + for char in md_tag: + if char in nucs: + if prev_char != "": + total_so_far += int(prev_char) + prev_char = "" + mismatches[total_so_far+len(mismatches)] = (readseq[total_so_far+len(mismatches)]) + else: + if char != "^" and char != "N": + if prev_char == "": + prev_char = char + else: + total_so_far += int(prev_char+char) + prev_char = "" + readlen_modifier = 0 + pos_modifier = 0 + five_ok = False + three_ok = False + while five_ok == False: + for i in range(0,readlen): + if i in mismatches: + pos_modifier += 1 + readlen_modifier += 1 + else: + five_ok = True + break + five_ok = True + + + while three_ok == False: + for i in range(readlen-1,0,-1): + if i in mismatches: + readlen_modifier += 1 + else: + three_ok = True + break + three_ok = True + + + return (pos_modifier, readlen_modifier, mismatches) + + + +def process_bam(bam_filepath, transcriptome_info_dict_path,outputfile): + desc = "NULL" + start_time = time.time() + study_dict ={} + nuc_count_dict = {"mapped":{},"unmapped":{}} + dinuc_count_dict = {} + threeprime_nuc_count_dict = {"mapped":{},"unmapped":{}} + read_length_dict = {} + unambig_read_length_dict = {} + unmapped_dict = {} + master_dict = {"unambiguous_non_coding_count":0,"unambiguous_coding_count":0,"current_dir":os.getcwd()} + + transcriptome_info_dict = {} + connection = sqlite3.connect(transcriptome_info_dict_path) + cursor = connection.cursor() + cursor.execute("SELECT transcript,cds_start,cds_stop,length,strand,chrom,tran_type from transcripts;") + result = cursor.fetchall() + for row in result: + transcriptome_info_dict[str(row[0])] = {"cds_start":row[1],"cds_stop":row[2],"length":row[3],"strand":row[4],"chrom":row[5],"exons":[],"tran_type":row[6]} + #print list(transcriptome_info_dict)[:10] + + cursor.execute("SELECT * from exons;") + result = cursor.fetchall() + for row in result: + transcriptome_info_dict[str(row[0])]["exons"].append((row[1],row[2])) + + #it might be the case that there are no multimappers, so set this to 0 first to avoid an error, it will be overwritten later if there is multimappers + multimappers = 0 + unmapped_reads = 0 + unambiguous_coding_count = 0 + unambiguous_non_coding_count = 0 + trip_periodicity_reads = 0 + + final_offsets = {"fiveprime":{"offsets":{}, "read_scores":{}}, "threeprime":{"offsets":{}, "read_scores":{}}} + master_read_dict = {} + prev_seq = "" + process_chunk = {"read_name":[["placeholder_tran","1","28"]]} + mapped_reads = 0 + ambiguously_mapped_reads = 0 + master_trip_dict = {"fiveprime":{}, "threeprime":{}} + master_offset_dict = {"fiveprime":{}, "threeprime":{}} + master_metagene_stop_dict = {"fiveprime":{}, "threeprime":{}} + + + os.system(f'samtools sort -n {bam_filepath} -o {bam_filepath}_n_sorted.bam') + + pysam.set_verbosity(0) + infile = pysam.Samfile(f"{bam_filepath}_n_sorted.bam", "rb") + header = infile.header["HD"] + + unsorted = False + if "SO" in header: + print("Sorting order: "+header["SO"]) + if header["SO"] != "queryname": + print("Sorting order is not queryname") + unsorted = True + else: + unsorted = True + if unsorted == True: + print ("ERROR: Bam file appears to be unsorted or not sorted by read name. To sort by read name use the command: samtools sort -n input.bam output.bam") + print (header,bam_filepath) + sys.exit() + total_bam_lines = 0 + all_ref_ids = infile.references + + for read in infile.fetch(until_eof=True): + total_bam_lines += 1 + if not read.is_unmapped: + ref = read.reference_id + tran = (all_ref_ids[ref]).split(".")[0] + mapped_reads += 1 + if mapped_reads%1000000 == 0: + print ("{} reads parsed at {}".format(mapped_reads,(time.time()-start_time))) + pos = read.reference_start + readname = read.query_name + read_tags = read.tags + if readname == list(process_chunk)[0]: + process_chunk[readname].append([tran,pos,read_tags]) + #if the current read is different from previous reads send 'process_chunk' to the 'processor' function, then start 'process_chunk' over using current read + else: + if list(process_chunk)[0] != "read_name": + + #At this point we work out readseq, we do this for multiple reasons, firstly so we don't count the sequence from a read multiple times, just because + # it aligns multiple times and secondly we only call read.seq once (read.seq is computationally expensive) + seq = read.seq + readlen = len(seq) + + # Note if a read maps ambiguously it will still be counted toward the read length distribution (however it will only be counted once, not each time it maps) + if readlen not in read_length_dict: + read_length_dict[readlen] = 0 + read_length_dict[readlen] += 1 + + if readlen not in nuc_count_dict["mapped"]: + nuc_count_dict["mapped"][readlen] = {} + if readlen not in threeprime_nuc_count_dict["mapped"]: + threeprime_nuc_count_dict["mapped"][readlen] = {} + if readlen not in dinuc_count_dict: + dinuc_count_dict[readlen] = {"AA":0, "TA":0, "GA":0, "CA":0, + "AT":0, "TT":0, "GT":0, "CT":0, + "AG":0, "TG":0, "GG":0, "CG":0, + "AC":0, "TC":0, "GC":0, "CC":0} + + for i in range(0,len(seq)): + if i not in nuc_count_dict["mapped"][readlen]: + nuc_count_dict["mapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0} + nuc_count_dict["mapped"][readlen][i][seq[i]] += 1 + + for i in range(0,len(seq)): + try: + dinuc_count_dict[readlen][seq[i:i+2]] += 1 + except: + pass + + for i in range(len(seq),0,-1): + dist = i-len(seq) + if dist not in threeprime_nuc_count_dict["mapped"][readlen]: + threeprime_nuc_count_dict["mapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0} + threeprime_nuc_count_dict["mapped"][readlen][dist][seq[dist]] += 1 + ambiguously_mapped_reads += processor(process_chunk, master_read_dict, transcriptome_info_dict,master_dict,prev_seq, unambig_read_length_dict) + process_chunk = {readname:[[tran, pos, read_tags]]} + prev_seq = read.seq + else: + unmapped_reads += 1 + + # Add this unmapped read to unmapped_dict so we can see what the most frequent unmapped read is. + seq = read.seq + readlen = len(seq) + if seq in unmapped_dict: + unmapped_dict[seq] += 1 + else: + unmapped_dict[seq] = 1 + + # Populate the nuc_count_dict with this unmapped read + if readlen not in nuc_count_dict["unmapped"]: + nuc_count_dict["unmapped"][readlen] = {} + for i in range(0,len(seq)): + if i not in nuc_count_dict["unmapped"][readlen]: + nuc_count_dict["unmapped"][readlen][i] = {"A":0, "T":0, "G":0, "C":0, "N":0} + nuc_count_dict["unmapped"][readlen][i][seq[i]] += 1 + + if readlen not in threeprime_nuc_count_dict["unmapped"]: + threeprime_nuc_count_dict["unmapped"][readlen] = {} + + for i in range(len(seq),0,-1): + dist = i-len(seq) + if dist not in threeprime_nuc_count_dict["unmapped"][readlen]: + threeprime_nuc_count_dict["unmapped"][readlen][dist] = {"A":0, "T":0, "G":0, "C":0, "N":0} + threeprime_nuc_count_dict["unmapped"][readlen][dist][seq[dist]] += 1 + + #add stats about mapped/unmapped reads to file dict which will be used for the final report + master_dict["total_bam_lines"] = total_bam_lines + master_dict["mapped_reads"] = mapped_reads + master_dict["unmapped_reads"] = unmapped_reads + master_dict["ambiguously_mapped_reads"] = ambiguously_mapped_reads + + if "read_name" in master_read_dict: + del master_read_dict["read_name"] + print ("BAM file processed") + print ("Creating metagenes, triplet periodicity plots, etc.") + + for tran in master_read_dict: + try: + cds_start = int(0 if transcriptome_info_dict[tran]["cds_start"] is None else transcriptome_info_dict[tran]["cds_start"]) + cds_stop = int(0 if transcriptome_info_dict[tran]["cds_stop"] is None else transcriptome_info_dict[tran]["cds_stop"]) + # print(tran, type(cds_start)) + except: + print("Exception: ", tran) + continue + + tranlen = transcriptome_info_dict[tran]["length"] + #Use this to discard transcripts with no 5' leader or 3' trailer + if cds_start > 1 and cds_stop < tranlen and transcriptome_info_dict[tran]["tran_type"] == 1: + for primetype in ["fiveprime", "threeprime"]: + # Create the triplet periodicity and metainfo plots based on both the 5' and 3' ends of reads + for readlength in master_read_dict[tran]["unambig"]: + #print "readlength", readlength + # for each fiveprime postion for this readlength within this transcript + for raw_pos in master_read_dict[tran]["unambig"][readlength]: + #print "raw pos", raw_pos + trip_periodicity_reads += 1 + if primetype == "fiveprime": + # get the five prime postion minus the cds start postion + real_pos = raw_pos-cds_start + rel_stop_pos = raw_pos-cds_stop + elif primetype == "threeprime": + real_pos = (raw_pos+readlength)-cds_start + rel_stop_pos = (raw_pos+readlength)-cds_stop + #get the readcount at the raw postion + readcount = master_read_dict[tran]["unambig"][readlength][raw_pos] + #print "readcount", readcount + frame = (real_pos%3) + if real_pos >= cds_start and real_pos <= cds_stop: + if readlength in master_trip_dict[primetype]: + master_trip_dict[primetype][readlength][str(frame)] += readcount + else: + master_trip_dict[primetype][readlength]= {"0":0.0,"1":0.0,"2":0.0} + master_trip_dict[primetype][readlength][str(frame)] += readcount + # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict + if real_pos > (-600) and real_pos < (601): + if readlength in master_offset_dict[primetype]: + if real_pos in master_offset_dict[primetype][readlength]: + #print "real pos in offset dict" + master_offset_dict[primetype][readlength][real_pos] += readcount + else: + #print "real pos not in offset dict" + master_offset_dict[primetype][readlength][real_pos] = readcount + else: + #initiliase with zero to avoid missing neighbours below + #print "initialising with zeros" + master_offset_dict[primetype][readlength]= {} + for i in range(-600,601): + master_offset_dict[primetype][readlength][i] = 0 + master_offset_dict[primetype][readlength][real_pos] += readcount + + # now populate offset dict with the 'real_positions' upstream of cds_start, these will be used for metainfo dict + if rel_stop_pos > (-600) and rel_stop_pos < (601): + if readlength in master_metagene_stop_dict[primetype]: + if rel_stop_pos in master_metagene_stop_dict[primetype][readlength]: + master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount + else: + master_metagene_stop_dict[primetype][readlength][rel_stop_pos] = readcount + else: + #initiliase with zero to avoid missing neighbours below + master_metagene_stop_dict[primetype][readlength] = {} + for i in range(-600,601): + master_metagene_stop_dict[primetype][readlength][i] = 0 + master_metagene_stop_dict[primetype][readlength][rel_stop_pos] += readcount + + # master trip dict is now made up of readlengths with 3 frames and a count associated with each frame + # create a 'score' for each readlength by putting the max frame count over the second highest frame count + for primetype in ["fiveprime", "threeprime"]: + for subreadlength in master_trip_dict[primetype]: + maxcount = 0 + secondmaxcount = 0 + for frame in master_trip_dict[primetype][subreadlength]: + if master_trip_dict[primetype][subreadlength][frame] > maxcount: + maxcount = master_trip_dict[primetype][subreadlength][frame] + for frame in master_trip_dict[primetype][subreadlength]: + if master_trip_dict[primetype][subreadlength][frame] > secondmaxcount and master_trip_dict[primetype][subreadlength][frame] != maxcount: + secondmaxcount = master_trip_dict[primetype][subreadlength][frame] + # a perfect score would be 0 meaning there is only a single peak, the worst score would be 1 meaning two highest peaks are the same height + master_trip_dict[primetype][subreadlength]["score"] = float(secondmaxcount)/float(maxcount) + #This part is to determine what offsets to give each read length + print ("Calculating offsets") + for primetype in ["fiveprime", "threeprime"]: + for readlen in master_offset_dict[primetype]: + accepted_len = False + max_relative_pos = 0 + max_relative_count = 0 + for relative_pos in master_offset_dict[primetype][readlen]: + # This line is to ensure we don't choose an offset greater than the readlength (in cases of a large peak far up/downstream) + if abs(relative_pos) < 10 or abs(relative_pos) > (readlen-10): + continue + if master_offset_dict[primetype][readlen][relative_pos] > max_relative_count: + max_relative_pos = relative_pos + max_relative_count = master_offset_dict[primetype][readlen][relative_pos] + #print "for readlen {} the max_relative pos is {}".format(readlen, max_relative_pos) + if primetype == "fiveprime": + # -3 to get from p-site to a-site, +1 to account for 1 based co-ordinates, resulting in -2 overall + final_offsets[primetype]["offsets"][readlen] = abs(max_relative_pos-2) + elif primetype == "threeprime": + # +3 to get from p-site to a-site, -1 to account for 1 based co-ordinates, resulting in +2 overall + final_offsets[primetype]["offsets"][readlen] = (max_relative_pos*(-1))+2 + #If there are no reads in CDS regions for a specific length, it may not be present in master_trip_dict + if readlen in master_trip_dict[primetype]: + final_offsets[primetype]["read_scores"][readlen] = master_trip_dict[primetype][readlen]["score"] + else: + final_offsets[primetype]["read_scores"][readlen] = 0.0 + + + master_read_dict["unmapped_reads"] = unmapped_reads + master_read_dict["offsets"] = final_offsets + master_read_dict["trip_periodicity"] = master_trip_dict + master_read_dict["desc"] = "Null" + master_read_dict["mapped_reads"] = mapped_reads + master_read_dict["nuc_counts"] = nuc_count_dict + master_read_dict["dinuc_counts"] = dinuc_count_dict + master_read_dict["threeprime_nuc_counts"] = threeprime_nuc_count_dict + master_read_dict["metagene_counts"] = master_offset_dict + master_read_dict["stop_metagene_counts"] = master_metagene_stop_dict + master_read_dict["read_lengths"] = read_length_dict + master_read_dict["unambig_read_lengths"] = unambig_read_length_dict + master_read_dict["coding_counts"] = master_dict["unambiguous_coding_count"] + master_read_dict["noncoding_counts"] = master_dict["unambiguous_non_coding_count"] + master_read_dict["ambiguous_counts"] = master_dict["ambiguously_mapped_reads"] + master_read_dict["frequent_unmapped_reads"] = (sorted(unmapped_dict.items(), key=operator.itemgetter(1)))[-2000:] + master_read_dict["cutadapt_removed"] = 0 + master_read_dict["rrna_removed"] = 0 + #If no reads are removed by minus m there won't be an entry in the log file, so initiliase with 0 first and change if there is a line + master_read_dict["removed_minus_m"] = 0 + master_dict["removed_minus_m"] = 0 + # We work out the total counts for 5', cds 3' for differential translation here, would be better to do thisn in processor but need the offsets + master_read_dict["unambiguous_all_totals"] = {} + master_read_dict["unambiguous_fiveprime_totals"] = {} + master_read_dict["unambiguous_cds_totals"] = {} + master_read_dict["unambiguous_threeprime_totals"] = {} + + master_read_dict["ambiguous_all_totals"] = {} + master_read_dict["ambiguous_fiveprime_totals"] = {} + master_read_dict["ambiguous_cds_totals"] = {} + master_read_dict["ambiguous_threeprime_totals"] = {} + print ("calculating transcript counts") + for tran in master_read_dict: + if tran in transcriptome_info_dict: + five_total = 0 + cds_total = 0 + three_total = 0 + + ambig_five_total = 0 + ambig_cds_total = 0 + ambig_three_total = 0 + + cds_start = transcriptome_info_dict[tran]["cds_start"] + cds_stop = transcriptome_info_dict[tran]["cds_stop"] + + for readlen in master_read_dict[tran]["unambig"]: + if readlen in final_offsets["fiveprime"]["offsets"]: + offset = final_offsets["fiveprime"]["offsets"][readlen] + else: + offset = 15 + for pos in master_read_dict[tran]["unambig"][readlen]: + real_pos = pos+offset + if cds_start is None or cds_stop is None: + three_total += master_read_dict[tran]["unambig"][readlen][pos] + else: + if real_pos =cds_start and real_pos <= cds_stop: + cds_total += master_read_dict[tran]["unambig"][readlen][pos] + elif real_pos > cds_stop: + three_total += master_read_dict[tran]["unambig"][readlen][pos] + master_read_dict["unambiguous_all_totals"][tran] = five_total+cds_total+three_total + master_read_dict["unambiguous_fiveprime_totals"][tran] = five_total + master_read_dict["unambiguous_cds_totals"][tran] = cds_total + master_read_dict["unambiguous_threeprime_totals"][tran] = three_total + + for readlen in master_read_dict[tran]["ambig"]: + if readlen in final_offsets["fiveprime"]["offsets"]: + offset = final_offsets["fiveprime"]["offsets"][readlen] + else: + offset = 15 + for pos in master_read_dict[tran]["ambig"][readlen]: + if cds_start is None or cds_stop is None: + ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos] + else: + real_pos = pos+offset + if real_pos < cds_start: + ambig_five_total += master_read_dict[tran]["ambig"][readlen][pos] + elif real_pos >=cds_start and real_pos <= cds_stop: + ambig_cds_total += master_read_dict[tran]["ambig"][readlen][pos] + elif real_pos > cds_stop: + ambig_three_total += master_read_dict[tran]["ambig"][readlen][pos] + + master_read_dict["ambiguous_all_totals"][tran] = five_total+cds_total+three_total+ambig_five_total+ambig_cds_total+ambig_three_total + master_read_dict["ambiguous_fiveprime_totals"][tran] = five_total+ambig_five_total + master_read_dict["ambiguous_cds_totals"][tran] = cds_total+ambig_cds_total + master_read_dict["ambiguous_threeprime_totals"][tran] = three_total+ambig_three_total + + print ("Writing out to sqlite file") + sqlite_db = SqliteDict(outputfile,autocommit=False) + for key in master_read_dict: + sqlite_db[key] = master_read_dict[key] + sqlite_db["description"] = desc + sqlite_db.commit() + sqlite_db.close() + + +if __name__ == "__main__": + if len(sys.argv) <= 2: + print ("Usage: python bam_to_sqlite.py ") + sys.exit() + bam_filepath = sys.argv[1] + annotation_sqlite_filepath = sys.argv[2] + desc = sys.argv[3] + outputfile = sys.argv[4] + process_bam(bam_filepath,annotation_sqlite_filepath,outputfile) diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite_builtin/test-data/test.bam Binary file trips_bam_to_sqlite_builtin/test-data/test.bam has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite_builtin/test-data/test.bamv2.sqlite Binary file trips_bam_to_sqlite_builtin/test-data/test.bamv2.sqlite has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite_builtin/test-data/test_n_sorted.bam Binary file trips_bam_to_sqlite_builtin/test-data/test_n_sorted.bam has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite_builtin/test-data/test_n_sorted.bam_n_sorted.bam Binary file trips_bam_to_sqlite_builtin/test-data/test_n_sorted.bam_n_sorted.bam has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite_builtin/test-data/test_n_sorted.bamv2.sqlite Binary file trips_bam_to_sqlite_builtin/test-data/test_n_sorted.bamv2.sqlite has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite_builtin/test-data/test_org.sqlite Binary file trips_bam_to_sqlite_builtin/test-data/test_org.sqlite has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite_builtin/test-data/test_organism.sqlite diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite_builtin/test-data/test_sorted.bam Binary file trips_bam_to_sqlite_builtin/test-data/test_sorted.bam has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite_builtin/test-data/test_sorted.bamv2.sqlite Binary file trips_bam_to_sqlite_builtin/test-data/test_sorted.bamv2.sqlite has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite_builtin/test_out.sqlite Binary file trips_bam_to_sqlite_builtin/test_out.sqlite has changed diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite_builtin/tool-data/builtin_sqlites.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trips_bam_to_sqlite_builtin/tool-data/builtin_sqlites.loc Thu Nov 03 12:25:58 2022 +0000 @@ -0,0 +1,8 @@ +mus_musculus_m28 m28 Mus musculus (m28) Genome /mnt/data/indices/bowtie/genome/mus_musculus_m28/mus_musculus_m28.fa +drosophila_melanogaster_dm6 dm6 Drosophila melanogaster (dm6) Genome /mnt/data/indices/bowtie/genome/drosophila_melanogater_dm6/dm6.fa +sacCer_9 sacCer9 Saccharomyces cerevisiae (v9) Genome /mnt/data/indices/bowtie/genome/sacCer_9/S288C_reference_sequence_R9-1-1_19990210.fsa +sarCer_3 sacCer3 Saccharomyces cerevisiae (v3) Genome /mnt/data/indices/bowtie/genome/sacCer_3/S288C_reference_sequence_R3-1-1_19970730.fsa +E_coli_K12_MG1655 e_coli Escherichia coli (K12 MG1655) Genome /mnt/data/indices/bowtie/genome/E_coli_K12_MG1655/GCF_003368535.1_ASM336853v1_genomic.fna +Arabidopsis araThal Arabidopsis thaliana (TAIR10) Genome /mnt/data/indices/bowtie/genome/Arabidopsis/GCF_000001735.4_TAIR10.1_genomic.fna +homo_sapiens_hg19 hg19 Homo sapiens (hg19) Genome /mnt/data/indices/bowtie/genome/homo_sapiens_hg19/hg19.fa +homo_sapiens_hg38 hg38 Homo sapiens (hg38) Genome /mnt/data/indices/bowtie/genome/homo_sapiens_hg38/hg38.fa \ No newline at end of file diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite_builtin/tool-data/builtin_sqlites.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trips_bam_to_sqlite_builtin/tool-data/builtin_sqlites.loc.sample Thu Nov 03 12:25:58 2022 +0000 @@ -0,0 +1,1 @@ +phiX_ID phiX174 User-friendly phiX174 Description /path/to/phiX \ No newline at end of file diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite_builtin/tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trips_bam_to_sqlite_builtin/tool_data_table_conf.xml.sample Thu Nov 03 12:25:58 2022 +0000 @@ -0,0 +1,8 @@ + + + + + value, dbkey, name, path + +
+
\ No newline at end of file diff -r 02874b1b2015 -r df4bb52b226d trips_bam_to_sqlite_builtin/trips_bam_to_sqlite.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trips_bam_to_sqlite_builtin/trips_bam_to_sqlite.xml Thu Nov 03 12:25:58 2022 +0000 @@ -0,0 +1,62 @@ + + Convert BAM file to SQLITE for Trips-Viz + + pysam + sqlitedict + sqlite + samtools + + + + #if $refGenomeSource.genomeSource == "builtin": + python $__tool_directory__/bam_to_sqlite.py $input1 ${refGenomeSource.input2_builtin.fields.path} $input3 $output1 + #else: + python $__tool_directory__/bam_to_sqlite.py $input1 ${refGenomeSource.input2_file} $input3 $output1 + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + **What it does** + + Process your transcriptome read alignments for TRIPS-Viz + + Prerequisites: + - name-sorted bam file (samtools sort -n) + - TRIPS-Viz annotation file in SQLITE format. + + + +