changeset 0:fef356fa1802 draft

Uploaded
author jackcurragh
date Mon, 04 Apr 2022 09:48:32 +0000
parents
children 3ac12b611d7f
files trips_bam_to_sqlite/bam_to_sqlite.py trips_bam_to_sqlite/trips_bam_to_sqlite.xml
diffstat 2 files changed, 765 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trips_bam_to_sqlite/bam_to_sqlite.py	Mon Apr 04 09:48:32 2022 +0000
@@ -0,0 +1,728 @@
+import sys
+import pysam
+import operator
+import os
+import time
+import sqlite3
+from sqlitedict import SqliteDict
+
+
+def tran_to_genome(tran, pos, transcriptome_info_dict):
+    # print ("tran",list(transcriptome_info_dict))
+    traninfo = transcriptome_info_dict[tran]
+    chrom = traninfo["chrom"]
+    strand = traninfo["strand"]
+    exons = sorted(traninfo["exons"])
+    # print exons
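+    # Illustrative example: exons [(100, 200), (300, 400)] on the "+" strand with
+    # transcript pos 150 -> pos becomes (150 - 100) - 1 = 49 within the second exon,
+    # giving genomic_pos (300 + 49) - 1 = 348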
+    if strand == "+":
+        exon_start = 0
+        for tup in exons:
+            exon_start = tup[0]
+            exonlen = tup[1] - tup[0]
+            if pos > exonlen:
+                pos = (pos - exonlen) - 1
+            else:
+                break
+        genomic_pos = (exon_start + pos) - 1
+    elif strand == "-":
+        exon_start = 0
+        for tup in exons[::-1]:
+            exon_start = tup[1]
+            exonlen = tup[1] - tup[0]
+            if pos > exonlen:
+                pos = (pos - exonlen) - 1
+            else:
+                break
+        genomic_pos = (exon_start - pos) + 1
+    return (chrom, genomic_pos)
+
+
+#  Takes a dictionary with a read name as key and a list of lists as value; each sublist
+#  contains a transcript, the position the read aligns to in that transcript, and the read's tags.
+#  If every alignment maps to the same genomic position the read is counted as unambiguous,
+#  otherwise as ambiguous; counts are added to master_read_dict and the number of
+#  ambiguously mapped reads is returned.
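+#  Illustrative shape of process_chunk (hypothetical read name, transcript IDs and tag values):
+#    {"read_1": [["ENST0001", 250, [("NM", 1), ("MD", "20A9")]],
+#                ["ENST0002", 480, [("NM", 0)]]]}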
+def processor(
+    process_chunk,
+    master_read_dict,
+    transcriptome_info_dict,
+    master_dict,
+    readseq,
+    unambig_read_length_dict,
+):
+    readlen = len(readseq)
+    ambiguously_mapped_reads = 0
+    # get the read name
+    read = list(process_chunk)[0]
+
+    read_list = process_chunk[
+        read
+    ]  # a list of lists of all transcripts the read aligns to and the positions
+    # used to store the different genomic positions this read maps to
+    genomic_positions = []
+
+    # This section is just to get the different genomic positions the read aligns to
+
+    for listname in process_chunk[read]:
+
+        tran = listname[0].replace("-", "_").replace("(", "").replace(")", "")
+
+        pos = int(listname[1])
+        genomic_pos = tran_to_genome(tran, pos, transcriptome_info_dict)
+        # print ("genomic pos",genomic_pos)
+        if genomic_pos not in genomic_positions:
+            genomic_positions.append(genomic_pos)
+
+    # If the read maps unambiguously
+    if len(genomic_positions) == 1:
+        if readlen not in unambig_read_length_dict:
+            unambig_read_length_dict[readlen] = 0
+        unambig_read_length_dict[readlen] += 1
+        # assume the read aligns to a noncoding region; set coding to True below if any of its alignments falls within a CDS
+        coding = False
+
+        # For each transcript this read aligns to
+        for listname in process_chunk[read]:
+            # get the transcript name
+            tran = listname[0].replace("-", "_").replace("(", "").replace(")", "")
+            # If we haven't come across this transcript already then add to master_read_dict
+            if tran not in master_read_dict:
+                master_read_dict[tran] = {
+                    "ambig": {},
+                    "unambig": {},
+                    "mismatches": {},
+                    "seq": {},
+                }
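+                # layout per transcript: "unambig"/"ambig" map read length -> {position: count},
+                # "seq" maps position -> {mismatched base: count}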
+            # get the raw unedited position, and the read tags
+            pos = int(listname[1])
+            read_tags = listname[2]
+            # If there are mismatches in this alignment, parse the MD tag and record the mismatched bases (the position/read length modifiers are computed but not currently applied)
+            nm_tag = 0
+
+            for tag in read_tags:
+                if tag[0] == "NM":
+                    nm_tag = int(tag[1])
+            if nm_tag > 0:
+                md_tag = ""
+                for tag in read_tags:
+                    if tag[0] == "MD":
+                        md_tag = tag[1]
+                pos_modifier, readlen_modifier, mismatches = get_mismatch_pos(
+                    md_tag, pos, readlen, master_read_dict, tran, readseq
+                )
+                # Count the mismatches (we only do this for unambiguous)
+                for mismatch in mismatches:
+                    # Ignore mismatches appearing in the first position (due to non-templated addition)
+                    if mismatch != 0:
+                        char = mismatches[mismatch]
+                        mismatch_pos = pos + mismatch
+                        if mismatch_pos not in master_read_dict[tran]["seq"]:
+                            master_read_dict[tran]["seq"][mismatch_pos] = {}
+                        if char not in master_read_dict[tran]["seq"][mismatch_pos]:
+                            master_read_dict[tran]["seq"][mismatch_pos][char] = 0
+                        master_read_dict[tran]["seq"][mismatch_pos][char] += 1
+                # the position/read length modifiers are currently not applied
+                # pos = pos+pos_modifier
+                # readlen = readlen - readlen_modifier
+
+            try:
+                cds_start = transcriptome_info_dict[tran]["cds_start"]
+                cds_stop = transcriptome_info_dict[tran]["cds_stop"]
+
+                if pos >= cds_start and pos <= cds_stop:
+                    coding = True
+            except:
+                pass
+
+            if readlen in master_read_dict[tran]["unambig"]:
+                if pos in master_read_dict[tran]["unambig"][readlen]:
+                    master_read_dict[tran]["unambig"][readlen][pos] += 1
+                else:
+                    master_read_dict[tran]["unambig"][readlen][pos] = 1
+            else:
+                master_read_dict[tran]["unambig"][readlen] = {pos: 1}
+
+        if coding == True:
+            master_dict["unambiguous_coding_count"] += 1
+        elif coding == False:
+            master_dict["unambiguous_non_coding_count"] += 1
+
+    else:
+        ambiguously_mapped_reads += 1
+        for listname in process_chunk[read]:
+            tran = listname[0].replace("-", "_").replace("(", "").replace(")", "")
+            if tran not in master_read_dict:
+                master_read_dict[tran] = {
+                    "ambig": {},
+                    "unambig": {},
+                    "mismatches": {},
+                    "seq": {},
+                }
+            pos = int(listname[1])
+            read_tags = listname[2]
+            nm_tag = 0
+            for tag in read_tags:
+                if tag[0] == "NM":
+                    nm_tag = int(tag[1])
+            if nm_tag > 0:
+                md_tag = ""
+                for tag in read_tags:
+                    if tag[0] == "MD":
+                        md_tag = tag[1]
+                pos_modifier, readlen_modifier, mismatches = get_mismatch_pos(
+                    md_tag, pos, readlen, master_read_dict, tran, readseq
+                )
+                # the position/read length modifiers are currently not applied
+                # pos = pos+pos_modifier
+                # readlen = readlen - readlen_modifier
+            if readlen in master_read_dict[tran]["ambig"]:
+                if pos in master_read_dict[tran]["ambig"][readlen]:
+                    master_read_dict[tran]["ambig"][readlen][pos] += 1
+                else:
+                    master_read_dict[tran]["ambig"][readlen][pos] = 1
+            else:
+                master_read_dict[tran]["ambig"][readlen] = {pos: 1}
+    return ambiguously_mapped_reads
+
+
+def get_mismatch_pos(md_tag, pos, readlen, master_read_dict, tran, readseq):
+    nucs = ["A", "T", "G", "C"]
+    mismatches = {}
+    total_so_far = 0
+    prev_char = ""
+    for char in md_tag:
+        if char in nucs:
+            if prev_char != "":
+                total_so_far += int(prev_char)
+                prev_char = ""
+            mismatches[total_so_far + len(mismatches)] = readseq[
+                total_so_far + len(mismatches)
+            ]
+        else:
+            if char != "^" and char != "N":
+                if prev_char == "":
+                    prev_char = char
+                else:
+                    total_so_far += int(prev_char + char)
+                    prev_char = ""
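+    # Illustrative example: md_tag "10A15" records a single mismatch at 0-based read index 10,
+    # storing readseq[10]; keys are offset by the number of mismatches seen so far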
+    readlen_modifier = 0
+    pos_modifier = 0
+    # count consecutive mismatches at the 5' end: these shift the position and shorten the read
+    for i in range(0, readlen):
+        if i in mismatches:
+            pos_modifier += 1
+            readlen_modifier += 1
+        else:
+            break
+
+    # count consecutive mismatches at the 3' end: these only shorten the read
+    for i in range(readlen - 1, 0, -1):
+        if i in mismatches:
+            readlen_modifier += 1
+        else:
+            break
+
+    return (pos_modifier, readlen_modifier, mismatches)
+
+
+def process_bam(bam_filepath, transcriptome_info_dict_path, outputfile):
+    desc = "NULL"
+    start_time = time.time()
+    study_dict = {}
+    nuc_count_dict = {"mapped": {}, "unmapped": {}}
+    dinuc_count_dict = {}
+    threeprime_nuc_count_dict = {"mapped": {}, "unmapped": {}}
+    read_length_dict = {}
+    unambig_read_length_dict = {}
+    unmapped_dict = {}
+    master_dict = {
+        "unambiguous_non_coding_count": 0,
+        "unambiguous_coding_count": 0,
+        "current_dir": os.getcwd(),
+    }
+
+    transcriptome_info_dict = {}
+    connection = sqlite3.connect(transcriptome_info_dict_path)
+    cursor = connection.cursor()
+    cursor.execute(
+        "SELECT transcript,cds_start,cds_stop,length,strand,chrom,tran_type from transcripts;"
+    )
+    result = cursor.fetchall()
+    for row in result:
+        transcriptome_info_dict[str(row[0])] = {
+            "cds_start": row[1],
+            "cds_stop": row[2],
+            "length": row[3],
+            "strand": row[4],
+            "chrom": row[5],
+            "exons": [],
+            "tran_type": row[6],
+        }
+    # print list(transcriptome_info_dict)[:10]
+
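+    # each row of the exons table is assumed to be (transcript, exon_start, exon_stop)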
+    cursor.execute("SELECT * from exons;")
+    result = cursor.fetchall()
+    for row in result:
+        transcriptome_info_dict[str(row[0])]["exons"].append((row[1], row[2]))
+
+    # there may be no multimappers, so initialise this to 0 to avoid an error; it will be overwritten later if there are any
+    multimappers = 0
+    unmapped_reads = 0
+    unambiguous_coding_count = 0
+    unambiguous_non_coding_count = 0
+    trip_periodicity_reads = 0
+
+    final_offsets = {
+        "fiveprime": {"offsets": {}, "read_scores": {}},
+        "threeprime": {"offsets": {}, "read_scores": {}},
+    }
+    master_read_dict = {}
+    prev_seq = ""
+    process_chunk = {"read_name": [["placeholder_tran", "1", "28"]]}
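+    # "read_name"/"placeholder_tran" is a dummy first chunk; it is skipped below and replaced by the first real read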
+    mapped_reads = 0
+    ambiguously_mapped_reads = 0
+    master_trip_dict = {"fiveprime": {}, "threeprime": {}}
+    master_offset_dict = {"fiveprime": {}, "threeprime": {}}
+    master_metagene_stop_dict = {"fiveprime": {}, "threeprime": {}}
+
+    infile = pysam.Samfile(bam_filepath, "rb")
+    header = infile.header["HD"]
+    unsorted = False
+    if "SO" in header:
+        if header["SO"] != "queryname":
+            unsorted = True
+    else:
+        unsorted = True
+    if unsorted == True:
+        print(
+            "ERROR: BAM file appears to be unsorted or not sorted by read name. To sort by read name use the command: samtools sort -n -o output.bam input.bam"
+        )
+        print(header, bam_filepath)
+        sys.exit()
+    total_bam_lines = 0
+    all_ref_ids = infile.references
+
+    for read in infile.fetch(until_eof=True):
+        total_bam_lines += 1
+        if not read.is_unmapped:
+            ref = read.reference_id
+            tran = (all_ref_ids[ref]).split(".")[0]
+            mapped_reads += 1
+            if mapped_reads % 1000000 == 0:
+                print(
+                    "{} reads parsed at {}".format(
+                        mapped_reads, (time.time() - start_time)
+                    )
+                )
+            pos = read.reference_start
+            readname = read.query_name
+            read_tags = read.tags
+            if readname == list(process_chunk)[0]:
+                process_chunk[readname].append([tran, pos, read_tags])
+            # if the current read name differs from the previous one, send 'process_chunk' to the 'processor' function, then start a new 'process_chunk' with the current read
+            else:
+                if list(process_chunk)[0] != "read_name":
+
+                    # Work out the read sequence here for two reasons: so a read's sequence is
+                    # only counted once even if it aligns multiple times, and so read.seq is only
+                    # called once per read (it is computationally expensive)
+                    seq = read.seq
+                    readlen = len(seq)
+
+                    # Note if a read maps ambiguously it will still be counted toward the read length distribution (however it will only be counted once, not each time it maps)
+                    if readlen not in read_length_dict:
+                        read_length_dict[readlen] = 0
+                    read_length_dict[readlen] += 1
+
+                    if readlen not in nuc_count_dict["mapped"]:
+                        nuc_count_dict["mapped"][readlen] = {}
+                    if readlen not in threeprime_nuc_count_dict["mapped"]:
+                        threeprime_nuc_count_dict["mapped"][readlen] = {}
+                    if readlen not in dinuc_count_dict:
+                        dinuc_count_dict[readlen] = {
+                            "AA": 0,
+                            "TA": 0,
+                            "GA": 0,
+                            "CA": 0,
+                            "AT": 0,
+                            "TT": 0,
+                            "GT": 0,
+                            "CT": 0,
+                            "AG": 0,
+                            "TG": 0,
+                            "GG": 0,
+                            "CG": 0,
+                            "AC": 0,
+                            "TC": 0,
+                            "GC": 0,
+                            "CC": 0,
+                        }
+
+                    for i in range(0, len(seq)):
+                        if i not in nuc_count_dict["mapped"][readlen]:
+                            nuc_count_dict["mapped"][readlen][i] = {
+                                "A": 0,
+                                "T": 0,
+                                "G": 0,
+                                "C": 0,
+                                "N": 0,
+                            }
+                        nuc_count_dict["mapped"][readlen][i][seq[i]] += 1
+
+                    for i in range(0, len(seq)):
+                        try:
+                            dinuc_count_dict[readlen][seq[i : i + 2]] += 1
+                        except:
+                            pass
+
+                    for i in range(len(seq), 0, -1):
+                        dist = i - len(seq)
+                        if dist not in threeprime_nuc_count_dict["mapped"][readlen]:
+                            threeprime_nuc_count_dict["mapped"][readlen][dist] = {
+                                "A": 0,
+                                "T": 0,
+                                "G": 0,
+                                "C": 0,
+                                "N": 0,
+                            }
+                        threeprime_nuc_count_dict["mapped"][readlen][dist][
+                            seq[dist]
+                        ] += 1
+                    ambiguously_mapped_reads += processor(
+                        process_chunk,
+                        master_read_dict,
+                        transcriptome_info_dict,
+                        master_dict,
+                        prev_seq,
+                        unambig_read_length_dict,
+                    )
+                process_chunk = {readname: [[tran, pos, read_tags]]}
+                prev_seq = read.seq
+        else:
+            unmapped_reads += 1
+
+            # Add this unmapped read to unmapped_dict so we can see what the most frequent unmapped read is.
+            seq = read.seq
+            readlen = len(seq)
+            if seq in unmapped_dict:
+                unmapped_dict[seq] += 1
+            else:
+                unmapped_dict[seq] = 1
+
+            # Populate the nuc_count_dict with this unmapped read
+            if readlen not in nuc_count_dict["unmapped"]:
+                nuc_count_dict["unmapped"][readlen] = {}
+            for i in range(0, len(seq)):
+                if i not in nuc_count_dict["unmapped"][readlen]:
+                    nuc_count_dict["unmapped"][readlen][i] = {
+                        "A": 0,
+                        "T": 0,
+                        "G": 0,
+                        "C": 0,
+                        "N": 0,
+                    }
+                nuc_count_dict["unmapped"][readlen][i][seq[i]] += 1
+
+            if readlen not in threeprime_nuc_count_dict["unmapped"]:
+                threeprime_nuc_count_dict["unmapped"][readlen] = {}
+
+            for i in range(len(seq), 0, -1):
+                dist = i - len(seq)
+                if dist not in threeprime_nuc_count_dict["unmapped"][readlen]:
+                    threeprime_nuc_count_dict["unmapped"][readlen][dist] = {
+                        "A": 0,
+                        "T": 0,
+                        "G": 0,
+                        "C": 0,
+                        "N": 0,
+                    }
+                threeprime_nuc_count_dict["unmapped"][readlen][dist][seq[dist]] += 1
+
+    # add stats about mapped/unmapped reads to the master dicts; these will be used for the final report
+    master_dict["total_bam_lines"] = total_bam_lines
+    master_dict["mapped_reads"] = mapped_reads
+    master_dict["unmapped_reads"] = unmapped_reads
+    master_read_dict["unmapped_reads"] = unmapped_reads
+    master_dict["ambiguously_mapped_reads"] = ambiguously_mapped_reads
+
+    if "read_name" in master_read_dict:
+        del master_read_dict["read_name"]
+    print("BAM file processed")
+    print("Creating metagenes, triplet periodicity plots, etc.")
+    for tran in master_read_dict:
+        try:
+            cds_start = transcriptome_info_dict[tran]["cds_start"]
+            cds_stop = transcriptome_info_dict[tran]["cds_stop"]
+        except:
+            continue
+
+        tranlen = transcriptome_info_dict[tran]["length"]
+        # Use this to discard transcripts with no 5' leader or 3' trailer
+        if (
+            cds_start > 1
+            and cds_stop < tranlen
+            and transcriptome_info_dict[tran]["tran_type"] == 1
+        ):
+            for primetype in ["fiveprime", "threeprime"]:
+                # Create the triplet periodicity and metainfo plots based on both the 5' and 3' ends of reads
+                for readlength in master_read_dict[tran]["unambig"]:
+                    # print "readlength", readlength
+                    # for each five prime position of this read length within this transcript
+                    for raw_pos in master_read_dict[tran]["unambig"][readlength]:
+                        # print "raw pos", raw_pos
+                        trip_periodicity_reads += 1
+                        if primetype == "fiveprime":
+                            # get the five prime position minus the cds start position
+                            real_pos = raw_pos - cds_start
+                            rel_stop_pos = raw_pos - cds_stop
+                        elif primetype == "threeprime":
+                            real_pos = (raw_pos + readlength) - cds_start
+                            rel_stop_pos = (raw_pos + readlength) - cds_stop
+                        # get the read count at the raw position
+                        readcount = master_read_dict[tran]["unambig"][readlength][
+                            raw_pos
+                        ]
+                        # print "readcount", readcount
+                        frame = real_pos % 3
+                        if real_pos >= cds_start and real_pos <= cds_stop:
+                            if readlength in master_trip_dict[primetype]:
+                                master_trip_dict[primetype][readlength][
+                                    str(frame)
+                                ] += readcount
+                            else:
+                                master_trip_dict[primetype][readlength] = {
+                                    "0": 0.0,
+                                    "1": 0.0,
+                                    "2": 0.0,
+                                }
+                                master_trip_dict[primetype][readlength][
+                                    str(frame)
+                                ] += readcount
+                        # populate the offset dict with positions relative to cds_start; these will be used for the metagene/metainfo dict
+                        if real_pos > (-600) and real_pos < (601):
+                            if readlength in master_offset_dict[primetype]:
+                                if (
+                                    real_pos
+                                    in master_offset_dict[primetype][readlength]
+                                ):
+                                    # print "real pos in offset dict"
+                                    master_offset_dict[primetype][readlength][
+                                        real_pos
+                                    ] += readcount
+                                else:
+                                    # print "real pos not in offset dict"
+                                    master_offset_dict[primetype][readlength][
+                                        real_pos
+                                    ] = readcount
+                            else:
+                                # initialise with zero to avoid missing neighbours below
+                                # print "initialising with zeros"
+                                master_offset_dict[primetype][readlength] = {}
+                                for i in range(-600, 601):
+                                    master_offset_dict[primetype][readlength][i] = 0
+                                master_offset_dict[primetype][readlength][
+                                    real_pos
+                                ] += readcount
+
+                        # populate the stop metagene dict with positions relative to cds_stop; these will also be used for the metainfo dict
+                        if rel_stop_pos > (-600) and rel_stop_pos < (601):
+                            if readlength in master_metagene_stop_dict[primetype]:
+                                if (
+                                    rel_stop_pos
+                                    in master_metagene_stop_dict[primetype][readlength]
+                                ):
+                                    master_metagene_stop_dict[primetype][readlength][
+                                        rel_stop_pos
+                                    ] += readcount
+                                else:
+                                    master_metagene_stop_dict[primetype][readlength][
+                                        rel_stop_pos
+                                    ] = readcount
+                            else:
+                                # initialise with zero to avoid missing neighbours below
+                                master_metagene_stop_dict[primetype][readlength] = {}
+                                for i in range(-600, 601):
+                                    master_metagene_stop_dict[primetype][readlength][
+                                        i
+                                    ] = 0
+                                master_metagene_stop_dict[primetype][readlength][
+                                    rel_stop_pos
+                                ] += readcount
+
+    # master_trip_dict is now made up of read lengths with 3 frames and a count for each frame
+    # create a 'score' for each read length by dividing the second highest frame count by the max frame count
+    for primetype in ["fiveprime", "threeprime"]:
+        for subreadlength in master_trip_dict[primetype]:
+            maxcount = 0
+            secondmaxcount = 0
+            for frame in master_trip_dict[primetype][subreadlength]:
+                if master_trip_dict[primetype][subreadlength][frame] > maxcount:
+                    maxcount = master_trip_dict[primetype][subreadlength][frame]
+            for frame in master_trip_dict[primetype][subreadlength]:
+                if (
+                    master_trip_dict[primetype][subreadlength][frame] > secondmaxcount
+                    and master_trip_dict[primetype][subreadlength][frame] != maxcount
+                ):
+                    secondmaxcount = master_trip_dict[primetype][subreadlength][frame]
+            # a perfect score is 0 (only a single dominant peak); the worst score is 1 (the two highest peaks are the same height)
+            master_trip_dict[primetype][subreadlength]["score"] = float(
+                secondmaxcount
+            ) / float(maxcount)
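+    # Illustrative example: frame counts {"0": 900, "1": 80, "2": 20} give maxcount 900,
+    # secondmaxcount 80, and a score of 80 / 900 = 0.089 (approximately)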
+    # This part is to determine what offsets to give each read length
+    print("Calculating offsets")
+    for primetype in ["fiveprime", "threeprime"]:
+        for readlen in master_offset_dict[primetype]:
+            accepted_len = False
+            max_relative_pos = 0
+            max_relative_count = 0
+            for relative_pos in master_offset_dict[primetype][readlen]:
+                # Only consider positions whose absolute value is between 10 and (readlen - 10), so the chosen offset is neither too small nor greater than the read length (guards against large peaks far up/downstream)
+                if abs(relative_pos) < 10 or abs(relative_pos) > (readlen - 10):
+                    continue
+                if (
+                    master_offset_dict[primetype][readlen][relative_pos]
+                    > max_relative_count
+                ):
+                    max_relative_pos = relative_pos
+                    max_relative_count = master_offset_dict[primetype][readlen][
+                        relative_pos
+                    ]
+            # print "for readlen {} the max_relative pos is {}".format(readlen, max_relative_pos)
+            if primetype == "fiveprime":
+                # -3 to get from p-site to a-site, +1 to account for 1 based co-ordinates, resulting in -2 overall
+                final_offsets[primetype]["offsets"][readlen] = abs(max_relative_pos - 2)
+            elif primetype == "threeprime":
+                # +3 to get from p-site to a-site, -1 to account for 1 based co-ordinates, resulting in +2 overall
+                final_offsets[primetype]["offsets"][readlen] = (
+                    max_relative_pos * (-1)
+                ) + 2
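+            # Illustrative example: a 5' metagene peak at relative position -12 gives an
+            # offset of abs(-12 - 2) = 14 for that read length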
+            # If there are no reads in CDS regions for a specific length, it may not be present in master_trip_dict
+            if readlen in master_trip_dict[primetype]:
+                final_offsets[primetype]["read_scores"][readlen] = master_trip_dict[
+                    primetype
+                ][readlen]["score"]
+            else:
+                final_offsets[primetype]["read_scores"][readlen] = 0.0
+    master_read_dict["offsets"] = final_offsets
+    master_read_dict["trip_periodicity"] = master_trip_dict
+    master_read_dict["desc"] = "Null"
+    master_read_dict["mapped_reads"] = mapped_reads
+    master_read_dict["nuc_counts"] = nuc_count_dict
+    master_read_dict["dinuc_counts"] = dinuc_count_dict
+    master_read_dict["threeprime_nuc_counts"] = threeprime_nuc_count_dict
+    master_read_dict["metagene_counts"] = master_offset_dict
+    master_read_dict["stop_metagene_counts"] = master_metagene_stop_dict
+    master_read_dict["read_lengths"] = read_length_dict
+    master_read_dict["unambig_read_lengths"] = unambig_read_length_dict
+    master_read_dict["coding_counts"] = master_dict["unambiguous_coding_count"]
+    master_read_dict["noncoding_counts"] = master_dict["unambiguous_non_coding_count"]
+    master_read_dict["ambiguous_counts"] = master_dict["ambiguously_mapped_reads"]
+    master_read_dict["frequent_unmapped_reads"] = (
+        sorted(unmapped_dict.items(), key=operator.itemgetter(1))
+    )[-2000:]
+    master_read_dict["cutadapt_removed"] = 0
+    master_read_dict["rrna_removed"] = 0
+    # If no reads are removed by the -m option there won't be an entry in the log file, so initialise with 0 first and change it if such a line exists
+    master_read_dict["removed_minus_m"] = 0
+    master_dict["removed_minus_m"] = 0
+    # We work out the total counts for the 5' leader, CDS and 3' trailer (for differential translation) here; it would be better to do this in processor but the offsets are needed first
+    master_read_dict["unambiguous_all_totals"] = {}
+    master_read_dict["unambiguous_fiveprime_totals"] = {}
+    master_read_dict["unambiguous_cds_totals"] = {}
+    master_read_dict["unambiguous_threeprime_totals"] = {}
+
+    master_read_dict["ambiguous_all_totals"] = {}
+    master_read_dict["ambiguous_fiveprime_totals"] = {}
+    master_read_dict["ambiguous_cds_totals"] = {}
+    master_read_dict["ambiguous_threeprime_totals"] = {}
+    print("calculating transcript counts")
+    for tran in master_read_dict:
+        if tran in transcriptome_info_dict:
+            five_total = 0
+            cds_total = 0
+            three_total = 0
+
+            ambig_five_total = 0
+            ambig_cds_total = 0
+            ambig_three_total = 0
+
+            cds_start = transcriptome_info_dict[tran]["cds_start"]
+            cds_stop = transcriptome_info_dict[tran]["cds_stop"]
+            for readlen in master_read_dict[tran]["unambig"]:
+                if readlen in final_offsets["fiveprime"]["offsets"]:
+                    offset = final_offsets["fiveprime"]["offsets"][readlen]
+                else:
+                    offset = 15
+                for pos in master_read_dict[tran]["unambig"][readlen]:
+                    real_pos = pos + offset
+                    if real_pos < cds_start:
+                        five_total += master_read_dict[tran]["unambig"][readlen][pos]
+                    elif real_pos >= cds_start and real_pos <= cds_stop:
+                        cds_total += master_read_dict[tran]["unambig"][readlen][pos]
+                    elif real_pos > cds_stop:
+                        three_total += master_read_dict[tran]["unambig"][readlen][pos]
+            master_read_dict["unambiguous_all_totals"][tran] = (
+                five_total + cds_total + three_total
+            )
+            master_read_dict["unambiguous_fiveprime_totals"][tran] = five_total
+            master_read_dict["unambiguous_cds_totals"][tran] = cds_total
+            master_read_dict["unambiguous_threeprime_totals"][tran] = three_total
+
+            for readlen in master_read_dict[tran]["ambig"]:
+                if readlen in final_offsets["fiveprime"]["offsets"]:
+                    offset = final_offsets["fiveprime"]["offsets"][readlen]
+                else:
+                    offset = 15
+                for pos in master_read_dict[tran]["ambig"][readlen]:
+                    real_pos = pos + offset
+                    if real_pos < cds_start:
+                        ambig_five_total += master_read_dict[tran]["ambig"][readlen][
+                            pos
+                        ]
+                    elif real_pos >= cds_start and real_pos <= cds_stop:
+                        ambig_cds_total += master_read_dict[tran]["ambig"][readlen][pos]
+                    elif real_pos > cds_stop:
+                        ambig_three_total += master_read_dict[tran]["ambig"][readlen][
+                            pos
+                        ]
+
+            master_read_dict["ambiguous_all_totals"][tran] = (
+                five_total
+                + cds_total
+                + three_total
+                + ambig_five_total
+                + ambig_cds_total
+                + ambig_three_total
+            )
+            master_read_dict["ambiguous_fiveprime_totals"][tran] = (
+                five_total + ambig_five_total
+            )
+            master_read_dict["ambiguous_cds_totals"][tran] = cds_total + ambig_cds_total
+            master_read_dict["ambiguous_threeprime_totals"][tran] = (
+                three_total + ambig_three_total
+            )
+    print("Writing out to sqlite file")
+    sqlite_db = SqliteDict(outputfile, autocommit=False)
+    for key in master_read_dict:
+        sqlite_db[key] = master_read_dict[key]
+    sqlite_db["description"] = desc
+    sqlite_db.commit()
+    sqlite_db.close()
+
+
+if __name__ == "__main__":
+    if len(sys.argv) <= 2:
+        print(
+            "Usage: python bam_to_sqlite.py <path_to_bam_file> <path_to_organism.sqlite> <file_description (optional)>"
+        )
+        sys.exit()
+    bam_filepath = sys.argv[1]
+    annotation_sqlite_filepath = sys.argv[2]
+    # try:
+    # 	desc = sys.argv[3]
+    # except:
+    # 	desc = bam_filepath.split("/")[-1]
+    outputfile = bam_filepath + "v2.sqlite"
+    process_bam(bam_filepath, annotation_sqlite_filepath, outputfile)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trips_bam_to_sqlite/trips_bam_to_sqlite.xml	Mon Apr 04 09:48:32 2022 +0000
@@ -0,0 +1,37 @@
+<tool id="bam_to_sqlite" name="BAM to Sqlite (TRIPS-Viz)" version="1.0">
+    <description>Convert BAM file to SQLITE for TRIPS-Viz</description>
+    <requirements>
+        <requirement type="package" version="0.19.0">pysam</requirement>
+        <requirement type="package" version="2.0.0">sqlitedict</requirement>
+        <requirement type="package" version="3.37.1">sqlite</requirement>
+    </requirements>
+    <command><![CDATA[
+        python '$__tool_directory__/bam_to_sqlite.py' '$input1' '$input2' '$input3' &&
+        cp '${input1}v2.sqlite' '$output1'
+    ]]></command>
+    <inputs>
+        <param name="input1" type="data" format="bam" label="BAM file" />
+        <param name="input2" type="data" format="sqlite" label="Path to organism SQLITE annotation file" />
+        <param name="input3" type="data" format="text" label="Description of this sample" />
+    </inputs>
+    <outputs>
+       <data name="output1" format="sqlite"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input1" value="test.bam" ftype="bam"/>
+            <param name="input2" value="test.sqlite" ftype="sqlite"/>
+            <param name="input3" value="TEST DESCRIPTION" ftype="text"/>
+            <output name="output1" file="test.bam.sqlite" ftype="sqlite" lines_diff="4" />
+        </test>
+    </tests>
+    <help>
+        **What it does**
+
+        Processes transcriptome read alignments (a name-sorted BAM file) into an SQLITE file for TRIPS-Viz.
+
+        Prerequisites:
+
+        - a BAM file sorted by read name (samtools sort -n)
+        - a TRIPS-Viz annotation file in SQLITE format
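+
+        Example command line (outside Galaxy; file names are illustrative)::
+
+            samtools sort -n -o sample.name_sorted.bam sample.bam
+            python bam_to_sqlite.py sample.name_sorted.bam organism_annotation.sqlite
+
+        The script writes its output next to the input BAM (the input path plus "v2.sqlite").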
+    </help>
+    <citations/>
+</tool>