# HG changeset patch
# User nikhil-joshi
# Date 1421916862 18000
# Node ID ce3a667012c2579642e2c2fcb32cedb898bcb5e4
Uploaded

diff -r 000000000000 -r ce3a667012c2 sam2counts_galaxy_edger.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sam2counts_galaxy_edger.py	Thu Jan 22 03:54:22 2015 -0500
@@ -0,0 +1,213 @@
+#!/usr/bin/python
+
+"""
+sam2counts_galaxy_edger.py -- Take SAM/BAM files and output a table of
+counts in which the column names are the sample names and the row names
+are the reference sequence names.
+"""
+
+VERSION = 0.90
+
+import sys
+import csv
+from os import path
+try:
+    import pysam
+except ImportError:
+    sys.exit("pysam not installed; please install it\n")
+
+import argparse
+
+def SAM_file_to_counts(filename, sname, ftype="sam", extra=False, use_all_references=True):
+    """
+    Take a SAM/BAM filename and create a dict of mapped and unmapped reads;
+    keys are reference sequences, values are the counts of occurrences.
+
+    If `extra` is True, dicts of unique (mapping quality > 0) and
+    non-unique (mapping quality == 0) counts are also returned, which is
+    handy for diagnostics.
+    """
+    counts = dict()
+    unique = dict()
+    nonunique = dict()
+    mode = 'r'
+    if ftype == "bam":
+        mode = 'rb'
+    sf = pysam.Samfile(filename, mode)
+
+    if use_all_references:
+        ## Make a dictionary entry for every reference in the header, so
+        ## references without aligned reads are still reported with count 0.
+        try:
+            for sn in sf.header['SQ']:
+                if extra:
+                    unique[sn['SN']] = 0
+                    nonunique[sn['SN']] = 0
+                counts[sn['SN']] = 0
+        except KeyError:
+            print("Sample file of sample " + sname + " does not have a header.")
+
+    for read in sf:
+        if not read.is_unmapped:
+            id_name = sf.getrname(read.rname) if read.rname != -1 else 0
+
+            if not use_all_references and not counts.get(id_name, False):
+                ## Only make keys based on aligning reads; initiate the
+                ## entry with a 0 count.
+                if extra:
+                    unique[id_name] = 0
+                    nonunique[id_name] = 0
+                counts[id_name] = counts.get(id_name, 0)
+
+            counts[id_name] = counts.get(id_name, 0) + 1
+
+            if extra:
+                if read.mapq == 0:
+                    nonunique[id_name] = nonunique[id_name] + 1
+                else:
+                    unique[id_name] = unique[id_name] + 1
+
+    if extra:
+        return {'counts':counts, 'unique':unique, 'nonunique':nonunique}
+
+    return {'counts':counts}
+
+def collapsed_nested_count_dict(counts_dict, all_ids, order=None):
+    """
+    Take a nested dictionary `counts_dict` (sample name -> counts dict,
+    as built above) and `all_ids`, the set of all reference ids seen
+    across samples. All samples (first-level keys) in `counts_dict` are
+    made into columns, with column order specified by `order`.
+
+    Output is a dictionary with keys that are the ids (genes or
+    transcripts) and values that are the ordered counts, together with
+    the ordered column names as a header.
+    """
+    if order is None:
+        col_order = counts_dict.keys()
+    else:
+        col_order = order
+
+    collapsed_dict = dict()
+    for filename in col_order:
+        for id_name in all_ids:
+            if not collapsed_dict.get(id_name, False):
+                collapsed_dict[id_name] = list()
+
+            ## get counts and append
+            c = counts_dict[filename].get(id_name, 0)
+            collapsed_dict[id_name].append(c)
+    return {'table':collapsed_dict, 'header':col_order}
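+
+## Example (illustrative only): for counts_dict
+##   {'s1': {'geneA': 3, 'geneB': 0}, 's2': {'geneA': 5, 'geneB': 2}}
+## with all_ids {'geneA', 'geneB'} and order ['s1', 's2'], the result is
+##   {'table': {'geneA': [3, 5], 'geneB': [0, 2]}, 'header': ['s1', 's2']}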
+ """ + writer = csv.writer(open(outfilename, 'a'), delimiter=delimiter, lineterminator='\n') + table = table_dict['table'] + header = table_dict['header'] + + #header_row = True + for id_name, fields in table.items(): + #if header_row: + #row = ['id'] + header + #writer.writerow(row) + #header_row = False + + if id_name == 0: + continue + row = [id_name] + row.extend(fields) + writer.writerow(row) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='parser for sam2counts') + parser.add_argument("-d", "--delimiter", help="the delimiter (default: tab)", default='\t') + parser.add_argument("-o", "--out-file", help="output filename (default: counts.txt)", default='counts.txt') + parser.add_argument("-u", "--extra-output", help="output extra information on non-unique and unique mappers (default: False)") + parser.add_argument("-r", "--use-all-references", dest="use_all_references", + help="Use all the references from the SAM header (default: True)", + default=True, action="store_false") + parser.add_argument("-f", "--extra-out-files", dest="extra_out_files", + help="comma-delimited filenames of unique and non-unique output " + "(default: unique.txt,nonunique.txt)", + default='unique.txt,nonunique.txt') + parser.add_argument("-v", "--verbose", dest="verbose", + help="enable verbose output") + parser.add_argument("--bam-file", help="bam file", nargs="+", action="append", required=True) + parser.add_argument("--group", help="group", nargs="+", action="append", required=True) + parser.add_argument("--treatment", help="treatment", nargs="+", action="append", required=True) + parser.add_argument("--sample-name", help="sample name", nargs="+", action="append", required=True) + parser.add_argument("--file-type", help="file type", nargs="+", action="append", required=True, choices=['sam','bam']) + args = parser.parse_args() + + args.bam_file = [item for sublist in args.bam_file for item in sublist] + args.group = [item for sublist in args.group for item in sublist] + args.treatment = [item for sublist in args.treatment for item in sublist] + args.sample_name = [item for sublist in args.sample_name for item in sublist] + args.file_type = [item for sublist in args.file_type for item in sublist] + #print(args.sample_name) + + if (len(args.sample_name) != len(set(args.sample_name))): + parser.error("Sample names must be unique.") + + if not(len(args.bam_file) == len(args.group) and len(args.group) == len(args.treatment) and len(args.treatment) == len(args.sample_name) and len(args.sample_name) == len(args.file_type)): + parser.error("Number of total BAM files, groups, treatments, sample names, and types must be the same.") + + file_counts = dict() + file_unique_counts = dict() + file_nonunique_counts = dict() + all_ids = list() + + ## do a pre-run check that all files exist + for full_filename in args.bam_file: + if not path.exists(full_filename): + parser.error("file '%s' does not exist" % full_filename) + + outf = open(args.out_file, "w") + outf.write("#") + for (g,t) in zip(args.group,args.treatment): + outf.write("\t" + g + ":" + t) + outf.write("\n#Feature") + for s in args.sample_name: + outf.write("\t" + s) + outf.write("\n") + outf.close() + + for (full_filename, sn, ft) in zip(args.bam_file, args.sample_name, args.file_type): + ## read in SAM file, extract counts, and unpack counts + tmp = SAM_file_to_counts(full_filename, sn, ftype=ft, extra=args.extra_output, + use_all_references=args.use_all_references) + + if args.extra_output: + counts, unique, nonunique = 
+
+    for (full_filename, sn, ft) in zip(args.bam_file, args.sample_name, args.file_type):
+        ## read in SAM file, extract counts, and unpack counts
+        tmp = SAM_file_to_counts(full_filename, sn, ftype=ft, extra=args.extra_output,
+                                 use_all_references=args.use_all_references)
+
+        if args.extra_output:
+            counts, unique, nonunique = tmp['counts'], tmp['unique'], tmp['nonunique']
+        else:
+            counts = tmp['counts']
+
+        ## save counts, and unique/non-unique counts
+        file_counts[sn] = counts
+
+        if args.extra_output:
+            file_unique_counts[sn] = unique
+            file_nonunique_counts[sn] = nonunique
+
+        ## add all ids encountered in this file
+        all_ids.extend(file_counts[sn].keys())
+
+    ## Uniquify all_ids, and then take the nested file_counts
+    ## dictionary, collapse, and write to file.
+    all_ids = set(all_ids)
+    table_dict = collapsed_nested_count_dict(file_counts, all_ids, order=args.sample_name)
+    counts_to_file(table_dict, args.out_file, delimiter=args.delimiter)
+
+    if args.extra_output:
+        unique_fn, nonunique_fn = args.extra_out_files.split(',')
+        unique_table_dict = collapsed_nested_count_dict(file_unique_counts, all_ids, order=args.sample_name)
+        nonunique_table_dict = collapsed_nested_count_dict(file_nonunique_counts, all_ids, order=args.sample_name)
+
+        counts_to_file(unique_table_dict, unique_fn, delimiter=args.delimiter)
+        counts_to_file(nonunique_table_dict, nonunique_fn, delimiter=args.delimiter)
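A minimal sketch of driving the two core functions from Python outside
Galaxy, assuming the patched file is importable as sam2counts_galaxy_edger;
the BAM paths and sample names are placeholders, not part of the patch:

    import sam2counts_galaxy_edger as s2c

    samples = [("s1", "s1.bam"), ("s2", "s2.bam")]  # hypothetical inputs

    file_counts = {}
    all_ids = set()
    for sname, bam in samples:
        result = s2c.SAM_file_to_counts(bam, sname, ftype="bam")
        file_counts[sname] = result['counts']
        all_ids.update(result['counts'].keys())

    table_dict = s2c.collapsed_nested_count_dict(
        file_counts, all_ids, order=[s for s, _ in samples])
    s2c.counts_to_file(table_dict, "counts.txt", delimiter='\t')

Note that counts_to_file appends rows only; the two comment header lines are
written by the script's __main__ block, so they do not appear here.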