sam2counts_galaxy_edger.py @ 0:ce3a667012c2 (draft)
Uploaded
| author | nikhil-joshi |
|---|---|
| date | Thu, 22 Jan 2015 03:54:22 -0500 |
| parents | |
| children | |
#!/usr/bin/python

"""
sam2counts_galaxy_edger.py -- take SAM/BAM files and output a table of counts
with column names that are the sample names and row names that are the
reference names.
"""

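# A sketch of the resulting count table, assuming two hypothetical samples s1 and s2
# (group A, treatments ctrl and drug) and the default tab delimiter:
#
#   #         A:ctrl   A:drug
#   #Feature  s1       s2
#   geneA     10       7
#   geneB     0        3
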
VERSION = 0.90

import sys
import csv
from os import path
try:
    import pysam
except ImportError:
    sys.exit("pysam not installed; please install it\n")

import argparse

def SAM_file_to_counts(filename, sname, ftype="sam", extra=False, use_all_references=True):
    """
    Take a SAM filename and create a hash of mapped and unmapped reads;
    keys are reference sequences, values are the counts of occurrences.

    Also, a hash of qualities (either 0 or >0) of mapped reads
    is output, which is handy for diagnostics.
    """
    counts = dict()
    unique = dict()
    nonunique = dict()
    mode = 'r'
    if ftype == "bam":
        mode = 'rb'
    sf = pysam.Samfile(filename, mode)

    if use_all_references:
        # Make dictionary of all entries in header
        try:
            for sn in sf.header['SQ']:
                if extra:
                    unique[sn['SN']] = 0
                    nonunique[sn['SN']] = 0
                counts[sn['SN']] = 0
        except KeyError:
            print "SAM/BAM file for sample " + sname + " does not have a header."

    for read in sf:
        if not read.is_unmapped:
            id_name = sf.getrname(read.rname) if read.rname != -1 else 0

            if not use_all_references and not counts.get(id_name, False):
                ## Only make keys based on aligning reads, make empty hash
                if extra:
                    unique[id_name] = 0
                    nonunique[id_name] = 0
                ## initiate entry; even if not mapped, record 0 count
                counts[id_name] = counts.get(id_name, 0)

            counts[id_name] = counts.get(id_name, 0) + 1

            if extra:
                if read.mapq == 0:
                    nonunique[id_name] = nonunique[id_name] + 1
                else:
                    unique[id_name] = unique[id_name] + 1

    if extra:
        return {'counts':counts, 'unique':unique, 'nonunique':nonunique}

    return {'counts':counts}

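# A minimal sketch of the return value, assuming a hypothetical "sample1.bam" whose
# header lists chr1 and chr2, with three reads mapped to chr1 (one of them at MAPQ 0):
#
#   SAM_file_to_counts("sample1.bam", "sample1", ftype="bam", extra=True)
#   # -> {'counts':    {'chr1': 3, 'chr2': 0},
#   #     'unique':    {'chr1': 2, 'chr2': 0},
#   #     'nonunique': {'chr1': 1, 'chr2': 0}}
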
def collapsed_nested_count_dict(counts_dict, all_ids, order=None):
    """
    Take a nested dictionary `counts_dict` and `all_ids`, and build the
    table dictionary from them. All files (first keys) in `counts_dict`
    are made into columns, with order specified by `order`.

    Output is a dictionary with keys that are the ids (genes or
    transcripts) and values that are ordered counts, plus a header of
    the ordered column names (the sample names).
    """
    if order is None:
        col_order = counts_dict.keys()
    else:
        col_order = order

    collapsed_dict = dict()
    for i, filename in enumerate(col_order):
        for id_name in all_ids:
            if not collapsed_dict.get(id_name, False):
                collapsed_dict[id_name] = list()

            # get counts and append
            c = counts_dict[filename].get(id_name, 0)
            collapsed_dict[id_name].append(c)
    return {'table':collapsed_dict, 'header':col_order}

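# A minimal sketch of the collapse step, assuming two hypothetical samples s1 and s2:
#
#   collapsed_nested_count_dict({'s1': {'geneA': 10, 'geneB': 0},
#                                's2': {'geneA': 7,  'geneB': 3}},
#                               {'geneA', 'geneB'}, order=['s1', 's2'])
#   # -> {'table': {'geneA': [10, 7], 'geneB': [0, 3]}, 'header': ['s1', 's2']}
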
def counts_to_file(table_dict, outfilename, delimiter=','):
    """
    A function for its side effect of writing `table_dict` (which
    contains a table and header) to `outfilename` with the specified
    `delimiter`.
    """
    writer = csv.writer(open(outfilename, 'a'), delimiter=delimiter, lineterminator='\n')
    table = table_dict['table']
    header = table_dict['header']

    #header_row = True
    for id_name, fields in table.items():
        #if header_row:
            #row = ['id'] + header
            #writer.writerow(row)
            #header_row = False

        if id_name == 0:
            continue
        row = [id_name]
        row.extend(fields)
        writer.writerow(row)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='parser for sam2counts')
    parser.add_argument("-d", "--delimiter", help="the delimiter (default: tab)", default='\t')
    parser.add_argument("-o", "--out-file", help="output filename (default: counts.txt)", default='counts.txt')
    parser.add_argument("-u", "--extra-output",
                        help="output extra information on non-unique and unique mappers (default: False)",
                        default=False, action="store_true")
    parser.add_argument("-r", "--use-all-references", dest="use_all_references",
                        help="Use all the references from the SAM header (default: True); passing this flag turns it off",
                        default=True, action="store_false")
    parser.add_argument("-f", "--extra-out-files", dest="extra_out_files",
                        help="comma-delimited filenames of unique and non-unique output "
                             "(default: unique.txt,nonunique.txt)",
                        default='unique.txt,nonunique.txt')
    parser.add_argument("-v", "--verbose", dest="verbose",
                        help="enable verbose output", action="store_true")
    parser.add_argument("--bam-file", help="bam file", nargs="+", action="append", required=True)
    parser.add_argument("--group", help="group", nargs="+", action="append", required=True)
    parser.add_argument("--treatment", help="treatment", nargs="+", action="append", required=True)
    parser.add_argument("--sample-name", help="sample name", nargs="+", action="append", required=True)
    parser.add_argument("--file-type", help="file type", nargs="+", action="append", required=True, choices=['sam','bam'])
    args = parser.parse_args()

    args.bam_file = [item for sublist in args.bam_file for item in sublist]
    args.group = [item for sublist in args.group for item in sublist]
    args.treatment = [item for sublist in args.treatment for item in sublist]
    args.sample_name = [item for sublist in args.sample_name for item in sublist]
    args.file_type = [item for sublist in args.file_type for item in sublist]
    #print(args.sample_name)

    if (len(args.sample_name) != len(set(args.sample_name))):
        parser.error("Sample names must be unique.")

    if not(len(args.bam_file) == len(args.group) and len(args.group) == len(args.treatment) and len(args.treatment) == len(args.sample_name) and len(args.sample_name) == len(args.file_type)):
        parser.error("Number of total BAM files, groups, treatments, sample names, and types must be the same.")

    file_counts = dict()
    file_unique_counts = dict()
    file_nonunique_counts = dict()
    all_ids = list()

    ## do a pre-run check that all files exist
    for full_filename in args.bam_file:
        if not path.exists(full_filename):
            parser.error("file '%s' does not exist" % full_filename)

    outf = open(args.out_file, "w")
    outf.write("#")
    for (g,t) in zip(args.group,args.treatment):
        outf.write("\t" + g + ":" + t)
    outf.write("\n#Feature")
    for s in args.sample_name:
        outf.write("\t" + s)
    outf.write("\n")
    outf.close()

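    # A sketch of the two header lines written above, assuming two hypothetical
    # samples s1 and s2 in group A with treatments ctrl and drug (these fields
    # are tab-separated regardless of --delimiter):
    #
    #   #<TAB>A:ctrl<TAB>A:drug
    #   #Feature<TAB>s1<TAB>s2
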
    for (full_filename, sn, ft) in zip(args.bam_file, args.sample_name, args.file_type):
        ## read in SAM file, extract counts, and unpack counts
        tmp = SAM_file_to_counts(full_filename, sn, ftype=ft, extra=args.extra_output,
                                 use_all_references=args.use_all_references)

        if args.extra_output:
            counts, unique, nonunique = tmp['counts'], tmp['unique'], tmp['nonunique']
        else:
            counts = tmp['counts']

        ## save counts, and unique/non-unique counts
        file_counts[sn] = counts

        if args.extra_output:
            file_unique_counts[sn] = unique
            file_nonunique_counts[sn] = nonunique

        ## add all ids encountered in this file
        all_ids.extend(file_counts[sn].keys())

    ## Uniquify all_ids, and then take the nested file_counts
    ## dictionary, collapse, and write to file.
    all_ids = set(all_ids)
    table_dict = collapsed_nested_count_dict(file_counts, all_ids, order=args.sample_name)
    counts_to_file(table_dict, args.out_file, delimiter=args.delimiter)

    if args.extra_output:
        unique_fn, nonunique_fn = args.extra_out_files.split(',')
        unique_table_dict = collapsed_nested_count_dict(file_unique_counts, all_ids, order=args.sample_name)
        nonunique_table_dict = collapsed_nested_count_dict(file_nonunique_counts, all_ids, order=args.sample_name)

        counts_to_file(unique_table_dict, unique_fn, delimiter=args.delimiter)
        counts_to_file(nonunique_table_dict, nonunique_fn, delimiter=args.delimiter)
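
# Example invocation (a sketch with hypothetical file and sample names; the
# per-sample options --bam-file, --group, --treatment, --sample-name and
# --file-type are repeated once per sample and must all be given the same
# number of times):
#
#   python sam2counts_galaxy_edger.py \
#       --bam-file s1.bam --group A --treatment ctrl --sample-name s1 --file-type bam \
#       --bam-file s2.bam --group A --treatment drug --sample-name s2 --file-type bam \
#       -o counts.txt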
