comparison deseq/sam2counts_galaxy.py @ 0:d7f27b43b8ff draft

Uploaded
author nikhil-joshi
date Thu, 05 Jul 2012 21:02:43 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d7f27b43b8ff
1 #!/usr/bin/env python
2
3 """
4 count.py -- Take SAM files and output a table of counts with column
5 names that are the filenames, and rowname that are the reference
6 names.
7
8 Author: Vince Buffalo
9 Email: vsbuffaloAAAAA@gmail.com (with poly-A tail removed)
10 """
11
12 VERSION = 0.91
13
14 import sys
15 import csv
16 from os import path
17 try:
18 import pysam
19 except ImportError:
20 sys.exit("pysam not installed; please install it\n")
21
22 from optparse import OptionParser
23
def SAM_file_to_counts(filename, bam=False, extra=False, use_all_references=True):
    """
    Read a SAM (or BAM, if `bam` is True) file and return per-reference
    read counts as a dict: keys are reference sequence names, values are
    the number of mapped reads hitting each reference.

    When `extra` is True, also tally mapped reads by mapping quality
    (mapq == 0 vs. mapq > 0) into separate 'nonunique'/'unique' dicts,
    which is handy for diagnostics.

    When `use_all_references` is True, every reference in the header's
    SQ lines gets an entry (possibly 0); otherwise entries are created
    only for references that actually receive aligned reads.
    """
    counts, unique, nonunique = {}, {}, {}
    sf = pysam.Samfile(filename, 'rb' if bam else 'r')

    if use_all_references:
        # Pre-seed a zero entry for every reference declared in the header.
        for sq in sf.header['SQ']:
            ref = sq['SN']
            if extra:
                unique[ref] = 0
                nonunique[ref] = 0
            counts[ref] = 0

    for read in sf:
        if read.is_unmapped:
            continue
        # Reads with rname == -1 have no reference; bucket them under 0
        # (this sentinel row is skipped later by counts_to_file).
        ref = sf.getrname(read.rname) if read.rname != -1 else 0

        if not use_all_references and not counts.get(ref, False):
            ## Only make keys based on aligning reads, make empty hash
            if extra:
                unique[ref] = 0
                nonunique[ref] = 0
            ## initiate entry; even if not mapped, record 0 count
            counts[ref] = counts.get(ref, 0)

        counts[ref] = counts.get(ref, 0) + 1

        if extra:
            # mapq == 0 conventionally marks multi-mapping reads
            if read.mapq == 0:
                nonunique[ref] = nonunique[ref] + 1
            else:
                unique[ref] = unique[ref] + 1

    if extra:
        return {'counts': counts, 'unique': unique, 'nonunique': nonunique}

    return {'counts': counts}
73
def collapsed_nested_count_dict(counts_dict, all_ids, order=None):
    """
    Collapse a nested dictionary `counts_dict` (filename -> {id: count})
    into a single table keyed by id.

    All files (first-level keys) in `counts_dict` become columns, in the
    order given by `order` (or `counts_dict`'s own key order when `order`
    is None). Ids missing from a given file get a 0 count.

    Returns {'table': {id: [count-per-file, ...]}, 'header': col_order},
    where 'header' is the ordered list of column (file) names.
    """
    if order is None:
        col_order = list(counts_dict.keys())
    else:
        col_order = order

    collapsed_dict = dict()
    for filename in col_order:
        # Hoist the per-file lookup out of the inner loop.
        per_file_counts = counts_dict[filename]
        for id_name in all_ids:
            # setdefault creates the row on first sight; the original used a
            # falsy .get() check and an unused enumerate() index.
            collapsed_dict.setdefault(id_name, []).append(per_file_counts.get(id_name, 0))

    return {'table': collapsed_dict, 'header': col_order}
101
102
def counts_to_file(table_dict, outfilename, delimiter=',', labels=''):
    """
    Write `table_dict` (a dict with a 'table' mapping id -> ordered counts
    and a 'header' list of column names) to `outfilename` with the
    specified `delimiter`.

    If `labels` is a non-empty comma-delimited string, those labels are
    used as column headers instead of the filename-derived 'header'.

    Rows whose id is 0 (the sentinel bucket for reads with no reference
    name) are skipped.
    """
    if labels:
        header = labels.split(',')
    else:
        header = table_dict['header']

    # The original opened the file without ever closing it (leaked handle,
    # no guaranteed flush) and only emitted the header from inside the row
    # loop, so an empty table produced an empty file. newline='' lets the
    # csv module control line endings, as its docs require.
    with open(outfilename, 'w', newline='') as out:
        writer = csv.writer(out, delimiter=delimiter)
        writer.writerow(['id'] + header)
        for id_name, fields in table_dict['table'].items():
            if id_name == 0:
                continue
            writer.writerow([id_name] + list(fields))
128
if __name__ == '__main__':
    # Command-line entry point: count mapped reads per reference in one or
    # more SAM/BAM files and write a combined count table (one column per
    # input file, one row per reference id).
    parser = OptionParser()
    parser.add_option("-d", "--delimiter", dest="delimiter",
                      help="the delimiter (default: tab)", default='\t')
    parser.add_option("-o", "--out-file", dest="out_file",
                      help="output filename (default: counts.txt)",
                      default='counts.txt', action="store", type="string")
    parser.add_option("-u", "--extra-output", dest="extra_out",
                      help="output extra information on non-unique and unique mappers (default: False)",
                      default=False, action="store_true")
    parser.add_option("-b", "--bam", dest="bam",
                      help="all input files are BAM (default: False)",
                      default=False, action="store_true")
    # NOTE(review): action="store_false" with default=True means passing -r
    # DISABLES header references, despite the help text reading as a
    # positive flag -- confirm this is the intended CLI semantics.
    parser.add_option("-r", "--use-all-references", dest="use_all_references",
                      help="Use all the references from the SAM header (default: True)",
                      default=True, action="store_false")
    parser.add_option("-f", "--extra-out-files", dest="extra_out_files",
                      help="comma-delimited filenames of unique and non-unique output "
                      "(default: unique.txt,nonunique.txt)",
                      default='unique.txt,nonunique.txt', action="store", type="string")
    # NOTE(review): -v has no action/type, so optparse's default "store"
    # applies (it consumes a value), and options.verbose is never read
    # below -- effectively a no-op option.
    parser.add_option("-v", "--verbose", dest="verbose",
                      help="enable verbose output")
    parser.add_option("-l", "--columns-labels", dest="col_labels", help="comma-delimited label names for samples", action="store", type="string")

    (options, args) = parser.parse_args()

    if len(args) < 1:
        parser.error("one or more SAM files as arguments required")

    file_counts = dict()            # basename -> {reference id: count}
    file_unique_counts = dict()     # basename -> {reference id: mapq > 0 count}
    file_nonunique_counts = dict()  # basename -> {reference id: mapq == 0 count}
    all_ids = list()                # every reference id seen in any file
    files = [path.basename(f) for f in args]

    if options.col_labels and len(files) != len(options.col_labels.split(',')):
        parser.error("Number of sample names does not equal number of files")

    # Basenames become column keys in file_counts, so they must be unique
    # across all arguments even when the full paths differ.
    if len(set(files)) != len(set(args)):
        parser.error("file args must have unique base names (i.e. no foo/bar joo/bar)")

    ## do a pre-run check that all files exist
    for full_filename in args:
        if not path.exists(full_filename):
            parser.error("file '%s' does not exist" % full_filename)

    for full_filename in args:
        filename = path.basename(full_filename)
        ## read in SAM file, extract counts, and unpack counts
        tmp = SAM_file_to_counts(full_filename, bam=options.bam, extra=options.extra_out,
                                 use_all_references=options.use_all_references)

        if options.extra_out:
            counts, unique, nonunique = tmp['counts'], tmp['unique'], tmp['nonunique']
        else:
            counts = tmp['counts']

        ## save counts, and unique/non-unique counts
        file_counts[filename] = counts

        if options.extra_out:
            file_unique_counts[filename] = unique
            file_nonunique_counts[filename] = nonunique

        ## add all ids encountered in this in this file
        all_ids.extend(file_counts[filename].keys())

    ## Uniquify all_ids, and then take the nested file_counts
    ## dictionary, collapse, and write to file.
    all_ids = set(all_ids)
    table_dict = collapsed_nested_count_dict(file_counts, all_ids, order=files)
    counts_to_file(table_dict, options.out_file, delimiter=options.delimiter, labels=options.col_labels)

    if options.extra_out:
        # The unique/non-unique tables reuse the same column order but are
        # written without custom labels (filenames become the headers).
        unique_fn, nonunique_fn = options.extra_out_files.split(',')
        unique_table_dict = collapsed_nested_count_dict(file_unique_counts, all_ids, order=files)
        nonunique_table_dict = collapsed_nested_count_dict(file_nonunique_counts, all_ids, order=files)

        counts_to_file(unique_table_dict, unique_fn, delimiter=options.delimiter)
        counts_to_file(nonunique_table_dict, nonunique_fn, delimiter=options.delimiter)
209