Mercurial repo nikhil-joshi/deseq_and_sam2counts: deseq/sam2counts_galaxy.py @ 0:d7f27b43b8ff (draft, "Uploaded")

author: nikhil-joshi
date:   Thu, 05 Jul 2012 21:02:43 -0400
#!/usr/bin/env python

"""
count.py -- Take SAM files and output a table of counts with column
names that are the filenames, and row names that are the reference
names.

Author: Vince Buffalo
Email: vsbuffaloAAAAA@gmail.com (with poly-A tail removed)
"""

VERSION = 0.91
import sys
import csv
from os import path
try:
    import pysam
except ImportError:
    sys.exit("pysam not installed; please install it\n")

from optparse import OptionParser

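# Note: this script targets the pysam API of its era (2012). It relies on
# pysam.Samfile, Samfile.getrname(), read.rname, and read.mapq; later
# pysam releases renamed these to AlignmentFile, get_reference_name(),
# reference_id, and mapping_quality, so under a modern pysam the old
# names may raise deprecation warnings or AttributeError.
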
def SAM_file_to_counts(filename, bam=False, extra=False, use_all_references=True):
    """
    Take a SAM filename, and create a hash of mapped and unmapped reads;
    keys are reference sequences, values are the counts of occurrences.

    Also, a hash of qualities (either 0 or >0) of mapped reads
    is output, which is handy for diagnostics.
    """
    counts = dict()
    unique = dict()
    nonunique = dict()
    mode = 'r'
    if bam:
        mode = 'rb'
    sf = pysam.Samfile(filename, mode)

    if use_all_references:
        # Make dictionary of all entries in header
        for sn in sf.header['SQ']:
            if extra:
                unique[sn['SN']] = 0
                nonunique[sn['SN']] = 0
            counts[sn['SN']] = 0

    for read in sf:
        if not read.is_unmapped:
            id_name = sf.getrname(read.rname) if read.rname != -1 else 0

            if not use_all_references and not counts.get(id_name, False):
                ## Only make keys based on aligning reads, make empty hash
                if extra:
                    unique[id_name] = 0
                    nonunique[id_name] = 0
                ## initiate entry; even if not mapped, record 0 count
                counts[id_name] = counts.get(id_name, 0)

            counts[id_name] = counts.get(id_name, 0) + 1

            if extra:
                if read.mapq == 0:
                    nonunique[id_name] = nonunique[id_name] + 1
                else:
                    unique[id_name] = unique[id_name] + 1

    if extra:
        return {'counts':counts, 'unique':unique, 'nonunique':nonunique}

    return {'counts':counts}

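# Example (hypothetical filenames and references): for a SAM file whose
# reads align to chr1 and chr2, SAM_file_to_counts("sample.sam") might
# return {'counts': {'chr1': 120, 'chr2': 45}}; with extra=True, 'unique'
# and 'nonunique' dicts of the same shape (split on mapq == 0) are
# returned alongside the raw counts.
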
def collapsed_nested_count_dict(counts_dict, all_ids, order=None):
    """
    Take a nested dictionary `counts_dict` (mapping filename -> {id ->
    count}) and `all_ids`, the set of all ids seen across files. All
    files (first-level keys) in `counts_dict` are made into columns,
    with order specified by `order`.

    Output is a dictionary with keys that are the ids (genes or
    transcripts), with values that are ordered counts. A header will
    be created on the first row from the ordered columns (extracted
    from filenames).
    """
    if order is None:
        col_order = counts_dict.keys()
    else:
        col_order = order

    collapsed_dict = dict()
    for filename in col_order:
        for id_name in all_ids:
            if not collapsed_dict.get(id_name, False):
                collapsed_dict[id_name] = list()

            # get counts and append
            c = counts_dict[filename].get(id_name, 0)
            collapsed_dict[id_name].append(c)
    return {'table':collapsed_dict, 'header':col_order}

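# Example (hypothetical data):
#   counts_dict = {'a.sam': {'chr1': 10}, 'b.sam': {'chr1': 7, 'chr2': 3}}
#   collapsed_nested_count_dict(counts_dict, set(['chr1', 'chr2']),
#                               order=['a.sam', 'b.sam'])
#   => {'table': {'chr1': [10, 7], 'chr2': [0, 3]},
#       'header': ['a.sam', 'b.sam']}
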
def counts_to_file(table_dict, outfilename, delimiter=',', labels=''):
    """
    Write `table_dict` (which contains a table and header) to
    `outfilename` with the specified `delimiter`.
    """
    with open(outfilename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter=delimiter)
        table = table_dict['table']
        if labels:
            header = labels.split(',')
        else:
            header = table_dict['header']

        header_row = True
        for id_name, fields in table.items():
            if header_row:
                row = ['id'] + header
                writer.writerow(row)
                header_row = False

            # id 0 is the sentinel for mapped reads with no reference name; skip it
            if id_name == 0:
                continue
            row = [id_name]
            row.extend(fields)
            writer.writerow(row)

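# With the default tab delimiter and two inputs a.sam and b.sam, the
# output file would look roughly like this (values are placeholders):
#   id      a.sam   b.sam
#   chr1    10      7
#   chr2    0       3
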
if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option("-d", "--delimiter", dest="delimiter",
                      help="the delimiter (default: tab)", default='\t')
    parser.add_option("-o", "--out-file", dest="out_file",
                      help="output filename (default: counts.txt)",
                      default='counts.txt', action="store", type="string")
    parser.add_option("-u", "--extra-output", dest="extra_out",
                      help="output extra information on non-unique and unique mappers (default: False)",
                      default=False, action="store_true")
    parser.add_option("-b", "--bam", dest="bam",
                      help="all input files are BAM (default: False)",
                      default=False, action="store_true")
    parser.add_option("-r", "--use-all-references", dest="use_all_references",
                      help="do NOT use all references from the SAM header; "
                      "count only references with aligned reads (default: use all)",
                      default=True, action="store_false")
    parser.add_option("-f", "--extra-out-files", dest="extra_out_files",
                      help="comma-delimited filenames of unique and non-unique output "
                      "(default: unique.txt,nonunique.txt)",
                      default='unique.txt,nonunique.txt', action="store", type="string")
    parser.add_option("-v", "--verbose", dest="verbose",
                      help="enable verbose output",
                      default=False, action="store_true")
    parser.add_option("-l", "--columns-labels", dest="col_labels",
                      help="comma-delimited label names for samples",
                      action="store", type="string")

    (options, args) = parser.parse_args()

    if len(args) < 1:
        parser.error("one or more SAM files as arguments required")

    file_counts = dict()
    file_unique_counts = dict()
    file_nonunique_counts = dict()
    all_ids = list()
    files = [path.basename(f) for f in args]

    if options.col_labels and len(files) != len(options.col_labels.split(',')):
        parser.error("Number of sample names does not equal number of files")

    if len(set(files)) != len(set(args)):
        parser.error("file args must have unique base names (i.e. no foo/bar joo/bar)")

    ## do a pre-run check that all files exist
    for full_filename in args:
        if not path.exists(full_filename):
            parser.error("file '%s' does not exist" % full_filename)

    for full_filename in args:
        filename = path.basename(full_filename)
        ## read in SAM file, extract counts, and unpack counts
        tmp = SAM_file_to_counts(full_filename, bam=options.bam, extra=options.extra_out,
                                 use_all_references=options.use_all_references)

        if options.extra_out:
            counts, unique, nonunique = tmp['counts'], tmp['unique'], tmp['nonunique']
        else:
            counts = tmp['counts']

        ## save counts, and unique/non-unique counts
        file_counts[filename] = counts

        if options.extra_out:
            file_unique_counts[filename] = unique
            file_nonunique_counts[filename] = nonunique

        ## add all ids encountered in this file
        all_ids.extend(file_counts[filename].keys())

    ## Uniquify all_ids, and then take the nested file_counts
    ## dictionary, collapse, and write to file.
    all_ids = set(all_ids)
    table_dict = collapsed_nested_count_dict(file_counts, all_ids, order=files)
    counts_to_file(table_dict, options.out_file, delimiter=options.delimiter,
                   labels=options.col_labels)

    if options.extra_out:
        unique_fn, nonunique_fn = options.extra_out_files.split(',')
        unique_table_dict = collapsed_nested_count_dict(file_unique_counts, all_ids, order=files)
        nonunique_table_dict = collapsed_nested_count_dict(file_nonunique_counts, all_ids, order=files)

        counts_to_file(unique_table_dict, unique_fn, delimiter=options.delimiter)
        counts_to_file(nonunique_table_dict, nonunique_fn, delimiter=options.delimiter)
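
# Example invocation (filenames and labels are placeholders):
#   python sam2counts_galaxy.py -b -o counts.txt -l treated,control \
#       treated.bam control.bam
# This writes a tab-delimited table (the default delimiter) with an 'id'
# column plus one count column per input file, labeled 'treated' and
# 'control'.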