#!/usr/bin/python

"""
sam2count_galaxy_edger.py -- Take SAM files and output a table of counts with
column names that are the filenames, and row names that are the reference
names.
"""
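
# Illustrative invocation (the BAM files and sample names below are
# hypothetical placeholders, not files shipped with this script):
#
#   python sam2count_galaxy_edger.py \
#       --bam-file a.bam --bam-file b.bam \
#       --group g1 --group g2 \
#       --treatment t1 --treatment t2 \
#       --sample-name s1 --sample-name s2 \
#       --file-type bam --file-type bam \
#       -o counts.txt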

VERSION = 0.90

import sys
import csv
import argparse
from os import path

try:
    import pysam
except ImportError:
    sys.exit("pysam not installed; please install it\n")

def SAM_file_to_counts(filename, sname, ftype="sam", extra=False, use_all_references=True):
    """
    Take a SAM filename and create a hash of mapped and unmapped reads;
    keys are reference sequences, values are the counts of occurrences.

    Also, a hash of qualities (either 0 or >0) of mapped reads
    is output, which is handy for diagnostics.
    """
    counts = dict()
    unique = dict()
    nonunique = dict()
    mode = 'r'
    if ftype == "bam":
        mode = 'rb'
    sf = pysam.Samfile(filename, mode)

    if use_all_references:
        # Make a dictionary of all entries in the header
        try:
            for sn in sf.header['SQ']:
                if extra:
                    unique[sn['SN']] = 0
                    nonunique[sn['SN']] = 0
                counts[sn['SN']] = 0
        except KeyError:
            print("Sample file of sample " + sname + " does not have a header.")

    for read in sf:
        if not read.is_unmapped:
            # reads with no valid reference get the placeholder id 0,
            # which counts_to_file skips when writing
            id_name = sf.getrname(read.rname) if read.rname != -1 else 0

            if not use_all_references and not counts.get(id_name, False):
                ## Only make keys based on aligning reads, make empty hash
                if extra:
                    unique[id_name] = 0
                    nonunique[id_name] = 0
                ## initiate entry; even if not mapped, record 0 count
                counts[id_name] = counts.get(id_name, 0)

            counts[id_name] = counts.get(id_name, 0) + 1

            if extra:
                # MAPQ 0 is treated as a non-unique (multi-mapping) alignment
                if read.mapq == 0:
                    nonunique[id_name] = nonunique[id_name] + 1
                else:
                    unique[id_name] = unique[id_name] + 1

    if extra:
        return {'counts': counts, 'unique': unique, 'nonunique': nonunique}

    return {'counts': counts}
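
# A minimal usage sketch (hypothetical filename; any SAM/BAM that pysam can
# open works the same way):
#
#   res = SAM_file_to_counts("sample1.bam", "sample1", ftype="bam", extra=True)
#   res['counts']     # {reference_name: total mapped reads}
#   res['unique']     # alignments with MAPQ > 0, per reference
#   res['nonunique']  # alignments with MAPQ == 0, per reference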

def collapsed_nested_count_dict(counts_dict, all_ids, order=None):
    """
    Takes a nested dictionary `counts_dict` and `all_ids`, which is
    built with the `table_dict`. All files (first keys) in
    `counts_dict` are made into columns with order specified by
    `order`.

    Output is a dictionary with keys that are the ids (genes or
    transcripts), with values that are ordered counts. A header will
    be created on the first row from the ordered columns (extracted
    from filenames).
    """
    if order is None:
        col_order = counts_dict.keys()
    else:
        col_order = order

    collapsed_dict = dict()
    for filename in col_order:
        for id_name in all_ids:
            if not collapsed_dict.get(id_name, False):
                collapsed_dict[id_name] = list()

            # get this file's count for this id and append (0 if absent)
            c = counts_dict[filename].get(id_name, 0)
            collapsed_dict[id_name].append(c)
    return {'table': collapsed_dict, 'header': col_order}
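
# Worked example (hypothetical data): with
#   counts_dict = {'s1': {'geneA': 3}, 's2': {'geneA': 5, 'geneB': 1}}
#   all_ids = {'geneA', 'geneB'}, order = ['s1', 's2']
# the result is
#   {'table': {'geneA': [3, 5], 'geneB': [0, 1]}, 'header': ['s1', 's2']}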

def counts_to_file(table_dict, outfilename, delimiter=','):
    """
    A function for its side effect of writing `table_dict` (which
    contains a table and header) to `outfilename` with the specified
    `delimiter`.
    """
    writer = csv.writer(open(outfilename, 'a'), delimiter=delimiter, lineterminator='\n')
    table = table_dict['table']
    header = table_dict['header']

    # the header lines are written by the caller (see __main__ below),
    # so only data rows are appended here
    for id_name, fields in table.items():
        # skip the 0 placeholder used for reads without a reference name
        if id_name == 0:
            continue
        row = [id_name]
        row.extend(fields)
        writer.writerow(row)
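
# For instance, a table entry {'geneA': [3, 5]} with a tab delimiter is
# appended to the file as the line: geneA\t3\t5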

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='parser for sam2counts')
    parser.add_argument("-d", "--delimiter", help="the delimiter (default: tab)", default='\t')
    parser.add_argument("-o", "--out-file", help="output filename (default: counts.txt)",
                        default='counts.txt')
    parser.add_argument("-u", "--extra-output", dest="extra_output",
                        help="output extra information on non-unique and unique mappers "
                             "(default: False)",
                        default=False, action="store_true")
    parser.add_argument("-r", "--use-all-references", dest="use_all_references",
                        help="use all references from the SAM header; pass this flag to "
                             "count only references with aligned reads (default: True)",
                        default=True, action="store_false")
    parser.add_argument("-f", "--extra-out-files", dest="extra_out_files",
                        help="comma-delimited filenames of unique and non-unique output "
                             "(default: unique.txt,nonunique.txt)",
                        default='unique.txt,nonunique.txt')
    parser.add_argument("-v", "--verbose", dest="verbose",
                        help="enable verbose output", default=False, action="store_true")
    parser.add_argument("--bam-file", help="SAM/BAM file", nargs="+", action="append", required=True)
    parser.add_argument("--group", help="group", nargs="+", action="append", required=True)
    parser.add_argument("--treatment", help="treatment", nargs="+", action="append", required=True)
    parser.add_argument("--sample-name", help="sample name", nargs="+", action="append", required=True)
    parser.add_argument("--file-type", help="file type", nargs="+", action="append", required=True,
                        choices=['sam', 'bam'])
    args = parser.parse_args()
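
    # argparse with nargs="+" plus action="append" yields nested lists,
    # e.g. --bam-file a.bam --bam-file b.bam -> [['a.bam'], ['b.bam']],
    # so flatten each argument into a single flat list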
    args.bam_file = [item for sublist in args.bam_file for item in sublist]
    args.group = [item for sublist in args.group for item in sublist]
    args.treatment = [item for sublist in args.treatment for item in sublist]
    args.sample_name = [item for sublist in args.sample_name for item in sublist]
    args.file_type = [item for sublist in args.file_type for item in sublist]

    if len(args.sample_name) != len(set(args.sample_name)):
        parser.error("Sample names must be unique.")

    if not (len(args.bam_file) == len(args.group) == len(args.treatment)
            == len(args.sample_name) == len(args.file_type)):
        parser.error("Number of total BAM files, groups, treatments, sample names, "
                     "and file types must be the same.")

    file_counts = dict()
    file_unique_counts = dict()
    file_nonunique_counts = dict()
    all_ids = list()

    ## do a pre-run check that all files exist
    for full_filename in args.bam_file:
        if not path.exists(full_filename):
            parser.error("file '%s' does not exist" % full_filename)

    ## write the two comment header lines: group:treatment pairs, then
    ## the "#Feature" line with the sample names; data rows are appended
    ## later by counts_to_file
    outf = open(args.out_file, "w")
    outf.write("#")
    for (g, t) in zip(args.group, args.treatment):
        outf.write("\t" + g + ":" + t)
    outf.write("\n#Feature")
    for s in args.sample_name:
        outf.write("\t" + s)
    outf.write("\n")
    outf.close()
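
    # With (hypothetical) groups g1,g2, treatments t1,t2 and samples s1,s2,
    # the header written above is:
    #   #\tg1:t1\tg2:t2
    #   #Feature\ts1\ts2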

    for (full_filename, sn, ft) in zip(args.bam_file, args.sample_name, args.file_type):
        ## read in SAM file, extract counts, and unpack counts
        tmp = SAM_file_to_counts(full_filename, sn, ftype=ft, extra=args.extra_output,
                                 use_all_references=args.use_all_references)

        if args.extra_output:
            counts, unique, nonunique = tmp['counts'], tmp['unique'], tmp['nonunique']
        else:
            counts = tmp['counts']

        ## save counts, and unique/non-unique counts
        file_counts[sn] = counts

        if args.extra_output:
            file_unique_counts[sn] = unique
            file_nonunique_counts[sn] = nonunique

        ## add all ids encountered in this file
        all_ids.extend(file_counts[sn].keys())

    ## Uniquify all_ids, and then take the nested file_counts
    ## dictionary, collapse, and write to file.
    all_ids = set(all_ids)
    table_dict = collapsed_nested_count_dict(file_counts, all_ids, order=args.sample_name)
    counts_to_file(table_dict, args.out_file, delimiter=args.delimiter)

    if args.extra_output:
        unique_fn, nonunique_fn = args.extra_out_files.split(',')
        # order by sample name, consistent with the main counts table
        unique_table_dict = collapsed_nested_count_dict(file_unique_counts, all_ids,
                                                        order=args.sample_name)
        nonunique_table_dict = collapsed_nested_count_dict(file_nonunique_counts, all_ids,
                                                           order=args.sample_name)

        counts_to_file(unique_table_dict, unique_fn, delimiter=args.delimiter)
        counts_to_file(nonunique_table_dict, nonunique_fn, delimiter=args.delimiter)