annotate allele-counts.py @ 5:31361191d2d2

Uploaded tarball. Version 1.1: Stranded output, slightly different handling of minor allele ties and 0 coverage sites, revised help text, added test datasets.
author nick
date Thu, 12 Sep 2013 11:34:23 -0400
parents 318fdf77aa54
children df3b28364cd2
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
1 #!/usr/bin/python
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
2 # This parses the output of Dan's "Naive Variant Detector" (previously,
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py".
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
4 # Or, to put it briefly,
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
5 # cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
6 #
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
7 # New in this version:
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
8 # Made header line customizable
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
9 # - separate from internal column labels, which are used as dict keys
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
10 import os
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
11 import sys
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
12 import random
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
13 from optparse import OptionParser
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
14
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
15 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias']
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
16 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
17 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
18 USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.csv
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
19 cat variants.vcf | %prog [options] > alleles.csv"""
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
20 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
21 'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False,
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
22 'debug_loc':'', 'seed':''}
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
23 DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
24 EPILOG = """Requirements:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
25 The input VCF must report the variants for each strand.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
26 The variants should be case-sensitive (e.g. all capital base letters).
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
27 Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
28
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
29
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
30 def get_options(defaults, usage, description='', epilog=''):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
31 """Get options, print usage text."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
32
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
33 parser = OptionParser(usage=usage, description=description, epilog=epilog)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
34
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
35 parser.add_option('-i', '--infile', dest='infile',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
36 default=defaults.get('infile'),
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
37 help='Read input VCF data from this file instead of stdin.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
38 parser.add_option('-o', '--outfile', dest='outfile',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
39 default=defaults.get('outfile'),
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
40 help='Print output data to this file instead of stdout.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
41 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
42 default=defaults.get('freq_thres'),
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
43 help=('Frequency threshold for counting alleles, given in percentage: -f 1 '
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
44 +'= 1% frequency. Default is %default%.'))
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
45 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
46 default=defaults.get('covg_thres'),
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
47 help=('Coverage threshold. Each site must be supported by at least this '
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
48 +'many reads on each strand. Otherwise the site will not be printed in '
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
49 +'the output. The default is %default reads per strand.'))
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
50 parser.add_option('-n', '--no-filter', dest='no_filter', action='store_const',
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
51 const=not(defaults.get('no_filter')), default=defaults.get('no_filter'),
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
52 help=('Operate without a frequency threshold or coverage threshold. '
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
53 +'Equivalent to "-c 0 -f 0".'))
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
54 parser.add_option('-H', '--header', dest='print_header', action='store_const',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
55 const=not(defaults.get('print_header')), default=defaults.get('print_header'),
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
56 help=('Print header line. This is a #-commented line with the column '
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
57 +'labels. Off by default.'))
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
58 parser.add_option('-s', '--stranded', dest='stranded', action='store_const',
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
59 const=not(defaults.get('stranded')), default=defaults.get('stranded'),
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
60 help='Report variant counts by strand, in separate columns. Off by default.')
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
61 parser.add_option('-r', '--rand-seed', dest='seed',
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
62 default=defaults.get('seed'), help=('Seed for random number generator.'))
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
63 parser.add_option('-d', '--debug', dest='debug_loc',
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
64 default=defaults.get('debug_loc'),
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
65 help=('Turn on debug mode and specify a single site to process using UCSC '
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
66 +'coordinate format. You can also append a sample ID after another ":" '
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
67 +'to restrict it further.'))
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
68
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
69 (options, args) = parser.parse_args()
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
70
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
71 return (options, args)
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
72
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
73
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
74 def main():
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
75
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
76 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
77
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
78 infile = options.infile
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
79 outfile = options.outfile
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
80 print_header = options.print_header
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
81 freq_thres = options.freq_thres / 100.0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
82 covg_thres = options.covg_thres
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
83 stranded = options.stranded
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
84 debug_loc = options.debug_loc
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
85 seed = options.seed
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
86
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
87 if options.no_filter:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
88 freq_thres = 0
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
89 covg_thres = 0
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
90
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
91 if seed:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
92 random.seed(seed)
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
93
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
94 debug = False
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
95 print_sample = ''
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
96 if debug_loc:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
97 debug = True
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
98 coords = debug_loc.split(':')
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
99 print_chr = coords[0]
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
100 print_pos = ''
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
101 if len(coords) > 1: print_pos = coords[1]
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
102 if len(coords) > 2: print_sample = coords[2]
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
103
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
104 # set infile_handle to either stdin or the input file
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
105 if infile == OPT_DEFAULTS.get('infile'):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
106 infile_handle = sys.stdin
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
107 sys.stderr.write("Reading from standard input..\n")
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
108 else:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
109 if os.path.exists(infile):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
110 infile_handle = open(infile, 'r')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
111 else:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
112 fail('Error: Input VCF file '+infile+' not found.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
113
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
114 # set outfile_handle to either stdout or the output file
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
115 if outfile == OPT_DEFAULTS.get('outfile'):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
116 outfile_handle = sys.stdout
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
117 else:
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
118 try:
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
119 outfile_handle = open(outfile, 'w')
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
120 except IOError, e:
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
121 fail('Error: The given output filename '+outfile+' could not be opened.')
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
122
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
123 # Take care of column names, print header
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
124 if len(COLUMNS) != len(COLUMN_LABELS):
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
125 fail('Error: Internal column names list do not match column labels list.')
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
126 if stranded:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
127 COLUMNS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T']
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
128 COLUMN_LABELS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T']
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
129 if print_header:
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
130 outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n")
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
131
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
132 # main loop
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
133 # each iteration processes one VCF line and prints one or more output lines
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
134 # one VCF line = one site, one or more samples
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
135 # one output line = one site, one sample
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
136 sample_names = []
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
137 for line in infile_handle:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
138 line = line.rstrip('\r\n')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
139
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
140 # header lines
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
141 if line[0] == '#':
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
142 if line[0:6].upper() == '#CHROM':
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
143 sample_names = line.split('\t')[9:]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
144 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
145
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
146 if not sample_names:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
147 fail("Error in input VCF: Data line encountered before header line. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
148 +"Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
149
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
150 site_data = read_site(line, sample_names, CANONICAL_VARIANTS)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
151
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
152 if debug:
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
153 if print_pos != site_data['pos']:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
154 continue
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
155 if print_chr != site_data['chr'] and print_chr != '':
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
156 continue
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
157 if print_sample != '':
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
158 for sample in site_data['samples'].keys():
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
159 if sample.lower() != print_sample.lower():
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
160 site_data['samples'].pop(sample, None)
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
161 if len(site_data['samples']) == 0:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
162 sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n")
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
163 sys.exit(1)
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
164
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
165
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
166 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS,
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
167 freq_thres, covg_thres, stranded, debug=debug)
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
168
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
169 if debug and site_summary[0]['print']:
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
170 print line.split('\t')[9].split(':')[-1]
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
171
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
172 print_site(outfile_handle, site_summary, COLUMNS)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
173
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
174 # close any open filehandles
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
175 if infile_handle is not sys.stdin:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
176 infile_handle.close()
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
177 if outfile_handle is not sys.stdout:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
178 outfile_handle.close()
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
179
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
180 # keeps Galaxy from giving an error if there were messages on stderr
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
181 sys.exit(0)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
182
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
183
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
184
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
185 def read_site(line, sample_names, canonical):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
186 """Read in a line, parse the variants into a data structure, and return it.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
187 The line should be actual site data, not a header line, so check beforehand.
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
188 Only the variants in 'canonical' will be read; all others are ignored.
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
189 Note: the line is assumed to have been chomped.
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
190 The returned data is stored in a dict, with the following structure:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
191 {
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
192 'chr': 'chr1',
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
193 'pos': '2617',
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
194 'samples': {
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
195 'THYROID': {
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
196 '+A': 32,
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
197 '-A': 45,
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
198 '-G': 1,
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
199 },
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
200 'BLOOD': {
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
201 '+A': 2,
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
202 '-C': 1,
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
203 '+G': 37,
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
204 '-G': 42,
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
205 },
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
206 },
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
207 }
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
208 """
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
209
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
210 site = {}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
211 fields = line.split('\t')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
212
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
213 if len(fields) < 9:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
214 fail("Error in input VCF: wrong number of fields in data line. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
215 +"Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
216
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
217 site['chr'] = fields[0]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
218 site['pos'] = fields[1]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
219 samples = fields[9:]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
220
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
221 if len(samples) < len(sample_names):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
222 fail("Error in input VCF: missing sample fields in data line. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
223 +"Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
224 elif len(samples) > len(sample_names):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
225 fail("Error in input VCF: more sample fields in data line than in header. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
226 +"Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
227
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
228 sample_counts = {}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
229 for i in range(len(samples)):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
230
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
231 variant_counts = {}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
232 counts = samples[i].split(':')[-1]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
233 counts = counts.split(',')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
234
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
235 for count in counts:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
236 if not count:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
237 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
238 fields = count.split('=')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
239 if len(fields) != 2:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
240 fail("Error in input VCF: Incorrect variant data format (must contain "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
241 +"a single '='). Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
242 (variant, reads) = fields
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
243 if variant[1:] not in canonical:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
244 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
245 if variant[0] != '-' and variant[0] != '+':
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
246 fail("Error in input VCF: variant data not strand-specific. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
247 +"Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
248 try:
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
249 variant_counts[variant] = int(float(reads))
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
250 except ValueError, e:
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
251 fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line)
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
252
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
253 sample_counts[sample_names[i]] = variant_counts
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
254
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
255 site['samples'] = sample_counts
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
256
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
257 return site
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
258
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
259
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
260 def summarize_site(site, sample_names, canonical, freq_thres, covg_thres,
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
261 stranded=False, debug=False):
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
262 """Take the raw data from the VCF line and transform it into the summary data
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
263 to be printed in the output format."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
264
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
265 site_summary = []
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
266 for sample_name in sample_names:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
267
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
268 sample = {'print':False}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
269 variants = site['samples'].get(sample_name)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
270
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
271 sample['sample'] = sample_name
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
272 sample['chr'] = site['chr']
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
273 sample['pos'] = site['pos']
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
274
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
275 coverage = sum(variants.values())
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
276
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
277 # get stranded coverage
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
278 covg_plus = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
279 covg_minus = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
280 for variant in variants:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
281 if variant[0] == '+':
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
282 covg_plus += variants[variant]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
283 elif variant[0] == '-':
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
284 covg_minus += variants[variant]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
285 # stranded coverage threshold
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
286 if covg_plus < covg_thres or covg_minus < covg_thres:
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
287 site_summary.append(sample)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
288 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
289 else:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
290 sample['print'] = True
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
291
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
292 # get an ordered list of read counts for all variants (both strands)
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
293 bases = get_read_counts(variants, '+-')
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
294 ranked_bases = process_read_counts(bases, sort=True, debug=debug)
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
295
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
296 # prepare stranded or unstranded lists of base counts
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
297 base_count_lists = []
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
298 if stranded:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
299 strands = ['+', '-']
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
300 base_count_lists.append(get_read_counts(variants, '+'))
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
301 base_count_lists.append(get_read_counts(variants, '-'))
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
302 else:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
303 strands = ['']
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
304 base_count_lists.append(ranked_bases)
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
305
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
306 # record read counts into output dict
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
307 # If stranded, this will loop twice, once for each strand, and prepend '+'
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
308 # or '-' to the base name. If not stranded, it will loop once, and prepend
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
309 # nothing ('').
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
310 for (strand, base_count_list) in zip(strands, base_count_lists):
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
311 for base_count in base_count_list:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
312 sample[strand+base_count[0]] = base_count[1]
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
313 # fill in any zeros
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
314 for base in canonical:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
315 if not sample.has_key(strand+base):
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
316 sample[strand+base] = 0
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
317
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
318 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug)
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
319
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
320 # If there's a tie for 2nd, randomly choose one to be 2nd
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
321 if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]:
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
322 swap = random.choice([True,False])
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
323 if swap:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
324 tmp_base = ranked_bases[1]
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
325 ranked_bases[1] = ranked_bases[2]
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
326 ranked_bases[2] = tmp_base
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
327
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
328 if debug: print "ranked +-: "+str(ranked_bases)
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
329
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
330 sample['coverage'] = coverage
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
331 try:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
332 sample['major'] = ranked_bases[0][0]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
333 except IndexError, e:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
334 sample['major'] = '.'
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
335 try:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
336 sample['minor'] = ranked_bases[1][0]
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
337 sample['freq'] = round(ranked_bases[1][1]/float(coverage), 5)
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
338 except IndexError, e:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
339 sample['minor'] = '.'
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
340 sample['freq'] = 0.0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
341
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
342 site_summary.append(sample)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
343
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
344 return site_summary
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
345
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
346
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
347 def get_read_counts(stranded_counts, strands='+-'):
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
348 """Do a simple sum of the read counts per variant, on the specified strands.
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
349 Arguments:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
350 stranded_counts: Dict of the stranded variants (keys) and their read counts
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
351 (values).
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
352 strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default).
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
353 Return value:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
354 summed_counts: A list of the alleles and their read counts. The elements are
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
355 tuples (variant, read count)."""
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
356
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
357 variants = stranded_counts.keys()
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
358
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
359 summed_counts = {}
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
360 for variant in variants:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
361 strand = variant[0]
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
362 base = variant[1:]
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
363 if strand in strands:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
364 summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0)
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
365
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
366 return summed_counts.items()
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
367
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
368
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
369 def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False):
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
370 """Process a list of read counts by frequency filtering and/or sorting.
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
371 Arguments:
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
372 variant_counts: List of the non-stranded variants and their read counts. The
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
373 elements are tuples (variant, read count).
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
374 freq_thres: The frequency threshold each allele needs to pass to be included.
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
375 sort: Whether to sort the list in descending order of read counts.
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
376 Return value:
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
377 variant_counts: A list of the processed alleles and their read counts. The
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
378 elements are tuples (variant, read count)."""
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
379
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
380 # get coverage for the specified strands
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
381 coverage = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
382 for variant in variant_counts:
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
383 coverage += variant[1]
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
384
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
385 if coverage <= 0:
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
386 return []
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
387
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
388 # sort the list of alleles by read count
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
389 if sort:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
390 variant_counts.sort(reverse=True, key=lambda variant: variant[1])
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
391
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
392 if debug:
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
393 print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
394 for variant in variant_counts:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
395 print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
396 str(variant[1]/float(coverage)))
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
397
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
398 # remove bases below the frequency threshold
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
399 if freq_thres > 0:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
400 variant_counts = [variant for variant in variant_counts
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
401 if variant[1]/float(coverage) >= freq_thres]
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
402
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
403 return variant_counts
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
404
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
405
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
406 def count_alleles(variant_counts, freq_thres, debug=False):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
407 """Determine how many alleles to report, based on filtering rules.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
408 The current rule determines which bases pass the frequency threshold on each
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
409 strand individually, then compares the two sets of bases. If they are the same
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
410 (regardless of order), the allele count is the number of bases. Otherwise it
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
411 is zero."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
412 allele_count = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
413
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
414 alleles_plus = get_read_counts(variant_counts, '+')
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
415 alleles_plus = process_read_counts(alleles_plus, freq_thres=freq_thres,
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
416 sort=False, debug=debug)
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
417 alleles_minus = get_read_counts(variant_counts, '-')
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
418 alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres,
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
419 sort=False, debug=debug)
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
420
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
421 if debug:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
422 print '+ '+str(alleles_plus)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
423 print '- '+str(alleles_minus)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
424
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
425 # Check if each strand reports the same set of alleles.
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
426 # Sorting by base is to compare lists without regard to order (as sets).
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
427 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]])
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
428 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]])
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
429 if alleles_plus_sorted == alleles_minus_sorted:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
430 allele_count = len(alleles_plus)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
431
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
432 return allele_count
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
433
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
434
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
435 def print_site(filehandle, site, columns):
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
436 """Print the output lines for one site (one per sample).
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
437 filehandle must be open."""
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
438 for sample in site:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
439 if sample['print']:
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
440 fields = [str(sample.get(column)) for column in columns]
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
441 filehandle.write('\t'.join(fields)+"\n")
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
442
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
443
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
444 def fail(message):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
445 sys.stderr.write(message+'\n')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
446 sys.exit(1)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
447
5
31361191d2d2 Uploaded tarball.
nick
parents: 2
diff changeset
448
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
449 if __name__ == "__main__":
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
450 main()