8
|
1 #!/usr/bin/env python
|
|
2 # Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)
|
|
3 #
|
|
4 #usage: getalleleseq.py [-h] [-l INT] [-j FILE] [-d DIR] alleles
|
|
5 #
|
|
6 #Given a table with minor and major alleles per position, it generates the
|
|
7 #minor and major allele sequences in FASTA format
|
|
8 #
|
|
9 #positional arguments:
|
|
10 # alleles Table containing minor and major allele base per
|
|
11 # position. cols: [id, chr, pos, A, C, G, T, cvrg,
|
|
12 # plody, major, minor, freq_minor]
|
|
13 #
|
|
14 #optional arguments:
|
|
15 # -h, --help show this help message and exit
|
|
16 # -l INT, --seq-length INT
|
|
17 # Background sequence length. Bases in an artifical
|
|
18 # all-N-sequence of length INT will be replaced by
|
|
19 # either the major or minor allele base accordingly
|
|
20 # -j FILE, --major-seq FILE
|
|
21 # File to write major allele sequences in FASTA multiple
|
|
22 # alignment format.
|
|
23 # -d DIR, --minor-dir DIR
|
|
24 # Per sample minor allele sequences will be written to
|
|
25 # this directory
|
|
26 #
|
|
27 # The expected columns in the alleles table follow Nicholas Stoler's
|
|
28 # Variant Annotator tool format. See Variant Annotator in Galaxy's tool shed
|
|
29 # http://testtoolshed.g2.bx.psu.edu/repos/nick/allele_counts_1 for more details
|
|
30 #
|
|
31 # Expected columns:
|
|
32 # 1. sample_id
|
|
33 # 2. chr
|
|
34 # 3. position
|
|
35 # 4 counts for A's
|
|
36 # 5. counts for C's
|
|
37 # 6. counts for G's
|
|
38 # 7. counts for T's
|
|
39 # (8. counts for a's)
|
|
40 # (9. counts for c's)
|
|
41 # (10. counts for g's)
|
|
42 # (11. counts for t's)
|
|
43 # 8. (12.) Coverage
|
|
44 # 9. (13.) Number of alleles passing a given criteria
|
|
45 # 10. (14.) Major allele
|
|
46 # 11. (15.) Minor allele
|
|
47 # 12. (16.) Minor allele frequency in position
|
|
48
|
|
49 import sys
|
|
50 import os
|
|
51 import argparse
|
|
52
|
|
53 def createseq(sample, allele, seq_size, table):
|
|
54 """Generate major or minor allele sequence"""
|
|
55 out_sequence = ['N' for i in range(seq_size)]
|
|
56 sample_data = [line for line in table if line[0] == sample]
|
|
57
|
|
58 for entry in sample_data:
|
|
59 position = int(entry[2])
|
|
60 if len(entry)==12:
|
|
61 number_of_alleles = int(entry[8])
|
|
62 major_allele = entry[9].strip()
|
|
63 minor_allele = entry[10].strip()
|
|
64 else:
|
|
65 number_of_alleles = int(entry[12])
|
|
66 major_allele = entry[13].strip()
|
|
67 minor_allele = entry[14].strip()
|
|
68
|
|
69 if allele == 'major':
|
|
70 out_sequence[position-1] = major_allele
|
|
71 elif allele == 'minor':
|
|
72 if number_of_alleles >= 2:
|
|
73 out_sequence[position-1] = minor_allele
|
|
74 else:
|
|
75 out_sequence[position-1] = major_allele
|
|
76 return out_sequence
|
|
77
|
|
78 def printseq(sample,allele,seq,output):
|
|
79 """Print out sequence"""
|
|
80 #print >> output, '>{0}_{1}'.format(sample,allele)
|
|
81 print >> output, '>{0}{1}'.format(sample,allele)
|
|
82 for i in range(0,len(seq),70):
|
|
83 print >> output, ''.join(seq[i:i+70])
|
|
84
|
|
85 def main():
|
|
86 parser = argparse.ArgumentParser(description='Given a table with minor and major alleles per position, it generates the minor and major allele sequences in FASTA format', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)')
|
|
87 parser.add_argument('alleles', type=str, help='Table containing minor and major allele base per position. cols: [id, chr, pos, A, C, G, T, cvrg, plody, major, minor, freq_minor] ')
|
|
88 parser.add_argument('-l','--seq-length', type=int, metavar='INT', help='Background sequence length. Bases in an artifical all-N-sequence of length INT will be replaced by either the major or minor allele base accordingly')
|
|
89 parser.add_argument('-j','--major-seq', type=str, metavar='FILE', help='File to write major allele sequences in FASTA multiple alignment format.')
|
|
90 parser.add_argument('-d', '--minor-dir', type=str, metavar='DIR', default='.', help="Per sample minor allele sequences will be written to this directory (Default: current directory)")
|
|
91 parser.add_argument('-p', '--minor-prefix', type=str, metavar='STR', nargs='?', const='', default='', help=argparse.SUPPRESS) #Galaxy compatibility
|
|
92 args = parser.parse_args()
|
|
93
|
|
94
|
|
95 try:
|
|
96 table = [line.strip().split('\t') for line in list(open(args.alleles)) if "#" not in line]
|
|
97 samples = sorted(list(set([ line[0] for line in table ])))
|
|
98 except:
|
|
99 sys.exit('\nERROR: Could not open %s\n' % args.alleles)
|
|
100 try:
|
|
101 major_out = open(args.major_seq, 'w+')
|
|
102 except:
|
|
103 sys.exit('\nCould not create %s\n' % args.major_seq)
|
|
104
|
|
105 # Single file for all major allele sequences in FASTA multiple alignment
|
|
106 for sample in samples:
|
|
107 sequence = createseq(sample,'major',args.seq_length,table)
|
|
108 #printseq(sample,'major',sequence,major_out)
|
|
109 printseq(sample,'',sequence,major_out)
|
|
110 major_out.close()
|
|
111
|
|
112 # Sample specific minor allele sequence in FASTA format
|
|
113 try:
|
|
114 os.makedirs(args.minor_dir)
|
|
115 except:
|
|
116 pass
|
|
117
|
|
118 for sample in samples:
|
|
119 if args.minor_prefix: # to fit Galaxy requirements
|
|
120 name = sample.replace('_','')
|
|
121 minor_name = "%s_%s_%s" % ('primary',args.minor_prefix,name+'-minor_visible_fasta')
|
|
122 else: # for non-Galaxy
|
|
123 minor_name = sample+'-minor.fa'
|
|
124 minor_out = open(os.path.join(args.minor_dir, minor_name), 'w+')
|
|
125 sequence = createseq(sample,'minor',args.seq_length,table)
|
|
126 #printseq(sample,'minor',sequence,minor_out)
|
|
127 printseq(sample,'_minor',sequence,minor_out)
|
|
128 minor_out.close()
|
|
129
|
|
130 if __name__ == "__main__": main() |