Mercurial > repos > boris > getalleleseq
diff getalleleseq.py @ 0:c542b3075f29 draft
Uploaded repo.tar.gz
author | boris |
---|---|
date | Mon, 03 Feb 2014 13:07:13 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getalleleseq.py Mon Feb 03 13:07:13 2014 -0500 @@ -0,0 +1,121 @@ +#!/usr/bin/env python +# Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu) +# +#usage: getalleleseq.py [-h] [-l INT] [-j FILE] [-d DIR] alleles +# +#Given a table with minor and major alleles per position, it generates the +#minor and major allele sequences in FASTA format +# +#positional arguments: +# alleles Table containing minor and major allele base per +# position. cols: [id, chr, pos, A, C, G, T, cvrg, +# plody, major, minor, freq_minor] +# +#optional arguments: +# -h, --help show this help message and exit +# -l INT, --seq-length INT +# Background sequence length. Bases in an artifical +# all-N-sequence of length INT will be replaced by +# either the major or minor allele base accordingly +# -j FILE, --major-seq FILE +# File to write major allele sequences in FASTA multiple +# alignment format. +# -d DIR, --minor-dir DIR +# Per sample minor allele sequences will be written to +# this directory +# +# The expected columns in the alleles table follow Nicholas Stoler's +# Variant Annotator tool format. See Variant Annotator in Galaxy's tool shed +# http://testtoolshed.g2.bx.psu.edu/repos/nick/allele_counts_1 for more details +# +# Expected columns: +# 1. sample_id +# 2. chr +# 3. position +# 4 counts for A's +# 5. counts for C's +# 6. counts for G's +# 7. counts for T's +# 8. Coverage +# 9. Number of alleles passing a given criteria +# 10. Major allele +# 11. Minor allele +# 12. Minor allele frequency in position + +import sys +import os +import argparse + +def createseq(sample, allele, seq_size, table): + """Generate major or minor allele sequence""" + out_sequence = ['N' for i in range(seq_size)] + sample_data = [line for line in table if line[0] == sample] + + for entry in sample_data: + position = int(entry[2]) + number_of_alleles = int(entry[8]) + major_allele = entry[9].strip() + minor_allele = entry[10].strip() + + if allele == 'major': + out_sequence[position-1] = major_allele + elif allele == 'minor': + if number_of_alleles == 2: + out_sequence[position-1] = minor_allele + else: + out_sequence[position-1] = major_allele + return out_sequence + +def printseq(sample,allele,seq,output): + """Print out sequence""" + #print >> output, '>{0}_{1}'.format(sample,allele) + print >> output, '>{0}{1}'.format(sample,allele) + for i in range(0,len(seq),70): + print >> output, ''.join(seq[i:i+70]) + +def main(): + parser = argparse.ArgumentParser(description='Given a table with minor and major alleles per position, it generates the minor and major allele sequences in FASTA format', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)') + parser.add_argument('alleles', type=str, help='Table containing minor and major allele base per position. cols: [id, chr, pos, A, C, G, T, cvrg, plody, major, minor, freq_minor] ') + parser.add_argument('-l','--seq-length', type=int, metavar='INT', help='Background sequence length. Bases in an artifical all-N-sequence of length INT will be replaced by either the major or minor allele base accordingly') + parser.add_argument('-j','--major-seq', type=str, metavar='FILE', help='File to write major allele sequences in FASTA multiple alignment format.') + parser.add_argument('-d', '--minor-dir', type=str, metavar='DIR', default='.', help="Per sample minor allele sequences will be written to this directory (Default: current directory)") + parser.add_argument('-p', '--minor-prefix', type=str, metavar='STR', nargs='?', const='', default='', help=argparse.SUPPRESS) #Galaxy compatibility + args = parser.parse_args() + + + try: + table = [line.strip().split('\t') for line in list(open(args.alleles)) if "#" not in line] + samples = sorted(list(set([ line[0] for line in table ]))) + except: + sys.exit('\nERROR: Could not open %s\n' % args.alleles) + try: + major_out = open(args.major_seq, 'w+') + except: + sys.exit('\nCould not create %s\n' % args.major_seq) + + # Single file for all major allele sequences in FASTA multiple alignment + for sample in samples: + sequence = createseq(sample,'major',args.seq_length,table) + #printseq(sample,'major',sequence,major_out) + printseq(sample,'',sequence,major_out) + major_out.close() + + # Sample specific minor allele sequence in FASTA format + try: + os.makedirs(args.minor_dir) + except: + pass + + for sample in samples: + if args.minor_prefix: # to fit Galaxy requirements + name = sample.replace('_','') + minor_name = "%s_%s_%s" % ('primary',args.minor_prefix,name+'-minor_visible_fasta') + else: # for non-Galaxy + minor_name = sample+'-minor.fa' + minor_out = open(os.path.join(args.minor_dir, minor_name), 'w+') + sequence = createseq(sample,'minor',args.seq_length,table) + #printseq(sample,'minor',sequence,minor_out) + printseq(sample,'_minor',sequence,minor_out) + minor_out.close() + +if __name__ == "__main__": main() \ No newline at end of file