Mercurial > repos > boris > getalleleseq
view getalleleseq.py @ 8:698ede7baba9 draft
Uploaded
author | boris |
---|---|
date | Tue, 18 Mar 2014 12:25:24 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python
# Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)
#
#usage: getalleleseq.py [-h] [-l INT] [-j FILE] [-d DIR] alleles
#
#Given a table with minor and major alleles per position, it generates the
#minor and major allele sequences in FASTA format
#
#positional arguments:
#  alleles               Table containing minor and major allele base per
#                        position. cols: [id, chr, pos, A, C, G, T, cvrg,
#                        plody, major, minor, freq_minor]
#
#optional arguments:
#  -h, --help            show this help message and exit
#  -l INT, --seq-length INT
#                        Background sequence length. Bases in an artifical
#                        all-N-sequence of length INT will be replaced by
#                        either the major or minor allele base accordingly
#  -j FILE, --major-seq FILE
#                        File to write major allele sequences in FASTA multiple
#                        alignment format.
#  -d DIR, --minor-dir DIR
#                        Per sample minor allele sequences will be written to
#                        this directory
#
# The expected columns in the alleles table follow Nicholas Stoler's
# Variant Annotator tool format. See Variant Annotator in Galaxy's tool shed
# http://testtoolshed.g2.bx.psu.edu/repos/nick/allele_counts_1 for more details
#
# Expected columns:
# 1. sample_id
# 2. chr
# 3. position
# 4. counts for A's
# 5. counts for C's
# 6. counts for G's
# 7. counts for T's
# (8. counts for a's)
# (9. counts for c's)
# (10. counts for g's)
# (11. counts for t's)
# 8. (12.) Coverage
# 9. (13.) Number of alleles passing a given criteria
# 10. (14.) Major allele
# 11. (15.) Minor allele
# 12. (16.) Minor allele frequency in position

import sys
import os
import argparse


def _allele_columns(entry):
    """Return (number_of_alleles, major_allele, minor_allele) for one row.

    Rows with exactly 12 columns use the unstranded layout (columns 9-11);
    any other row is read with the stranded layout (columns 13-15).

    Raises:
        ValueError: if the row is too short for the layout it is read with,
            or if the allele count is not an integer.
    """
    first = 8 if len(entry) == 12 else 12
    if len(entry) < first + 3:
        raise ValueError(
            'Malformed row (expected 12 or 16 columns, got %d): %r'
            % (len(entry), entry))
    return (int(entry[first]),
            entry[first + 1].strip(),
            entry[first + 2].strip())


def createseq(sample, allele, seq_size, table):
    """Generate major or minor allele sequence

    Starts from an all-'N' sequence of length ``seq_size`` and, for every
    row of ``table`` whose first column equals ``sample``, writes an allele
    base at the row's 1-based position.

    Args:
        sample: sample id to select rows by (compared with column 1).
        allele: 'major' writes the major allele; 'minor' writes the minor
            allele where the row reports two or more alleles and the major
            allele otherwise. Any other value leaves the sequence untouched.
        seq_size: length of the background sequence.
        table: iterable of rows, each a list of column strings.

    Returns:
        A list of ``seq_size`` strings, one per position.

    Raises:
        IndexError: if a position lies outside 1..seq_size. Position 0 and
            negative positions are rejected explicitly; they would otherwise
            wrap around and silently overwrite the end of the sequence.
        ValueError: if a row is malformed (see ``_allele_columns``) or its
            position is not an integer.
    """
    out_sequence = ['N'] * seq_size
    for entry in table:
        if entry[0] != sample:
            continue
        number_of_alleles, major_allele, minor_allele = _allele_columns(entry)
        position = int(entry[2])
        if not 1 <= position <= seq_size:
            raise IndexError(
                'Position %d of sample %s is outside the sequence length %d'
                % (position, sample, seq_size))
        if allele == 'major':
            out_sequence[position - 1] = major_allele
        elif allele == 'minor':
            if number_of_alleles >= 2:
                out_sequence[position - 1] = minor_allele
            else:
                # Only one allele passed the criteria: fall back to major.
                out_sequence[position - 1] = major_allele
    return out_sequence


def printseq(sample, allele, seq, output):
    """Print out sequence

    Writes a FASTA record to ``output``: a '>' header made of ``sample``
    immediately followed by ``allele``, then ``seq`` wrapped at 70 bases
    per line.

    ``output.write`` is used instead of the Python 2 only ``print >>``
    statement so the function behaves the same on Python 2 and 3.
    """
    output.write('>{0}{1}\n'.format(sample, allele))
    for start in range(0, len(seq), 70):
        output.write(''.join(seq[start:start + 70]) + '\n')


def _read_table(path):
    """Read the tab-separated alleles table into a list of rows.

    Blank lines and lines whose first non-blank character is '#' are
    skipped. Only the line terminator is removed before splitting so that
    empty trailing columns keep their place; each field is then stripped.
    """
    table = []
    with open(path) as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped or stripped.startswith('#'):
                continue
            fields = line.rstrip('\r\n').split('\t')
            table.append([field.strip() for field in fields])
    return table


def _group_by_sample(table):
    """Map each sample id (column 1) to its rows, keeping the row order."""
    grouped = {}
    for entry in table:
        grouped.setdefault(entry[0], []).append(entry)
    return grouped


def main():
    """Parse the command line and write the major/minor allele FASTA files.

    All major allele sequences go to the single file given by -j; one minor
    allele file per sample is written into the directory given by -d.
    """
    parser = argparse.ArgumentParser(
        description='Given a table with minor and major alleles per position, '
                    'it generates the minor and major allele sequences in '
                    'FASTA format',
        epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)')
    parser.add_argument(
        'alleles', type=str,
        help='Table containing minor and major allele base per position. '
             'cols: [id, chr, pos, A, C, G, T, cvrg, plody, major, minor, '
             'freq_minor] ')
    parser.add_argument(
        '-l', '--seq-length', type=int, metavar='INT',
        help='Background sequence length. Bases in an artifical '
             'all-N-sequence of length INT will be replaced by either the '
             'major or minor allele base accordingly')
    parser.add_argument(
        '-j', '--major-seq', type=str, metavar='FILE',
        help='File to write major allele sequences in FASTA multiple '
             'alignment format.')
    parser.add_argument(
        '-d', '--minor-dir', type=str, metavar='DIR', default='.',
        help="Per sample minor allele sequences will be written to this "
             "directory (Default: current directory)")
    # Galaxy compatibility
    parser.add_argument(
        '-p', '--minor-prefix', type=str, metavar='STR', nargs='?',
        const='', default='', help=argparse.SUPPRESS)
    args = parser.parse_args()

    # Both options are declared optional but nothing can be produced without
    # them; fail with a usage message instead of a traceback further down.
    if args.seq_length is None:
        parser.error('-l/--seq-length is required')
    if args.major_seq is None:
        parser.error('-j/--major-seq is required')

    try:
        table = _read_table(args.alleles)
    except (IOError, OSError):
        sys.exit('\nERROR: Could not open %s\n' % args.alleles)

    # Group once so each sequence is built from its own rows only instead of
    # rescanning the whole table for every sample.
    rows_by_sample = _group_by_sample(table)
    samples = sorted(rows_by_sample)

    def build(sample, allele):
        """Build one sequence, turning bad input rows into a clean exit."""
        try:
            return createseq(sample, allele, args.seq_length,
                             rows_by_sample[sample])
        except (ValueError, IndexError) as err:
            sys.exit('\nERROR: %s\n' % err)

    # Single file for all major allele sequences in FASTA multiple alignment
    try:
        major_out = open(args.major_seq, 'w')
    except (IOError, OSError):
        sys.exit('\nCould not create %s\n' % args.major_seq)
    with major_out:
        for sample in samples:
            printseq(sample, '', build(sample, 'major'), major_out)

    # Sample specific minor allele sequence in FASTA format
    if args.minor_dir:
        try:
            os.makedirs(args.minor_dir)
        except OSError:
            # An already existing directory is fine; anything else is not.
            if not os.path.isdir(args.minor_dir):
                sys.exit('\nCould not create %s\n' % args.minor_dir)

    for sample in samples:
        if args.minor_prefix:  # to fit Galaxy requirements
            name = sample.replace('_', '')
            minor_name = "%s_%s_%s" % ('primary', args.minor_prefix,
                                       name + '-minor_visible_fasta')
        else:  # for non-Galaxy
            minor_name = sample + '-minor.fa'
        with open(os.path.join(args.minor_dir, minor_name), 'w') as minor_out:
            printseq(sample, '_minor', build(sample, 'minor'), minor_out)


if __name__ == "__main__":
    main()