getalleleseq: getalleleseq.py comparison

comparison getalleleseq.py @ 0:c542b3075f29 draft

Uploaded repo.tar.gz

author	boris
date	Mon, 03 Feb 2014 13:07:13 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:c542b3075f29
+#!/usr/bin/env python
+# Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)
+#
+#usage: getalleleseq.py [-h] [-l INT] [-j FILE] [-d DIR] alleles
+#
+#Given a table with minor and major alleles per position, it generates the
+#minor and major allele sequences in FASTA format
+#
+#positional arguments:
+#  alleles               Table containing minor and major allele base per
+#                        position. cols: [id, chr, pos, A, C, G, T, cvrg,
+#                        plody, major, minor, freq_minor]
+#
+#optional arguments:
+#  -h, --help            show this help message and exit
+#  -l INT, --seq-length INT
+#                        Background sequence length. Bases in an artifical
+#                        all-N-sequence of length INT will be replaced by
+#                        either the major or minor allele base accordingly
+#  -j FILE, --major-seq FILE
+#                        File to write major allele sequences in FASTA multiple
+#                        alignment format.
+#  -d DIR, --minor-dir DIR
+#                        Per sample minor allele sequences will be written to
+#                        this directory
+#
+# The expected columns in the alleles table follow Nicholas Stoler's
+# Variant Annotator tool format.  See Variant Annotator in Galaxy's tool shed
+# http://testtoolshed.g2.bx.psu.edu/repos/nick/allele_counts_1 for more details
+#
+# Expected columns:
+# 1.  sample_id
+# 2.  chr
+# 3.  position
+# 4   counts for A's
+# 5.  counts for C's
+# 6.  counts for G's
+# 7.  counts for T's
+# 8.  Coverage
+# 9.  Number of alleles passing a given criteria
+# 10. Major allele
+# 11. Minor allele
+# 12. Minor allele frequency in position
+import sys
+import os
+import argparse
+def createseq(sample, allele, seq_size, table):
+"""Generate major or minor allele sequence"""
+out_sequence = ['N' for i in range(seq_size)]
+sample_data  = [line for line in table if line[0] == sample]
+for entry in sample_data:
+position = int(entry[2])
+number_of_alleles = int(entry[8])
+major_allele = entry[9].strip()
+minor_allele = entry[10].strip()
+if allele == 'major':
+out_sequence[position-1] = major_allele
+elif allele == 'minor':
+if number_of_alleles == 2:
+out_sequence[position-1] = minor_allele
+else:
+out_sequence[position-1] = major_allele
+return out_sequence
+def printseq(sample,allele,seq,output):
+"""Print out sequence"""
+#print >> output, '>{0}_{1}'.format(sample,allele)
+print >> output, '>{0}{1}'.format(sample,allele)
+for i in range(0,len(seq),70):
+print >> output, ''.join(seq[i:i+70])
+def main():
+parser = argparse.ArgumentParser(description='Given a table with minor and major alleles per position, it generates the minor and major allele sequences in FASTA format', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)')
+parser.add_argument('alleles', type=str, help='Table containing minor and major allele base per position. cols: [id, chr, pos, A, C, G, T, cvrg, plody, major, minor, freq_minor] ')
+parser.add_argument('-l','--seq-length', type=int, metavar='INT', help='Background sequence length. Bases in an artifical all-N-sequence of length INT will be replaced by either the major or minor allele base accordingly')
+parser.add_argument('-j','--major-seq', type=str, metavar='FILE', help='File to write major allele sequences in FASTA multiple alignment format.')
+parser.add_argument('-d', '--minor-dir', type=str, metavar='DIR', default='.', help="Per sample minor allele sequences will be written to this directory (Default: current directory)")
+parser.add_argument('-p', '--minor-prefix', type=str, metavar='STR', nargs='?', const='', default='', help=argparse.SUPPRESS) #Galaxy compatibility
+args = parser.parse_args()
+try:
+table = [line.strip().split('\t') for line in list(open(args.alleles)) if "#" not in line]
+samples = sorted(list(set([ line[0] for line in table ])))
+except:
+sys.exit('\nERROR: Could not open %s\n' % args.alleles)
+try:
+major_out = open(args.major_seq, 'w+')
+except:
+sys.exit('\nCould not create %s\n' % args.major_seq)
+# Single file for all major allele sequences in FASTA multiple alignment
+for sample in samples:
+sequence = createseq(sample,'major',args.seq_length,table)
+#printseq(sample,'major',sequence,major_out)
+printseq(sample,'',sequence,major_out)
+major_out.close()
+# Sample specific minor allele sequence in FASTA format
+try:
+os.makedirs(args.minor_dir)
+except:
+pass
+for sample in samples:
+if args.minor_prefix: # to fit Galaxy requirements
+name = sample.replace('_','')
+minor_name = "%s_%s_%s" % ('primary',args.minor_prefix,name+'-minor_visible_fasta')
+else: # for non-Galaxy
+minor_name = sample+'-minor.fa'
+minor_out = open(os.path.join(args.minor_dir, minor_name), 'w+')
+sequence = createseq(sample,'minor',args.seq_length,table)
+#printseq(sample,'minor',sequence,minor_out)
+printseq(sample,'_minor',sequence,minor_out)
+minor_out.close()
+if __name__ == "__main__": main()

Mercurial > repos > boris > getalleleseq

comparison getalleleseq.py @ 0:c542b3075f29 draft