Mercurial > repos > cafletezbrant > kmersvm
diff kmersvm/scripts/make_profile.py @ 0:7fe1103032f7 draft
Uploaded
author | cafletezbrant |
---|---|
date | Mon, 20 Aug 2012 18:07:22 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kmersvm/scripts/make_profile.py Mon Aug 20 18:07:22 2012 -0400 @@ -0,0 +1,101 @@ +#!/usr/bin/env python2.7 + +import os +import os.path +import sys +import random +import optparse + +from bitarray import bitarray + +def read_bed_file(filename): + """ + """ + f = open(filename, 'r') + lines = f.readlines() + f.close() + + positions = [] + + for line in lines: + if line[0] == '#': + continue + + l = line.split() + + positions.append((l[0], int(l[1]), int(l[2]))) + + return positions + + +def bitarray_fromfile(filename): + """ + """ + fh = open(filename, 'rb') + bits = bitarray() + bits.fromfile(fh) + + return bits, fh + +def get_seqid(buildname, pos): + return '_'.join( [buildname, pos[0], str(pos[1]), str(pos[2]), '+'] ) + +def make_profile(positions, buildname, basedir): + """ + """ + chrnames = sorted(set(map(lambda p: p[0], positions))) + + profiles = {} + for chrom in chrnames: + idxf_gc = os.path.join(basedir, '.'.join([buildname, chrom, 'gc', 'out'])) + idxf_rpt = os.path.join(basedir, '.'.join([buildname, chrom, 'rpt', 'out'])) + + #if os.path.exists(idxf_gc) == False or os.path.exists(idxf_rpt) == False: + # continue + bits_gc, gcf = bitarray_fromfile(idxf_gc) + bits_rpt, rptf = bitarray_fromfile(idxf_rpt) + + for pos in positions: + if pos[0] != chrom: + continue + + seqid = get_seqid(buildname, pos) + slen = pos[2]-pos[1] + gc = bits_gc[pos[1]:pos[2]].count(True) + rpt = bits_rpt[pos[1]:pos[2]].count(True) + + profiles[seqid] = (slen, gc, rpt) + + gcf.close() + rptf.close() + + return profiles + +def main(): + parser = optparse.OptionParser() + if len(sys.argv) != 5: + print "Usage:", sys.argv[0], "BEDFILE BUILDNAME BASE_DIR OUT_FILE" + sys.exit() + parser.add_option("-o", dest="output", default="seq_profile.txt") + (options,args) = parser.parse_args() + + bedfile = sys.argv[1] + buildname = sys.argv[2] + basedir = sys.argv[3] + output = options.output + + positions = read_bed_file(bedfile) + seqids = [] + for pos in positions: + seqids.append(get_seqid(buildname, pos)) + + profiles = make_profile(positions, buildname, basedir) + + f = open(output, 'w') + f.write("\t".join(["#seq_id", "length", "GC content", "repeat fraction"]) + '\n') + for seqid in seqids: + prof = profiles[seqid] + f.write('\t'.join( map(str, [seqid, prof[0], prof[1]/float(prof[0]), prof[2]/float(prof[0])]) ) + '\n') + f.close() + +if __name__ == "__main__": main()