view kmersvm/scripts/nullseq_build_indices.py @ 7:fd740d515502 draft default tip

Uploaded revised kmer-SVM to include modules from kmer-visual.
author cafletezbrant
date Sun, 16 Jun 2013 18:06:14 -0400
parents 7fe1103032f7
children
line wrap: on
line source

import os
import sys
import tarfile
import zipfile
import optparse

from bitarray import bitarray

def clear_indexes(sid, buildname):
	na = '.'.join([buildname, sid, "na", "out"])
	gc = '.'.join([buildname, sid, "gc", "out"])
	rpt = '.'.join([buildname, sid, "rpt", "out"])

	#truncate files
	for fn in (na, gc, rpt):
		f = open(fn, 'wb')
		f.close()


def append_indexes(seq, sid, buildname):
	na = '.'.join([buildname, sid, "na", "out"])
	gc = '.'.join([buildname, sid, "gc", "out"])
	rpt = '.'.join([buildname, sid, "rpt", "out"])

	f = open(na, 'ab')
	bitarray(map(lambda c: c in 'N', seq)).tofile(f)
	f.close()

	f = open(gc, 'ab')
	bitarray(map(lambda c: c in 'cgCG', seq)).tofile(f)
	f.close()

	f = open(rpt, 'ab')
	bitarray(map(lambda c: c in 'acgt', seq)).tofile(f)
	f.close()


def build_indexes(fn, buildname):
	save_interval = 8*32*1024

	try:
		f = open(fn, 'r')
		
		seq = [] 
		sid = ''
		nlines = 0
		for line in f:
			if line[0] == '>':
				if sid: 
					append_indexes("".join(seq), sid, buildname)
					seq = []

				sid = line[1:].rstrip('\n').split()[0]
				clear_indexes(sid, buildname)
			else:
				nlines += 1
				seq.append(line.rstrip('\n'))

				if nlines % save_interval == 0:
					append_indexes("".join(seq), sid, buildname)
					seq = []

		#the last remaining sequence 
		append_indexes("".join(seq), sid, buildname)

	except IOError, (errno, strerror):
		print "I/O error(%d): %s" % (errno, strerror)
		sys.exit(0)


def main(argv=sys.argv):
	usage = "usage: %prog [options] <Chromosome File(TARBALL gzip (tar.gz) or zip)> <Genome Build Name>"
	desc  = "generate bit index files for generating null sequences"

	parser = optparse.OptionParser(usage=usage, description=desc)

	parser.add_option("-q", dest="quiet", default=False, action="store_true", \
  			help="supress messages (default=false)")

	(options, args) = parser.parse_args()

	if len(args) == 0:
		parser.print_help()
		sys.exit(0)

	if len(args) != 2:
		parser.error("incorrect number of arguments")
		parser.print_help()
		sys.exit(0)

	chrom_file = args[0]
	genome = args[1]

	if zipfile.is_zipfile(chrom_file):
		if options.quiet == False:
			sys.stderr.write("detected file type is zip.\n")

		zipfileobj = zipfile.ZipFile(chrom_file)

		for fn in zipfileobj.namelist():
			if options.quiet == False:
				sys.stderr.write(' '.join(["processing", fn, "\n"]))

			zipfileobj.extract(fn)
			build_indexes(fn, genome)
			os.remove(fn)

		zipfileobj.close()

	elif tarfile.is_tarfile(chrom_file):
		if options.quiet == False:
			sys.stderr.write("detected file type is tar.\n")

		tarfileobj = tarfile.open(chrom_file)

		for fn in tarfileobj.getnames():
			if options.quiet == False:
				sys.stderr.write(' '.join(["processing", fn, "\n"]))

			tarfileobj.extract(fn)
			build_indexes(fn, genome)
			os.remove(fn)

		tarfileobj.close()

	else:
		sys.stderr.write(' '.join(["unknown input file:", fn, "\n"]))

if __name__ == "__main__": main()