Mercurial > repos > xuebing > sharplabtool
view tools/regVariation/getIndelRates_3way.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python #Guruprasad Ananda from galaxy import eggs import pkg_resources pkg_resources.require( "bx-python" ) import sys, os, tempfile import traceback import fileinput from warnings import warn from galaxy.tools.util.galaxyops import * from bx.intervals.io import * from bx.intervals.operations import quicksect def stop_err(msg): sys.stderr.write(msg) sys.exit() def counter(node, start, end, sort_col): global full, blk_len, blk_list if node.start < start: if node.right: counter(node.right, start, end, sort_col) elif start <= node.start <= end and start <= node.end <= end: full += 1 if node.other[0] not in blk_list: blk_list.append(node.other[0]) blk_len += int(node.other[sort_col+2]) if node.left and node.left.maxend > start: counter(node.left, start, end, sort_col) if node.right: counter(node.right, start, end, sort_col) elif node.start > end: if node.left: counter(node.left, start, end, sort_col) infile = sys.argv[1] fout = open(sys.argv[2],'w') int_file = sys.argv[3] if int_file != "None": #User has specified an interval file try: fint = open(int_file, 'r') dbkey_i = sys.argv[4] chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] ) except: stop_err("Unable to open input Interval file") def main(): for i, line in enumerate( file ( infile )): line = line.rstrip('\r\n') if len( line )>0 and not line.startswith( '#' ): elems = line.split( '\t' ) break if i == 30: break # Hopefully we'll never get here... if len( elems ) != 18: stop_err( "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly." ) for i, line in enumerate( file ( infile )): line = line.rstrip('\r\n') elems = line.split('\t') try: assert int(elems[0]) assert len(elems) == 18 if int_file != "None": if dbkey_i not in elems[3] and dbkey_i not in elems[8] and dbkey_i not in elems[13]: stop_err("The species build corresponding to your interval file is not present in the Indel file.") if dbkey_i in elems[3]: sort_col = 4 elif dbkey_i in elems[8]: sort_col = 9 elif dbkey_i in elems[13]: sort_col = 14 else: species = [] species.append( elems[3].split('.')[0] ) species.append( elems[8].split('.')[0] ) species.append( elems[13].split('.')[0] ) sort_col = 0 #Based on block numbers break except: continue fin = open(infile, 'r') skipped = 0 if int_file == "None": sorted_infile = tempfile.NamedTemporaryFile() cmdline = "sort -n -k"+str(1)+" -o "+sorted_infile.name+" "+infile try: os.system(cmdline) except: stop_err("Encountered error while sorting the input file.") print >>fout, "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate" %(species[0],species[1],species[2],species[0],species[1],species[2]) prev_bnum = -1 sorted_infile.seek(0) for line in sorted_infile.readlines(): line = line.rstrip('\r\n') elems = line.split('\t') try: assert int(elems[0]) assert len(elems) == 18 new_bnum = int(elems[0]) if new_bnum != prev_bnum: if prev_bnum != -1: irate = [] drate = [] for i,elem in enumerate(inserts): try: irate.append(str("%.2e" %(inserts[i]/blen[i]))) except: irate.append('0') try: drate.append(str("%.2e" %(deletes[i]/blen[i]))) except: drate.append('0') print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate)) inserts = [0.0, 0.0, 0.0] deletes = [0.0, 0.0, 0.0] blen = [] blen.append( int(elems[6]) ) blen.append( int(elems[11]) ) blen.append( int(elems[16]) ) line_sp = elems[1].split('.')[0] sp_ind = species.index(line_sp) if elems[1].endswith('insert'): inserts[sp_ind] += 1 elif elems[1].endswith('delete'): deletes[sp_ind] += 1 prev_bnum = new_bnum except Exception, ei: #print >>sys.stderr, ei continue irate = [] drate = [] for i,elem in enumerate(inserts): try: irate.append(str("%.2e" %(inserts[i]/blen[i]))) except: irate.append('0') try: drate.append(str("%.2e" %(deletes[i]/blen[i]))) except: drate.append('0') print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate)) sys.exit() inf = open(infile, 'r') start_met = False end_met = False sp_file = tempfile.NamedTemporaryFile() for n, line in enumerate(inf): line = line.rstrip('\r\n') elems = line.split('\t') try: assert int(elems[0]) assert len(elems) == 18 if dbkey_i not in elems[1]: if not(start_met): continue else: sp_end = n break else: print >>sp_file, line if not(start_met): start_met = True sp_start = n except: continue try: assert sp_end except: sp_end = n+1 sp_file.seek(0) win = NiceReaderWrapper( fileinput.FileInput( int_file ), chrom_col=chr_col_i, start_col=start_col_i, end_col=end_col_i, strand_col=strand_col_i, fix_strand=True) indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ), chrom_col=1, start_col=sort_col, end_col=sort_col+1, strand_col=-1, fix_strand=True) indelTree = quicksect.IntervalTree() for item in indel: if type( item ) is GenomicInterval: indelTree.insert( item, indel.linenum, item.fields ) result=[] global full, blk_len, blk_list for interval in win: if type( interval ) is Header: pass if type( interval ) is Comment: pass elif type( interval ) == GenomicInterval: chrom = interval.chrom start = int(interval.start) end = int(interval.end) if start > end: warn( "Interval start after end!" ) ins_chr = "%s.%s_insert" %(dbkey_i,chrom) del_chr = "%s.%s_delete" %(dbkey_i,chrom) irate = 0 drate = 0 if ins_chr not in indelTree.chroms and del_chr not in indelTree.chroms: pass else: if ins_chr in indelTree.chroms: full = 0.0 blk_len = 0 blk_list = [] root = indelTree.chroms[ins_chr] #root node for the chrom insertion tree counter(root, start, end, sort_col) if blk_len: irate = full/blk_len if del_chr in indelTree.chroms: full = 0.0 blk_len = 0 blk_list = [] root = indelTree.chroms[del_chr] #root node for the chrom insertion tree counter(root, start, end, sort_col) if blk_len: drate = full/blk_len interval.fields.append(str("%.2e" %irate)) interval.fields.append(str("%.2e" %drate)) print >>fout, "\t".join(interval.fields) fout.flush() if __name__ == "__main__": main()