Mercurial > repos > arkarachai-fungtammasan > str_fm
diff PEsortedSAM2readprofile.py @ 0:07588b899c13 draft
Uploaded
author | arkarachai-fungtammasan |
---|---|
date | Wed, 01 Apr 2015 17:05:51 -0400 |
parents | |
children | a3113043abb0 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PEsortedSAM2readprofile.py Wed Apr 01 17:05:51 2015 -0400 @@ -0,0 +1,101 @@ +#!/usr/bin/env python + +import sys +from galaxy import eggs +import pkg_resources +pkg_resources.require( "bx-python" ) +import bx.seq.twobit + +##output columns: read_name chr prefix_start prefix_end TR_start TR_end suffix_start suffix_end TR_length TR_sequence + +samf = open(sys.argv[1],'r') #assumes sam file is sorted by readname +seq_path = sys.argv[2] #Path to the reference genome in 2bit format + +##maxTRlength=int(sys.argv[4]) +##maxoriginalreadlength=int(sys.argv[5]) +maxTRlength=int(sys.argv[3]) +maxoriginalreadlength=int(sys.argv[4]) +outfile=sys.argv[5] +fout = open(outfile,'w') + +twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) ) + +skipped=0 +while True: + read = samf.readline().strip() + if not(read): #EOF reached + break + if read[0] == "@": + #print read + continue + mate = samf.readline().strip() + if not(mate): #EOF reached + break + read_elems = read.split() + mate_elems = mate.split() + read_name = read_elems[0].strip() + mate_name = mate_elems[0].strip() + while True: + if read_name == mate_name: + break + elif read_name != mate_name: + #print >>sys.stderr, "Input SAM file doesn't seem to be sorted by readname. Please sort and retry." + #break + skipped += 1 + read = mate + read_elems = mate_elems + mate = samf.readline().strip() + read_name = read_elems[0].strip() + mate_name = mate_elems[0].strip() + if not(mate): #EOF reached + break + mate_elems = mate.split() + #extract XT:A tag + #for e in read_elems: + # if e.startswith('XT:A'): + # read_xt = e + #for e in mate_elems: + # if e.startswith('XT:A'): + # mate_xt = e + #if 'XT:A:U' not in read_elems or 'XT:A:U' not in mate_elems: #both read and it's mate need to be mapped uniquely + # continue + read_chr = read_elems[2] + read_start = int(read_elems[3]) + read_cigar = read_elems[5] + if len(read_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M + continue + read_len = int(read_cigar.split('M')[0]) + mate_chr = mate_elems[2] + mate_start = int(mate_elems[3]) + mate_cigar = mate_elems[5] + if len(mate_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M + continue + mate_len = int(mate_cigar.split('M')[0]) + if read_chr != mate_chr: # check that they were mapped to the same chromosome + continue + if abs(read_start - mate_start) > (maxoriginalreadlength+maxTRlength): + continue + if read_start < mate_start: + pre_s = read_start-1 + pre_e = read_start-1+read_len + tr_s = read_start-1+read_len + tr_e = mate_start-1 + suf_s = mate_start-1 + suf_e = mate_start-1+mate_len + else: + pre_s = mate_start-1 + pre_e = mate_start-1+mate_len + tr_s = mate_start-1+mate_len + tr_e = read_start-1 + suf_s = read_start-1 + suf_e = read_start-1+read_len + tr_len = abs(tr_e - tr_s) + if tr_len > maxTRlength: + continue + if pre_e >= suf_s: #overlapping prefix and suffix + continue + tr_ref_seq = twobitfile[read_chr][tr_s:tr_e] + ##print >>fout, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %(read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq) + fout.writelines('\t'.join(map(str,[read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq]))+'\n') + +print "Skipped %d unpaired reads" %(skipped)