Mercurial > repos > arkarachai-fungtammasan > str_fm
comparison PEsortedSAM2readprofile.py @ 0:07588b899c13 draft
Uploaded
author | arkarachai-fungtammasan |
---|---|
date | Wed, 01 Apr 2015 17:05:51 -0400 |
parents | |
children | a3113043abb0 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:07588b899c13 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import sys | |
4 from galaxy import eggs | |
5 import pkg_resources | |
6 pkg_resources.require( "bx-python" ) | |
7 import bx.seq.twobit | |
8 | |
9 ##output columns: read_name chr prefix_start prefix_end TR_start TR_end suffix_start suffix_end TR_length TR_sequence | |
10 | |
11 samf = open(sys.argv[1],'r') #assumes sam file is sorted by readname | |
12 seq_path = sys.argv[2] #Path to the reference genome in 2bit format | |
13 | |
14 ##maxTRlength=int(sys.argv[4]) | |
15 ##maxoriginalreadlength=int(sys.argv[5]) | |
16 maxTRlength=int(sys.argv[3]) | |
17 maxoriginalreadlength=int(sys.argv[4]) | |
18 outfile=sys.argv[5] | |
19 fout = open(outfile,'w') | |
20 | |
21 twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) ) | |
22 | |
23 skipped=0 | |
24 while True: | |
25 read = samf.readline().strip() | |
26 if not(read): #EOF reached | |
27 break | |
28 if read[0] == "@": | |
29 #print read | |
30 continue | |
31 mate = samf.readline().strip() | |
32 if not(mate): #EOF reached | |
33 break | |
34 read_elems = read.split() | |
35 mate_elems = mate.split() | |
36 read_name = read_elems[0].strip() | |
37 mate_name = mate_elems[0].strip() | |
38 while True: | |
39 if read_name == mate_name: | |
40 break | |
41 elif read_name != mate_name: | |
42 #print >>sys.stderr, "Input SAM file doesn't seem to be sorted by readname. Please sort and retry." | |
43 #break | |
44 skipped += 1 | |
45 read = mate | |
46 read_elems = mate_elems | |
47 mate = samf.readline().strip() | |
48 read_name = read_elems[0].strip() | |
49 mate_name = mate_elems[0].strip() | |
50 if not(mate): #EOF reached | |
51 break | |
52 mate_elems = mate.split() | |
53 #extract XT:A tag | |
54 #for e in read_elems: | |
55 # if e.startswith('XT:A'): | |
56 # read_xt = e | |
57 #for e in mate_elems: | |
58 # if e.startswith('XT:A'): | |
59 # mate_xt = e | |
60 #if 'XT:A:U' not in read_elems or 'XT:A:U' not in mate_elems: #both read and it's mate need to be mapped uniquely | |
61 # continue | |
62 read_chr = read_elems[2] | |
63 read_start = int(read_elems[3]) | |
64 read_cigar = read_elems[5] | |
65 if len(read_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M | |
66 continue | |
67 read_len = int(read_cigar.split('M')[0]) | |
68 mate_chr = mate_elems[2] | |
69 mate_start = int(mate_elems[3]) | |
70 mate_cigar = mate_elems[5] | |
71 if len(mate_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M | |
72 continue | |
73 mate_len = int(mate_cigar.split('M')[0]) | |
74 if read_chr != mate_chr: # check that they were mapped to the same chromosome | |
75 continue | |
76 if abs(read_start - mate_start) > (maxoriginalreadlength+maxTRlength): | |
77 continue | |
78 if read_start < mate_start: | |
79 pre_s = read_start-1 | |
80 pre_e = read_start-1+read_len | |
81 tr_s = read_start-1+read_len | |
82 tr_e = mate_start-1 | |
83 suf_s = mate_start-1 | |
84 suf_e = mate_start-1+mate_len | |
85 else: | |
86 pre_s = mate_start-1 | |
87 pre_e = mate_start-1+mate_len | |
88 tr_s = mate_start-1+mate_len | |
89 tr_e = read_start-1 | |
90 suf_s = read_start-1 | |
91 suf_e = read_start-1+read_len | |
92 tr_len = abs(tr_e - tr_s) | |
93 if tr_len > maxTRlength: | |
94 continue | |
95 if pre_e >= suf_s: #overlapping prefix and suffix | |
96 continue | |
97 tr_ref_seq = twobitfile[read_chr][tr_s:tr_e] | |
98 ##print >>fout, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %(read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq) | |
99 fout.writelines('\t'.join(map(str,[read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq]))+'\n') | |
100 | |
101 print "Skipped %d unpaired reads" %(skipped) |