4
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import sys
|
|
4 from galaxy import eggs
|
|
5 import pkg_resources
|
|
6 pkg_resources.require( "bx-python" )
|
|
7 import bx.seq.twobit
|
|
8
|
|
9 ##output columns: read_name chr prefix_start prefix_end TR_start TR_end suffix_start suffix_end TR_length TR_sequence
|
|
10
|
|
11 samf = open(sys.argv[1],'r') #assumes sam file is sorted by readname
|
|
12 seq_path = sys.argv[2] #Path to the reference genome in 2bit format
|
|
13
|
|
14 ##maxTRlength=int(sys.argv[4])
|
|
15 ##maxoriginalreadlength=int(sys.argv[5])
|
|
16 maxTRlength=int(sys.argv[3])
|
|
17 maxoriginalreadlength=int(sys.argv[4])
|
|
18 outfile=sys.argv[5]
|
|
19 fout = open(outfile,'w')
|
|
20
|
|
21 twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) )
|
|
22
|
|
23 skipped=0
|
|
24 while True:
|
|
25 read = samf.readline().strip()
|
|
26 if not(read): #EOF reached
|
|
27 break
|
|
28 if read[0] == "@":
|
|
29 #print read
|
|
30 continue
|
|
31 mate = samf.readline().strip()
|
|
32 if not(mate): #EOF reached
|
|
33 break
|
|
34 read_elems = read.split()
|
|
35 mate_elems = mate.split()
|
|
36 read_name = read_elems[0].strip()
|
|
37 mate_name = mate_elems[0].strip()
|
|
38 while True:
|
|
39 if read_name == mate_name:
|
|
40 break
|
|
41 elif read_name != mate_name:
|
|
42 #print >>sys.stderr, "Input SAM file doesn't seem to be sorted by readname. Please sort and retry."
|
|
43 #break
|
|
44 skipped += 1
|
|
45 read = mate
|
|
46 read_elems = mate_elems
|
|
47 mate = samf.readline().strip()
|
|
48 read_name = read_elems[0].strip()
|
|
49 mate_name = mate_elems[0].strip()
|
|
50 if not(mate): #EOF reached
|
|
51 break
|
|
52 mate_elems = mate.split()
|
|
53 #extract XT:A tag
|
|
54 #for e in read_elems:
|
|
55 # if e.startswith('XT:A'):
|
|
56 # read_xt = e
|
|
57 #for e in mate_elems:
|
|
58 # if e.startswith('XT:A'):
|
|
59 # mate_xt = e
|
|
60 #if 'XT:A:U' not in read_elems or 'XT:A:U' not in mate_elems: #both read and it's mate need to be mapped uniquely
|
|
61 # continue
|
|
62 read_chr = read_elems[2]
|
|
63 read_start = int(read_elems[3])
|
|
64 read_cigar = read_elems[5]
|
|
65 if len(read_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M
|
|
66 continue
|
|
67 read_len = int(read_cigar.split('M')[0])
|
|
68 mate_chr = mate_elems[2]
|
|
69 mate_start = int(mate_elems[3])
|
|
70 mate_cigar = mate_elems[5]
|
|
71 if len(mate_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M
|
|
72 continue
|
|
73 mate_len = int(mate_cigar.split('M')[0])
|
|
74 if read_chr != mate_chr: # check that they were mapped to the same chromosome
|
|
75 continue
|
|
76 if abs(read_start - mate_start) > (maxoriginalreadlength+maxTRlength):
|
|
77 continue
|
|
78 if read_start < mate_start:
|
|
79 pre_s = read_start-1
|
|
80 pre_e = read_start-1+read_len
|
|
81 tr_s = read_start-1+read_len
|
|
82 tr_e = mate_start-1
|
|
83 suf_s = mate_start-1
|
|
84 suf_e = mate_start-1+mate_len
|
|
85 else:
|
|
86 pre_s = mate_start-1
|
|
87 pre_e = mate_start-1+mate_len
|
|
88 tr_s = mate_start-1+mate_len
|
|
89 tr_e = read_start-1
|
|
90 suf_s = read_start-1
|
|
91 suf_e = read_start-1+read_len
|
|
92 tr_len = abs(tr_e - tr_s)
|
|
93 if tr_len > maxTRlength:
|
|
94 continue
|
|
95 if pre_e >= suf_s: #overlapping prefix and suffix
|
|
96 continue
|
|
97 tr_ref_seq = twobitfile[read_chr][tr_s:tr_e]
|
|
98 ##print >>fout, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %(read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq)
|
|
99 fout.writelines('\t'.join(map(str,[read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq]))+'\n')
|
|
100
|
|
101 print "Skipped %d unpaired reads" %(skipped)
|