annotate PEsortedSAM2readprofile.py @ 14:20dc70f85ff7 draft default tip

Uploaded
author arkarachai-fungtammasan
date Sun, 24 Jul 2016 18:15:32 -0400
parents a3113043abb0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
1 #!/usr/bin/env python
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
2
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
3 import sys
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
4 from galaxy import eggs
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
5 import pkg_resources
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
6 pkg_resources.require( "bx-python" )
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
7 import bx.seq.twobit
12
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
8 import re
0
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
9
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
10 ##output columns: read_name chr prefix_start prefix_end TR_start TR_end suffix_start suffix_end TR_length TR_sequence
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
11
12
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
12 def readlengthinfer(sequence):
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
13 total_readlength=0
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
14 countlist=['M','I','S','X','=']
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
15 not_countlist=['D','P','N','H']
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
16 matchs=re.findall(r"(\d+)([A-Z=])",sequence)
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
17 for match in matchs:
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
18 if match[1] in countlist:
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
19 total_readlength+=int(match[0])
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
20 return total_readlength
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
21
0
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
22 samf = open(sys.argv[1],'r') #assumes sam file is sorted by readname
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
23 seq_path = sys.argv[2] #Path to the reference genome in 2bit format
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
24
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
25 ##maxTRlength=int(sys.argv[4])
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
26 ##maxoriginalreadlength=int(sys.argv[5])
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
27 maxTRlength=int(sys.argv[3])
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
28 maxoriginalreadlength=int(sys.argv[4])
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
29 outfile=sys.argv[5]
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
30 fout = open(outfile,'w')
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
31
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
32 twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) )
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
33
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
34 skipped=0
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
35 while True:
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
36 read = samf.readline().strip()
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
37 if not(read): #EOF reached
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
38 break
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
39 if read[0] == "@":
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
40 #print read
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
41 continue
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
42 mate = samf.readline().strip()
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
43 if not(mate): #EOF reached
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
44 break
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
45 read_elems = read.split()
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
46 mate_elems = mate.split()
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
47 read_name = read_elems[0].strip()
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
48 mate_name = mate_elems[0].strip()
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
49 while True:
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
50 if read_name == mate_name:
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
51 break
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
52 elif read_name != mate_name:
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
53 #print >>sys.stderr, "Input SAM file doesn't seem to be sorted by readname. Please sort and retry."
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
54 #break
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
55 skipped += 1
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
56 read = mate
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
57 read_elems = mate_elems
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
58 mate = samf.readline().strip()
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
59 read_name = read_elems[0].strip()
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
60 mate_name = mate_elems[0].strip()
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
61 if not(mate): #EOF reached
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
62 break
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
63 mate_elems = mate.split()
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
64 #extract XT:A tag
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
65 #for e in read_elems:
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
66 # if e.startswith('XT:A'):
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
67 # read_xt = e
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
68 #for e in mate_elems:
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
69 # if e.startswith('XT:A'):
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
70 # mate_xt = e
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
71 #if 'XT:A:U' not in read_elems or 'XT:A:U' not in mate_elems: #both read and it's mate need to be mapped uniquely
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
72 # continue
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
73 read_chr = read_elems[2]
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
74 read_start = int(read_elems[3])
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
75 read_cigar = read_elems[5]
12
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
76 # if len(read_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
77 # continue
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
78 # read_len = int(read_cigar.split('M')[0])
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
79 read_len=readlengthinfer(read_cigar)
0
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
80 mate_chr = mate_elems[2]
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
81 mate_start = int(mate_elems[3])
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
82 mate_cigar = mate_elems[5]
12
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
83 # if len(mate_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
84 # continue
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
85 # mate_len = int(mate_cigar.split('M')[0])
a3113043abb0 Uploaded
arkarachai-fungtammasan
parents: 0
diff changeset
86 mate_len=readlengthinfer(mate_cigar)
0
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
87 if read_chr != mate_chr: # check that they were mapped to the same chromosome
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
88 continue
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
89 if abs(read_start - mate_start) > (maxoriginalreadlength+maxTRlength):
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
90 continue
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
91 if read_start < mate_start:
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
92 pre_s = read_start-1
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
93 pre_e = read_start-1+read_len
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
94 tr_s = read_start-1+read_len
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
95 tr_e = mate_start-1
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
96 suf_s = mate_start-1
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
97 suf_e = mate_start-1+mate_len
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
98 else:
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
99 pre_s = mate_start-1
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
100 pre_e = mate_start-1+mate_len
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
101 tr_s = mate_start-1+mate_len
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
102 tr_e = read_start-1
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
103 suf_s = read_start-1
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
104 suf_e = read_start-1+read_len
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
105 tr_len = abs(tr_e - tr_s)
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
106 if tr_len > maxTRlength:
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
107 continue
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
108 if pre_e >= suf_s: #overlapping prefix and suffix
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
109 continue
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
110 tr_ref_seq = twobitfile[read_chr][tr_s:tr_e]
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
111 ##print >>fout, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %(read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq)
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
112 fout.writelines('\t'.join(map(str,[read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq]))+'\n')
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
113
07588b899c13 Uploaded
arkarachai-fungtammasan
parents:
diff changeset
114 print "Skipped %d unpaired reads" %(skipped)