Mercurial > repos > arkarachai-fungtammasan > str_fm
comparison PEsortedSAM2readprofile.py @ 12:a3113043abb0 draft
Uploaded
author | arkarachai-fungtammasan |
---|---|
date | Sun, 24 Jul 2016 17:55:50 -0400 |
parents | 07588b899c13 |
children |
comparison
equal
deleted
inserted
replaced
11:48b5f719e36a | 12:a3113043abb0 |
---|---|
3 import sys | 3 import sys |
4 from galaxy import eggs | 4 from galaxy import eggs |
5 import pkg_resources | 5 import pkg_resources |
6 pkg_resources.require( "bx-python" ) | 6 pkg_resources.require( "bx-python" ) |
7 import bx.seq.twobit | 7 import bx.seq.twobit |
8 import re | |
8 | 9 |
9 ##output columns: read_name chr prefix_start prefix_end TR_start TR_end suffix_start suffix_end TR_length TR_sequence | 10 ##output columns: read_name chr prefix_start prefix_end TR_start TR_end suffix_start suffix_end TR_length TR_sequence |
10 | 11 |
12 def readlengthinfer(sequence): | |
13 total_readlength=0 | |
14 countlist=['M','I','S','X','='] | |
15 not_countlist=['D','P','N','H'] | |
16 matchs=re.findall(r"(\d+)([A-Z=])",sequence) | |
17 for match in matchs: | |
18 if match[1] in countlist: | |
19 total_readlength+=int(match[0]) | |
20 return total_readlength | |
21 | |
11 samf = open(sys.argv[1],'r') #assumes sam file is sorted by readname | 22 samf = open(sys.argv[1],'r') #assumes sam file is sorted by readname |
12 seq_path = sys.argv[2] #Path to the reference genome in 2bit format | 23 seq_path = sys.argv[2] #Path to the reference genome in 2bit format |
13 | 24 |
14 ##maxTRlength=int(sys.argv[4]) | 25 ##maxTRlength=int(sys.argv[4]) |
15 ##maxoriginalreadlength=int(sys.argv[5]) | 26 ##maxoriginalreadlength=int(sys.argv[5]) |
60 #if 'XT:A:U' not in read_elems or 'XT:A:U' not in mate_elems: #both read and it's mate need to be mapped uniquely | 71 #if 'XT:A:U' not in read_elems or 'XT:A:U' not in mate_elems: #both read and it's mate need to be mapped uniquely |
61 # continue | 72 # continue |
62 read_chr = read_elems[2] | 73 read_chr = read_elems[2] |
63 read_start = int(read_elems[3]) | 74 read_start = int(read_elems[3]) |
64 read_cigar = read_elems[5] | 75 read_cigar = read_elems[5] |
65 if len(read_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M | 76 # if len(read_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M |
66 continue | 77 # continue |
67 read_len = int(read_cigar.split('M')[0]) | 78 # read_len = int(read_cigar.split('M')[0]) |
79 read_len=readlengthinfer(read_cigar) | |
68 mate_chr = mate_elems[2] | 80 mate_chr = mate_elems[2] |
69 mate_start = int(mate_elems[3]) | 81 mate_start = int(mate_elems[3]) |
70 mate_cigar = mate_elems[5] | 82 mate_cigar = mate_elems[5] |
71 if len(mate_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M | 83 # if len(mate_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M |
72 continue | 84 # continue |
73 mate_len = int(mate_cigar.split('M')[0]) | 85 # mate_len = int(mate_cigar.split('M')[0]) |
86 mate_len=readlengthinfer(mate_cigar) | |
74 if read_chr != mate_chr: # check that they were mapped to the same chromosome | 87 if read_chr != mate_chr: # check that they were mapped to the same chromosome |
75 continue | 88 continue |
76 if abs(read_start - mate_start) > (maxoriginalreadlength+maxTRlength): | 89 if abs(read_start - mate_start) > (maxoriginalreadlength+maxTRlength): |
77 continue | 90 continue |
78 if read_start < mate_start: | 91 if read_start < mate_start: |