annotate microsats_alignment_level.py @ 2:ad471b193191

Fix blank results when running on NFS.
author Dave Bouvier <dave@bx.psu.edu>
date Fri, 16 Jan 2015 14:51:32 -0500
parents d4368a5a3fc7
children df7548445f4e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
1 #!/usr/bin/env python
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
2 #Guruprasad Ananda
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
3 """
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
4 Uses SPUTNIK to fetch microsatellites and extracts orthologous repeats from the sputnik output.
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
5 """
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
6 import os
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
7 import re
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
8 import string
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
9 import sys
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
10 import tempfile
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
11
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
12 def reverse_complement(text):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
13 DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
14 comp = [ch for ch in text.translate(DNA_COMP)]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
15 comp.reverse()
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
16 return "".join(comp)
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
17
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
18
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
19 def main():
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
20 if len(sys.argv) != 8:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
21 print >> sys.stderr, "Insufficient number of arguments."
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
22 sys.exit()
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
23
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
24 infile = open(sys.argv[1],'r')
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
25 separation = int(sys.argv[2])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
26 outfile = sys.argv[3]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
27 mono_threshold = int(sys.argv[5])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
28 non_mono_threshold = int(sys.argv[6])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
29 allow_different_units = int(sys.argv[7])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
30
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
31 print "Min distance = %d bp; Min threshold for mono repeats = %d; Min threshold for non-mono repeats = %d; Allow different motifs = %s" % ( separation, mono_threshold, non_mono_threshold, allow_different_units==1 )
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
32 try:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
33 fout = open(outfile, "w")
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
34 print >> fout, "#Block\tSeq1_Name\tSeq1_Start\tSeq1_End\tSeq1_Type\tSeq1_Length\tSeq1_RepeatNumber\tSeq1_Unit\tSeq2_Name\tSeq2_Start\tSeq2_End\tSeq2_Type\tSeq2_Length\tSeq2_RepeatNumber\tSeq2_Unit"
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
35 #sputnik_cmd = os.path.join(os.path.split(sys.argv[0])[0], "sputnik")
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
36 sputnik_cmd = "sputnik"
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
37 input = infile.read()
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
38 block_num = 0
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
39 input = input.replace('\r','\n')
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
40 for block in input.split('\n\n'):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
41 block_num += 1
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
42 tmpin = tempfile.NamedTemporaryFile()
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
43 tmpout = tempfile.NamedTemporaryFile()
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
44 tmpin.write(block.strip())
2
ad471b193191 Fix blank results when running on NFS.
Dave Bouvier <dave@bx.psu.edu>
parents: 0
diff changeset
45 tmpin.flush()
0
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
46 cmdline = sputnik_cmd + " " + tmpin.name + " > /dev/null 2>&1 >> " + tmpout.name
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
47 try:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
48 os.system(cmdline)
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
49 except Exception:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
50 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
51 sputnik_out = tmpout.read()
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
52 tmpin.close()
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
53 tmpout.close()
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
54 if sputnik_out != "":
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
55 if len(block.split('>')[1:]) != 2: #len(sputnik_out.split('>')):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
56 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
57 align_block = block.strip().split('>')
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
58
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
59 lendict = {'mononucleotide':1, 'dinucleotide':2, 'trinucleotide':3, 'tetranucleotide':4, 'pentanucleotide':5, 'hexanucleotide':6}
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
60 blockdict = {}
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
61 r = 0
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
62 namelist = []
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
63 for k, sput_block in enumerate(sputnik_out.split('>')[1:]):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
64 whole_seq = ''.join(align_block[k+1].split('\n')[1:]).replace('\n','').strip()
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
65 p = re.compile('\n(\S*nucleotide)')
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
66 repeats = p.split(sput_block.strip())
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
67 repeats_count = len(repeats)
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
68 j = 1
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
69 name = repeats[0].strip()
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
70 try:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
71 coords = re.search('\d+[-_:]\d+', name).group()
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
72 coords = coords.replace('_', '-').replace(':', '-')
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
73 except Exception:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
74 coords = '0-0'
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
75 r += 1
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
76 blockdict[r] = {}
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
77 try:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
78 sp_name = name[:name.index('.')]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
79 chr_name = name[name.index('.'):name.index('(')]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
80 namelist.append(sp_name + chr_name)
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
81 except:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
82 namelist.append(name[:20])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
83 while j < repeats_count:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
84 try:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
85 if repeats[j].strip() not in lendict:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
86 j += 2
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
87 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
88
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
89 if blockdict[r].has_key('types'):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
90 blockdict[r]['types'].append(repeats[j].strip()) #type of microsat
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
91 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
92 blockdict[r]['types'] = [repeats[j].strip()] #type of microsat
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
93
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
94 start = int(repeats[j+1].split('--')[0].split(':')[0].strip())
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
95 #check to see if there are gaps before the start of the repeat, and change the start accordingly
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
96 sgaps = 0
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
97 ch_pos = start - 1
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
98 while ch_pos >= 0:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
99 if whole_seq[ch_pos] == '-':
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
100 sgaps += 1
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
101 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
102 break #break at the 1st non-gap character
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
103 ch_pos -= 1
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
104 if blockdict[r].has_key('starts'):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
105 blockdict[r]['starts'].append(start+sgaps) #start co-ords adjusted with alignment co-ords to include GAPS
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
106 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
107 blockdict[r]['starts'] = [start+sgaps]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
108
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
109 end = int(repeats[j+1].split('--')[0].split(':')[1].strip())
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
110 #check to see if there are gaps after the end of the repeat, and change the end accordingly
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
111 egaps = 0
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
112 for ch in whole_seq[end:]:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
113 if ch == '-':
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
114 egaps += 1
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
115 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
116 break #break at the 1st non-gap character
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
117 if blockdict[r].has_key('ends'):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
118 blockdict[r]['ends'].append(end+egaps) #end co-ords adjusted with alignment co-ords to include GAPS
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
119 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
120 blockdict[r]['ends'] = [end+egaps]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
121
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
122 repeat_seq = ''.join(repeats[j+1].replace('\r','\n').split('\n')[1:]).strip() #Repeat Sequence
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
123 repeat_len = repeats[j+1].split('--')[1].split()[1].strip()
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
124 gap_count = repeat_seq.count('-')
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
125 #print repeats[j+1].split('--')[1], len(repeat_seq), repeat_len, gap_count
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
126 repeat_len = str(int(repeat_len) - gap_count)
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
127
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
128 rel_start = blockdict[r]['starts'][-1]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
129 gaps_before_start = whole_seq[:rel_start].count('-')
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
130
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
131 if blockdict[r].has_key('gaps_before_start'):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
132 blockdict[r]['gaps_before_start'].append(gaps_before_start) #lengths
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
133 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
134 blockdict[r]['gaps_before_start'] = [gaps_before_start] #lengths
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
135
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
136 whole_seq_start = int(coords.split('-')[0])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
137 if blockdict[r].has_key('whole_seq_start'):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
138 blockdict[r]['whole_seq_start'].append(whole_seq_start) #lengths
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
139 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
140 blockdict[r]['whole_seq_start'] = [whole_seq_start] #lengths
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
141
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
142 if blockdict[r].has_key('lengths'):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
143 blockdict[r]['lengths'].append(repeat_len) #lengths
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
144 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
145 blockdict[r]['lengths'] = [repeat_len] #lengths
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
146
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
147 if blockdict[r].has_key('counts'):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
148 blockdict[r]['counts'].append(str(int(repeat_len)/lendict[repeats[j].strip()])) #Repeat Unit
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
149 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
150 blockdict[r]['counts'] = [str(int(repeat_len)/lendict[repeats[j].strip()])] #Repeat Unit
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
151
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
152 if blockdict[r].has_key('units'):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
153 blockdict[r]['units'].append(repeat_seq[:lendict[repeats[j].strip()]]) #Repeat Unit
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
154 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
155 blockdict[r]['units'] = [repeat_seq[:lendict[repeats[j].strip()]]] #Repeat Unit
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
156
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
157 except Exception:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
158 pass
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
159 j += 2
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
160 #check the co-ords of all repeats corresponding to a sequence and remove adjacent repeats separated by less than the user-specified 'separation'.
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
161 delete_index_list = []
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
162 for ind, item in enumerate(blockdict[r]['ends']):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
163 try:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
164 if blockdict[r]['starts'][ind+1]-item < separation:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
165 if ind not in delete_index_list:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
166 delete_index_list.append(ind)
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
167 if ind+1 not in delete_index_list:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
168 delete_index_list.append(ind+1)
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
169 except Exception:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
170 pass
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
171 for index in delete_index_list: #mark them for deletion
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
172 try:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
173 blockdict[r]['starts'][index] = 'marked'
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
174 blockdict[r]['ends'][index] = 'marked'
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
175 blockdict[r]['types'][index] = 'marked'
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
176 blockdict[r]['gaps_before_start'][index] = 'marked'
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
177 blockdict[r]['whole_seq_start'][index] = 'marked'
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
178 blockdict[r]['lengths'][index] = 'marked'
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
179 blockdict[r]['counts'][index] = 'marked'
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
180 blockdict[r]['units'][index] = 'marked'
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
181 except Exception:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
182 pass
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
183 #remove 'marked' elements from all the lists
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
184 """
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
185 for key in blockdict[r].keys():
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
186 for elem in blockdict[r][key]:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
187 if elem == 'marked':
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
188 blockdict[r][key].remove(elem)
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
189 """
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
190 #print blockdict
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
191
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
192 #make sure that the blockdict has keys for both the species
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
193 if (1 not in blockdict) or (2 not in blockdict):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
194 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
195
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
196 visited_2 = [0 for x in range(len(blockdict[2]['starts']))]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
197 for ind1, coord_s1 in enumerate(blockdict[1]['starts']):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
198 if coord_s1 == 'marked':
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
199 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
200 coord_e1 = blockdict[1]['ends'][ind1]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
201 out = []
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
202 for ind2, coord_s2 in enumerate(blockdict[2]['starts']):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
203 if coord_s2 == 'marked':
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
204 visited_2[ind2] = 1
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
205 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
206 coord_e2 = blockdict[2]['ends'][ind2]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
207 #skip if the 2 repeats are not of the same type or don't have the same repeating unit.
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
208 if allow_different_units == 0:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
209 if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
210 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
211 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
212 if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2) and (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
213 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
214 #print >> sys.stderr, (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2)
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
215 #skip if the repeat number thresholds are not met
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
216 if blockdict[1]['types'][ind1] == 'mononucleotide':
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
217 if (int(blockdict[1]['counts'][ind1]) < mono_threshold):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
218 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
219 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
220 if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
221 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
222
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
223 if blockdict[2]['types'][ind2] == 'mononucleotide':
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
224 if (int(blockdict[2]['counts'][ind2]) < mono_threshold):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
225 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
226 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
227 if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
228 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
229 #print "s1,e1=%s,%s; s2,e2=%s,%s" % ( coord_s1, coord_e1, coord_s2, coord_e2 )
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
230 if (coord_s1 in range(coord_s2, coord_e2)) or (coord_e1 in range(coord_s2, coord_e2)):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
231 out.append(str(block_num))
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
232 out.append(namelist[0])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
233 rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
234 rel_end = rel_start + int(blockdict[1]['lengths'][ind1])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
235 out.append(str(rel_start))
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
236 out.append(str(rel_end))
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
237 out.append(blockdict[1]['types'][ind1])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
238 out.append(blockdict[1]['lengths'][ind1])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
239 out.append(blockdict[1]['counts'][ind1])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
240 out.append(blockdict[1]['units'][ind1])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
241 out.append(namelist[1])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
242 rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
243 rel_end = rel_start + int(blockdict[2]['lengths'][ind2])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
244 out.append(str(rel_start))
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
245 out.append(str(rel_end))
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
246 out.append(blockdict[2]['types'][ind2])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
247 out.append(blockdict[2]['lengths'][ind2])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
248 out.append(blockdict[2]['counts'][ind2])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
249 out.append(blockdict[2]['units'][ind2])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
250 print >> fout, '\t'.join(out)
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
251 visited_2[ind2] = 1
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
252 out = []
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
253
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
254 if 0 in visited_2: #there are still some elements in 2nd set which haven't found orthologs yet.
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
255 for ind2, coord_s2 in enumerate(blockdict[2]['starts']):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
256 if coord_s2 == 'marked':
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
257 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
258 if visited_2[ind] != 0:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
259 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
260 coord_e2 = blockdict[2]['ends'][ind2]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
261 out = []
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
262 for ind1, coord_s1 in enumerate(blockdict[1]['starts']):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
263 if coord_s1 == 'marked':
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
264 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
265 coord_e1 = blockdict[1]['ends'][ind1]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
266 #skip if the 2 repeats are not of the same type or don't have the same repeating unit.
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
267 if allow_different_units == 0:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
268 if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
269 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
270 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
271 if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2):# and reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
272 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
273 #skip if the repeat number thresholds are not met
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
274 if blockdict[1]['types'][ind1] == 'mononucleotide':
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
275 if (int(blockdict[1]['counts'][ind1]) < mono_threshold):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
276 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
277 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
278 if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
279 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
280
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
281 if blockdict[2]['types'][ind2] == 'mononucleotide':
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
282 if (int(blockdict[2]['counts'][ind2]) < mono_threshold):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
283 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
284 else:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
285 if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
286 continue
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
287
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
288 if (coord_s2 in range(coord_s1, coord_e1)) or (coord_e2 in range(coord_s1, coord_e1)):
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
289 out.append(str(block_num))
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
290 out.append(namelist[0])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
291 rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
292 rel_end = rel_start + int(blockdict[1]['lengths'][ind1])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
293 out.append(str(rel_start))
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
294 out.append(str(rel_end))
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
295 out.append(blockdict[1]['types'][ind1])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
296 out.append(blockdict[1]['lengths'][ind1])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
297 out.append(blockdict[1]['counts'][ind1])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
298 out.append(blockdict[1]['units'][ind1])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
299 out.append(namelist[1])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
300 rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2]
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
301 rel_end = rel_start + int(blockdict[2]['lengths'][ind2])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
302 out.append(str(rel_start))
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
303 out.append(str(rel_end))
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
304 out.append(blockdict[2]['types'][ind2])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
305 out.append(blockdict[2]['lengths'][ind2])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
306 out.append(blockdict[2]['counts'][ind2])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
307 out.append(blockdict[2]['units'][ind2])
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
308 print >> fout, '\t'.join(out)
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
309 visited_2[ind2] = 1
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
310 out = []
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
311
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
312 #print >> fout, blockdict
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
313 except Exception, exc:
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
314 print >> sys.stderr, "type(exc),args,exc: %s, %s, %s" % ( type(exc), exc.args, exc )
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
315
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
316
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
317 if __name__ == "__main__":
d4368a5a3fc7 Imported from capsule None
devteam
parents:
diff changeset
318 main()