Mercurial > repos > devteam > microsats_alignment_level
annotate microsats_alignment_level.py @ 2:ad471b193191
Fix blank results when running on NFS.
| author | Dave Bouvier <dave@bx.psu.edu> | 
|---|---|
| date | Fri, 16 Jan 2015 14:51:32 -0500 | 
| parents | d4368a5a3fc7 | 
| children | df7548445f4e | 
| rev | line source | 
|---|---|
| 0 | 1 #!/usr/bin/env python | 
| 2 #Guruprasad Ananda | |
| 3 """ | |
| 4 Uses SPUTNIK to fetch microsatellites and extracts orthologous repeats from the sputnik output. | |
| 5 """ | |
| 6 import os | |
| 7 import re | |
| 8 import string | |
| 9 import sys | |
| 10 import tempfile | |
| 11 | |
| 12 def reverse_complement(text): | |
| 13 DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) | |
| 14 comp = [ch for ch in text.translate(DNA_COMP)] | |
| 15 comp.reverse() | |
| 16 return "".join(comp) | |
| 17 | |
| 18 | |
| 19 def main(): | |
| 20 if len(sys.argv) != 8: | |
| 21 print >> sys.stderr, "Insufficient number of arguments." | |
| 22 sys.exit() | |
| 23 | |
| 24 infile = open(sys.argv[1],'r') | |
| 25 separation = int(sys.argv[2]) | |
| 26 outfile = sys.argv[3] | |
| 27 mono_threshold = int(sys.argv[5]) | |
| 28 non_mono_threshold = int(sys.argv[6]) | |
| 29 allow_different_units = int(sys.argv[7]) | |
| 30 | |
| 31 print "Min distance = %d bp; Min threshold for mono repeats = %d; Min threshold for non-mono repeats = %d; Allow different motifs = %s" % ( separation, mono_threshold, non_mono_threshold, allow_different_units==1 ) | |
| 32 try: | |
| 33 fout = open(outfile, "w") | |
| 34 print >> fout, "#Block\tSeq1_Name\tSeq1_Start\tSeq1_End\tSeq1_Type\tSeq1_Length\tSeq1_RepeatNumber\tSeq1_Unit\tSeq2_Name\tSeq2_Start\tSeq2_End\tSeq2_Type\tSeq2_Length\tSeq2_RepeatNumber\tSeq2_Unit" | |
| 35 #sputnik_cmd = os.path.join(os.path.split(sys.argv[0])[0], "sputnik") | |
| 36 sputnik_cmd = "sputnik" | |
| 37 input = infile.read() | |
| 38 block_num = 0 | |
| 39 input = input.replace('\r','\n') | |
| 40 for block in input.split('\n\n'): | |
| 41 block_num += 1 | |
| 42 tmpin = tempfile.NamedTemporaryFile() | |
| 43 tmpout = tempfile.NamedTemporaryFile() | |
| 44 tmpin.write(block.strip()) | |
| 
2
 
ad471b193191
Fix blank results when running on NFS.
 
Dave Bouvier <dave@bx.psu.edu> 
parents: 
0 
diff
changeset
 | 
45 tmpin.flush() | 
| 0 | 46 cmdline = sputnik_cmd + " " + tmpin.name + " > /dev/null 2>&1 >> " + tmpout.name | 
| 47 try: | |
| 48 os.system(cmdline) | |
| 49 except Exception: | |
| 50 continue | |
| 51 sputnik_out = tmpout.read() | |
| 52 tmpin.close() | |
| 53 tmpout.close() | |
| 54 if sputnik_out != "": | |
| 55 if len(block.split('>')[1:]) != 2: #len(sputnik_out.split('>')): | |
| 56 continue | |
| 57 align_block = block.strip().split('>') | |
| 58 | |
| 59 lendict = {'mononucleotide':1, 'dinucleotide':2, 'trinucleotide':3, 'tetranucleotide':4, 'pentanucleotide':5, 'hexanucleotide':6} | |
| 60 blockdict = {} | |
| 61 r = 0 | |
| 62 namelist = [] | |
| 63 for k, sput_block in enumerate(sputnik_out.split('>')[1:]): | |
| 64 whole_seq = ''.join(align_block[k+1].split('\n')[1:]).replace('\n','').strip() | |
| 65 p = re.compile('\n(\S*nucleotide)') | |
| 66 repeats = p.split(sput_block.strip()) | |
| 67 repeats_count = len(repeats) | |
| 68 j = 1 | |
| 69 name = repeats[0].strip() | |
| 70 try: | |
| 71 coords = re.search('\d+[-_:]\d+', name).group() | |
| 72 coords = coords.replace('_', '-').replace(':', '-') | |
| 73 except Exception: | |
| 74 coords = '0-0' | |
| 75 r += 1 | |
| 76 blockdict[r] = {} | |
| 77 try: | |
| 78 sp_name = name[:name.index('.')] | |
| 79 chr_name = name[name.index('.'):name.index('(')] | |
| 80 namelist.append(sp_name + chr_name) | |
| 81 except: | |
| 82 namelist.append(name[:20]) | |
| 83 while j < repeats_count: | |
| 84 try: | |
| 85 if repeats[j].strip() not in lendict: | |
| 86 j += 2 | |
| 87 continue | |
| 88 | |
| 89 if blockdict[r].has_key('types'): | |
| 90 blockdict[r]['types'].append(repeats[j].strip()) #type of microsat | |
| 91 else: | |
| 92 blockdict[r]['types'] = [repeats[j].strip()] #type of microsat | |
| 93 | |
| 94 start = int(repeats[j+1].split('--')[0].split(':')[0].strip()) | |
| 95 #check to see if there are gaps before the start of the repeat, and change the start accordingly | |
| 96 sgaps = 0 | |
| 97 ch_pos = start - 1 | |
| 98 while ch_pos >= 0: | |
| 99 if whole_seq[ch_pos] == '-': | |
| 100 sgaps += 1 | |
| 101 else: | |
| 102 break #break at the 1st non-gap character | |
| 103 ch_pos -= 1 | |
| 104 if blockdict[r].has_key('starts'): | |
| 105 blockdict[r]['starts'].append(start+sgaps) #start co-ords adjusted with alignment co-ords to include GAPS | |
| 106 else: | |
| 107 blockdict[r]['starts'] = [start+sgaps] | |
| 108 | |
| 109 end = int(repeats[j+1].split('--')[0].split(':')[1].strip()) | |
| 110 #check to see if there are gaps after the end of the repeat, and change the end accordingly | |
| 111 egaps = 0 | |
| 112 for ch in whole_seq[end:]: | |
| 113 if ch == '-': | |
| 114 egaps += 1 | |
| 115 else: | |
| 116 break #break at the 1st non-gap character | |
| 117 if blockdict[r].has_key('ends'): | |
| 118 blockdict[r]['ends'].append(end+egaps) #end co-ords adjusted with alignment co-ords to include GAPS | |
| 119 else: | |
| 120 blockdict[r]['ends'] = [end+egaps] | |
| 121 | |
| 122 repeat_seq = ''.join(repeats[j+1].replace('\r','\n').split('\n')[1:]).strip() #Repeat Sequence | |
| 123 repeat_len = repeats[j+1].split('--')[1].split()[1].strip() | |
| 124 gap_count = repeat_seq.count('-') | |
| 125 #print repeats[j+1].split('--')[1], len(repeat_seq), repeat_len, gap_count | |
| 126 repeat_len = str(int(repeat_len) - gap_count) | |
| 127 | |
| 128 rel_start = blockdict[r]['starts'][-1] | |
| 129 gaps_before_start = whole_seq[:rel_start].count('-') | |
| 130 | |
| 131 if blockdict[r].has_key('gaps_before_start'): | |
| 132 blockdict[r]['gaps_before_start'].append(gaps_before_start) #lengths | |
| 133 else: | |
| 134 blockdict[r]['gaps_before_start'] = [gaps_before_start] #lengths | |
| 135 | |
| 136 whole_seq_start = int(coords.split('-')[0]) | |
| 137 if blockdict[r].has_key('whole_seq_start'): | |
| 138 blockdict[r]['whole_seq_start'].append(whole_seq_start) #lengths | |
| 139 else: | |
| 140 blockdict[r]['whole_seq_start'] = [whole_seq_start] #lengths | |
| 141 | |
| 142 if blockdict[r].has_key('lengths'): | |
| 143 blockdict[r]['lengths'].append(repeat_len) #lengths | |
| 144 else: | |
| 145 blockdict[r]['lengths'] = [repeat_len] #lengths | |
| 146 | |
| 147 if blockdict[r].has_key('counts'): | |
| 148 blockdict[r]['counts'].append(str(int(repeat_len)/lendict[repeats[j].strip()])) #Repeat Unit | |
| 149 else: | |
| 150 blockdict[r]['counts'] = [str(int(repeat_len)/lendict[repeats[j].strip()])] #Repeat Unit | |
| 151 | |
| 152 if blockdict[r].has_key('units'): | |
| 153 blockdict[r]['units'].append(repeat_seq[:lendict[repeats[j].strip()]]) #Repeat Unit | |
| 154 else: | |
| 155 blockdict[r]['units'] = [repeat_seq[:lendict[repeats[j].strip()]]] #Repeat Unit | |
| 156 | |
| 157 except Exception: | |
| 158 pass | |
| 159 j += 2 | |
| 160 #check the co-ords of all repeats corresponding to a sequence and remove adjacent repeats separated by less than the user-specified 'separation'. | |
| 161 delete_index_list = [] | |
| 162 for ind, item in enumerate(blockdict[r]['ends']): | |
| 163 try: | |
| 164 if blockdict[r]['starts'][ind+1]-item < separation: | |
| 165 if ind not in delete_index_list: | |
| 166 delete_index_list.append(ind) | |
| 167 if ind+1 not in delete_index_list: | |
| 168 delete_index_list.append(ind+1) | |
| 169 except Exception: | |
| 170 pass | |
| 171 for index in delete_index_list: #mark them for deletion | |
| 172 try: | |
| 173 blockdict[r]['starts'][index] = 'marked' | |
| 174 blockdict[r]['ends'][index] = 'marked' | |
| 175 blockdict[r]['types'][index] = 'marked' | |
| 176 blockdict[r]['gaps_before_start'][index] = 'marked' | |
| 177 blockdict[r]['whole_seq_start'][index] = 'marked' | |
| 178 blockdict[r]['lengths'][index] = 'marked' | |
| 179 blockdict[r]['counts'][index] = 'marked' | |
| 180 blockdict[r]['units'][index] = 'marked' | |
| 181 except Exception: | |
| 182 pass | |
| 183 #remove 'marked' elements from all the lists | |
| 184 """ | |
| 185 for key in blockdict[r].keys(): | |
| 186 for elem in blockdict[r][key]: | |
| 187 if elem == 'marked': | |
| 188 blockdict[r][key].remove(elem) | |
| 189 """ | |
| 190 #print blockdict | |
| 191 | |
| 192 #make sure that the blockdict has keys for both the species | |
| 193 if (1 not in blockdict) or (2 not in blockdict): | |
| 194 continue | |
| 195 | |
| 196 visited_2 = [0 for x in range(len(blockdict[2]['starts']))] | |
| 197 for ind1, coord_s1 in enumerate(blockdict[1]['starts']): | |
| 198 if coord_s1 == 'marked': | |
| 199 continue | |
| 200 coord_e1 = blockdict[1]['ends'][ind1] | |
| 201 out = [] | |
| 202 for ind2, coord_s2 in enumerate(blockdict[2]['starts']): | |
| 203 if coord_s2 == 'marked': | |
| 204 visited_2[ind2] = 1 | |
| 205 continue | |
| 206 coord_e2 = blockdict[2]['ends'][ind2] | |
| 207 #skip if the 2 repeats are not of the same type or don't have the same repeating unit. | |
| 208 if allow_different_units == 0: | |
| 209 if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]): | |
| 210 continue | |
| 211 else: | |
| 212 if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2) and (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2): | |
| 213 continue | |
| 214 #print >> sys.stderr, (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2) | |
| 215 #skip if the repeat number thresholds are not met | |
| 216 if blockdict[1]['types'][ind1] == 'mononucleotide': | |
| 217 if (int(blockdict[1]['counts'][ind1]) < mono_threshold): | |
| 218 continue | |
| 219 else: | |
| 220 if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold): | |
| 221 continue | |
| 222 | |
| 223 if blockdict[2]['types'][ind2] == 'mononucleotide': | |
| 224 if (int(blockdict[2]['counts'][ind2]) < mono_threshold): | |
| 225 continue | |
| 226 else: | |
| 227 if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold): | |
| 228 continue | |
| 229 #print "s1,e1=%s,%s; s2,e2=%s,%s" % ( coord_s1, coord_e1, coord_s2, coord_e2 ) | |
| 230 if (coord_s1 in range(coord_s2, coord_e2)) or (coord_e1 in range(coord_s2, coord_e2)): | |
| 231 out.append(str(block_num)) | |
| 232 out.append(namelist[0]) | |
| 233 rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1] | |
| 234 rel_end = rel_start + int(blockdict[1]['lengths'][ind1]) | |
| 235 out.append(str(rel_start)) | |
| 236 out.append(str(rel_end)) | |
| 237 out.append(blockdict[1]['types'][ind1]) | |
| 238 out.append(blockdict[1]['lengths'][ind1]) | |
| 239 out.append(blockdict[1]['counts'][ind1]) | |
| 240 out.append(blockdict[1]['units'][ind1]) | |
| 241 out.append(namelist[1]) | |
| 242 rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2] | |
| 243 rel_end = rel_start + int(blockdict[2]['lengths'][ind2]) | |
| 244 out.append(str(rel_start)) | |
| 245 out.append(str(rel_end)) | |
| 246 out.append(blockdict[2]['types'][ind2]) | |
| 247 out.append(blockdict[2]['lengths'][ind2]) | |
| 248 out.append(blockdict[2]['counts'][ind2]) | |
| 249 out.append(blockdict[2]['units'][ind2]) | |
| 250 print >> fout, '\t'.join(out) | |
| 251 visited_2[ind2] = 1 | |
| 252 out = [] | |
| 253 | |
| 254 if 0 in visited_2: #there are still some elements in 2nd set which haven't found orthologs yet. | |
| 255 for ind2, coord_s2 in enumerate(blockdict[2]['starts']): | |
| 256 if coord_s2 == 'marked': | |
| 257 continue | |
| 258 if visited_2[ind] != 0: | |
| 259 continue | |
| 260 coord_e2 = blockdict[2]['ends'][ind2] | |
| 261 out = [] | |
| 262 for ind1, coord_s1 in enumerate(blockdict[1]['starts']): | |
| 263 if coord_s1 == 'marked': | |
| 264 continue | |
| 265 coord_e1 = blockdict[1]['ends'][ind1] | |
| 266 #skip if the 2 repeats are not of the same type or don't have the same repeating unit. | |
| 267 if allow_different_units == 0: | |
| 268 if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]): | |
| 269 continue | |
| 270 else: | |
| 271 if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2):# and reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2: | |
| 272 continue | |
| 273 #skip if the repeat number thresholds are not met | |
| 274 if blockdict[1]['types'][ind1] == 'mononucleotide': | |
| 275 if (int(blockdict[1]['counts'][ind1]) < mono_threshold): | |
| 276 continue | |
| 277 else: | |
| 278 if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold): | |
| 279 continue | |
| 280 | |
| 281 if blockdict[2]['types'][ind2] == 'mononucleotide': | |
| 282 if (int(blockdict[2]['counts'][ind2]) < mono_threshold): | |
| 283 continue | |
| 284 else: | |
| 285 if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold): | |
| 286 continue | |
| 287 | |
| 288 if (coord_s2 in range(coord_s1, coord_e1)) or (coord_e2 in range(coord_s1, coord_e1)): | |
| 289 out.append(str(block_num)) | |
| 290 out.append(namelist[0]) | |
| 291 rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1] | |
| 292 rel_end = rel_start + int(blockdict[1]['lengths'][ind1]) | |
| 293 out.append(str(rel_start)) | |
| 294 out.append(str(rel_end)) | |
| 295 out.append(blockdict[1]['types'][ind1]) | |
| 296 out.append(blockdict[1]['lengths'][ind1]) | |
| 297 out.append(blockdict[1]['counts'][ind1]) | |
| 298 out.append(blockdict[1]['units'][ind1]) | |
| 299 out.append(namelist[1]) | |
| 300 rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2] | |
| 301 rel_end = rel_start + int(blockdict[2]['lengths'][ind2]) | |
| 302 out.append(str(rel_start)) | |
| 303 out.append(str(rel_end)) | |
| 304 out.append(blockdict[2]['types'][ind2]) | |
| 305 out.append(blockdict[2]['lengths'][ind2]) | |
| 306 out.append(blockdict[2]['counts'][ind2]) | |
| 307 out.append(blockdict[2]['units'][ind2]) | |
| 308 print >> fout, '\t'.join(out) | |
| 309 visited_2[ind2] = 1 | |
| 310 out = [] | |
| 311 | |
| 312 #print >> fout, blockdict | |
| 313 except Exception, exc: | |
| 314 print >> sys.stderr, "type(exc),args,exc: %s, %s, %s" % ( type(exc), exc.args, exc ) | |
| 315 | |
| 316 | |
| 317 if __name__ == "__main__": | |
| 318 main() | 
