Mercurial > repos > xuebing > sharplabtool
comparison tools/regVariation/microsats_mutability.py @ 0:9071e359b9a3
Uploaded
| author | xuebing |
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9071e359b9a3 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 #Guruprasad Ananda | |
| 3 """ | |
| 4 This tool computes microsatellite mutability for the orthologous microsatellites fetched from 'Extract Orthologous Microsatellites from pair-wise alignments' tool. | |
| 5 """ | |
| 6 from galaxy import eggs | |
| 7 import sys, string, re, commands, tempfile, os, fileinput | |
| 8 from galaxy.tools.util.galaxyops import * | |
| 9 from bx.intervals.io import * | |
| 10 from bx.intervals.operations import quicksect | |
| 11 | |
| 12 fout = open(sys.argv[2],'w') | |
| 13 p_group = int(sys.argv[3]) #primary "group-by" feature | |
| 14 p_bin_size = int(sys.argv[4]) | |
| 15 s_group = int(sys.argv[5]) #sub-group by feature | |
| 16 s_bin_size = int(sys.argv[6]) | |
| 17 mono_threshold = 9 | |
| 18 non_mono_threshold = 4 | |
| 19 p_group_cols = [p_group, p_group+7] | |
| 20 s_group_cols = [s_group, s_group+7] | |
| 21 num_generations = int(sys.argv[7]) | |
| 22 region = sys.argv[8] | |
| 23 int_file = sys.argv[9] | |
| 24 if int_file != "None": #User has specified an interval file | |
| 25 try: | |
| 26 fint = open(int_file, 'r') | |
| 27 dbkey_i = sys.argv[10] | |
| 28 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[11] ) | |
| 29 except: | |
| 30 stop_err("Unable to open input Interval file") | |
| 31 | |
| 32 def stop_err(msg): | |
| 33 sys.stderr.write(msg) | |
| 34 sys.exit() | |
| 35 | |
| 36 def reverse_complement(text): | |
| 37 DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) | |
| 38 comp = [ch for ch in text.translate(DNA_COMP)] | |
| 39 comp.reverse() | |
| 40 return "".join(comp) | |
| 41 | |
| 42 def get_unique_elems(elems): | |
| 43 seen=set() | |
| 44 return[x for x in elems if x not in seen and not seen.add(x)] | |
| 45 | |
| 46 def get_binned_lists(uniqlist, binsize): | |
| 47 binnedlist=[] | |
| 48 uniqlist.sort() | |
| 49 start = int(uniqlist[0]) | |
| 50 bin_ind=0 | |
| 51 l_ind=0 | |
| 52 binnedlist.append([]) | |
| 53 while l_ind < len(uniqlist): | |
| 54 elem = int(uniqlist[l_ind]) | |
| 55 if elem in range(start,start+binsize): | |
| 56 binnedlist[bin_ind].append(elem) | |
| 57 else: | |
| 58 start += binsize | |
| 59 bin_ind += 1 | |
| 60 binnedlist.append([]) | |
| 61 binnedlist[bin_ind].append(elem) | |
| 62 l_ind += 1 | |
| 63 return binnedlist | |
| 64 | |
| 65 def fetch_weight(H,C,t): | |
| 66 if (H-(C-H)) < t: | |
| 67 return 2.0 | |
| 68 else: | |
| 69 return 1.0 | |
| 70 | |
| 71 def mutabilityEstimator(repeats1,repeats2,thresholds): | |
| 72 mut_num = 0.0 #Mutability Numerator | |
| 73 mut_den = 0.0 #Mutability denominator | |
| 74 for ind,H in enumerate(repeats1): | |
| 75 C = repeats2[ind] | |
| 76 t = thresholds[ind] | |
| 77 w = fetch_weight(H,C,t) | |
| 78 mut_num += ((H-C)*(H-C)*w) | |
| 79 mut_den += w | |
| 80 return [mut_num, mut_den] | |
| 81 | |
| 82 def output_writer(blk, blk_lines): | |
| 83 global winspecies, speciesind | |
| 84 all_elems_1=[] | |
| 85 all_elems_2=[] | |
| 86 all_s_elems_1=[] | |
| 87 all_s_elems_2=[] | |
| 88 for bline in blk_lines: | |
| 89 if not(bline): | |
| 90 continue | |
| 91 items = bline.split('\t') | |
| 92 seq1 = items[1] | |
| 93 start1 = items[2] | |
| 94 end1 = items[3] | |
| 95 seq2 = items[8] | |
| 96 start2 = items[9] | |
| 97 end2 = items[10] | |
| 98 if p_group_cols[0] == 6: | |
| 99 items[p_group_cols[0]] = int(items[p_group_cols[0]]) | |
| 100 items[p_group_cols[1]] = int(items[p_group_cols[1]]) | |
| 101 if s_group_cols[0] == 6: | |
| 102 items[s_group_cols[0]] = int(items[s_group_cols[0]]) | |
| 103 items[s_group_cols[1]] = int(items[s_group_cols[1]]) | |
| 104 all_elems_1.append(items[p_group_cols[0]]) #primary col elements for species 1 | |
| 105 all_elems_2.append(items[p_group_cols[1]]) #primary col elements for species 2 | |
| 106 if s_group_cols[0] != -1: #sub-group is not None | |
| 107 all_s_elems_1.append(items[s_group_cols[0]]) #secondary col elements for species 1 | |
| 108 all_s_elems_2.append(items[s_group_cols[1]]) #secondary col elements for species 2 | |
| 109 uniq_elems_1 = get_unique_elems(all_elems_1) | |
| 110 uniq_elems_2 = get_unique_elems(all_elems_2) | |
| 111 if s_group_cols[0] != -1: | |
| 112 uniq_s_elems_1 = get_unique_elems(all_s_elems_1) | |
| 113 uniq_s_elems_2 = get_unique_elems(all_s_elems_2) | |
| 114 mut1={} | |
| 115 mut2={} | |
| 116 count1 = {} | |
| 117 count2 = {} | |
| 118 """ | |
| 119 if p_group_cols[0] == 7: #i.e. the option chosen is group-by unit(AG, GTC, etc) | |
| 120 uniq_elems_1 = get_unique_units(j.sort(lambda x, y: len(x)-len(y))) | |
| 121 """ | |
| 122 if p_group_cols[0] == 6: #i.e. the option chosen is group-by repeat number. | |
| 123 uniq_elems_1 = get_binned_lists(uniq_elems_1,p_bin_size) | |
| 124 uniq_elems_2 = get_binned_lists(uniq_elems_2,p_bin_size) | |
| 125 | |
| 126 if s_group_cols[0] == 6: #i.e. the option chosen is subgroup-by repeat number. | |
| 127 uniq_s_elems_1 = get_binned_lists(uniq_s_elems_1,s_bin_size) | |
| 128 uniq_s_elems_2 = get_binned_lists(uniq_s_elems_2,s_bin_size) | |
| 129 | |
| 130 for pitem1 in uniq_elems_1: | |
| 131 #repeats1 = [] | |
| 132 #repeats2 = [] | |
| 133 thresholds = [] | |
| 134 if s_group_cols[0] != -1: #Sub-group by feature is not None | |
| 135 for sitem1 in uniq_s_elems_1: | |
| 136 repeats1 = [] | |
| 137 repeats2 = [] | |
| 138 if type(sitem1) == type(''): | |
| 139 sitem1 = sitem1.strip() | |
| 140 for bline in blk_lines: | |
| 141 belems = bline.split('\t') | |
| 142 if type(pitem1) == list: | |
| 143 if p_group_cols[0] == 6: | |
| 144 belems[p_group_cols[0]] = int(belems[p_group_cols[0]]) | |
| 145 if belems[p_group_cols[0]] in pitem1: | |
| 146 if belems[s_group_cols[0]]==sitem1: | |
| 147 repeats1.append(int(belems[6])) | |
| 148 repeats2.append(int(belems[13])) | |
| 149 if belems[4] == 'mononucleotide': | |
| 150 thresholds.append(mono_threshold) | |
| 151 else: | |
| 152 thresholds.append(non_mono_threshold) | |
| 153 mut1[str(pitem1)+'\t'+str(sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) | |
| 154 if region == 'align': | |
| 155 count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2)) | |
| 156 else: | |
| 157 if winspecies == 1: | |
| 158 count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1) | |
| 159 elif winspecies == 2: | |
| 160 count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2) | |
| 161 else: | |
| 162 if type(sitem1) == list: | |
| 163 if s_group_cols[0] == 6: | |
| 164 belems[s_group_cols[0]] = int(belems[s_group_cols[0]]) | |
| 165 if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]] in sitem1: | |
| 166 repeats1.append(int(belems[6])) | |
| 167 repeats2.append(int(belems[13])) | |
| 168 if belems[4] == 'mononucleotide': | |
| 169 thresholds.append(mono_threshold) | |
| 170 else: | |
| 171 thresholds.append(non_mono_threshold) | |
| 172 mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) | |
| 173 if region == 'align': | |
| 174 count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2)) | |
| 175 else: | |
| 176 if winspecies == 1: | |
| 177 count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats1) | |
| 178 elif winspecies == 2: | |
| 179 count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats2) | |
| 180 else: | |
| 181 if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]]==sitem1: | |
| 182 repeats1.append(int(belems[6])) | |
| 183 repeats2.append(int(belems[13])) | |
| 184 if belems[4] == 'mononucleotide': | |
| 185 thresholds.append(mono_threshold) | |
| 186 else: | |
| 187 thresholds.append(non_mono_threshold) | |
| 188 mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) | |
| 189 if region == 'align': | |
| 190 count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2)) | |
| 191 else: | |
| 192 if winspecies == 1: | |
| 193 count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1) | |
| 194 elif winspecies == 2: | |
| 195 count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2) | |
| 196 else: #Sub-group by feature is None | |
| 197 for bline in blk_lines: | |
| 198 belems = bline.split('\t') | |
| 199 if type(pitem1) == list: | |
| 200 #print >>sys.stderr, "item: " + str(item1) | |
| 201 if p_group_cols[0] == 6: | |
| 202 belems[p_group_cols[0]] = int(belems[p_group_cols[0]]) | |
| 203 if belems[p_group_cols[0]] in pitem1: | |
| 204 repeats1.append(int(belems[6])) | |
| 205 repeats2.append(int(belems[13])) | |
| 206 if belems[4] == 'mononucleotide': | |
| 207 thresholds.append(mono_threshold) | |
| 208 else: | |
| 209 thresholds.append(non_mono_threshold) | |
| 210 else: | |
| 211 if belems[p_group_cols[0]]==pitem1: | |
| 212 repeats1.append(int(belems[6])) | |
| 213 repeats2.append(int(belems[13])) | |
| 214 if belems[4] == 'mononucleotide': | |
| 215 thresholds.append(mono_threshold) | |
| 216 else: | |
| 217 thresholds.append(non_mono_threshold) | |
| 218 mut1["%s" %(pitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) | |
| 219 if region == 'align': | |
| 220 count1["%s" %(pitem1)]=min(sum(repeats1),sum(repeats2)) | |
| 221 else: | |
| 222 if winspecies == 1: | |
| 223 count1[str(pitem1)]=sum(repeats1) | |
| 224 elif winspecies == 2: | |
| 225 count1[str(pitem1)]=sum(repeats2) | |
| 226 | |
| 227 for pitem2 in uniq_elems_2: | |
| 228 #repeats1 = [] | |
| 229 #repeats2 = [] | |
| 230 thresholds = [] | |
| 231 if s_group_cols[0] != -1: #Sub-group by feature is not None | |
| 232 for sitem2 in uniq_s_elems_2: | |
| 233 repeats1 = [] | |
| 234 repeats2 = [] | |
| 235 if type(sitem2)==type(''): | |
| 236 sitem2 = sitem2.strip() | |
| 237 for bline in blk_lines: | |
| 238 belems = bline.split('\t') | |
| 239 if type(pitem2) == list: | |
| 240 if p_group_cols[0] == 6: | |
| 241 belems[p_group_cols[1]] = int(belems[p_group_cols[1]]) | |
| 242 if belems[p_group_cols[1]] in pitem2 and belems[s_group_cols[1]]==sitem2: | |
| 243 repeats2.append(int(belems[13])) | |
| 244 repeats1.append(int(belems[6])) | |
| 245 if belems[4] == 'mononucleotide': | |
| 246 thresholds.append(mono_threshold) | |
| 247 else: | |
| 248 thresholds.append(non_mono_threshold) | |
| 249 mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) | |
| 250 #count2[str(pitem2)+'\t'+str(sitem2)]=len(repeats2) | |
| 251 if region == 'align': | |
| 252 count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2)) | |
| 253 else: | |
| 254 if winspecies == 1: | |
| 255 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2) | |
| 256 elif winspecies == 2: | |
| 257 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1) | |
| 258 else: | |
| 259 if type(sitem2) == list: | |
| 260 if s_group_cols[0] == 6: | |
| 261 belems[s_group_cols[1]] = int(belems[s_group_cols[1]]) | |
| 262 if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]] in sitem2: | |
| 263 repeats2.append(int(belems[13])) | |
| 264 repeats1.append(int(belems[6])) | |
| 265 if belems[4] == 'mononucleotide': | |
| 266 thresholds.append(mono_threshold) | |
| 267 else: | |
| 268 thresholds.append(non_mono_threshold) | |
| 269 mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) | |
| 270 if region == 'align': | |
| 271 count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2)) | |
| 272 else: | |
| 273 if winspecies == 1: | |
| 274 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2) | |
| 275 elif winspecies == 2: | |
| 276 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1) | |
| 277 else: | |
| 278 if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]]==sitem2: | |
| 279 repeats1.append(int(belems[13])) | |
| 280 repeats2.append(int(belems[6])) | |
| 281 if belems[4] == 'mononucleotide': | |
| 282 thresholds.append(mono_threshold) | |
| 283 else: | |
| 284 thresholds.append(non_mono_threshold) | |
| 285 mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) | |
| 286 if region == 'align': | |
| 287 count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2)) | |
| 288 else: | |
| 289 if winspecies == 1: | |
| 290 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2) | |
| 291 elif winspecies == 2: | |
| 292 count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1) | |
| 293 else: #Sub-group by feature is None | |
| 294 for bline in blk_lines: | |
| 295 belems = bline.split('\t') | |
| 296 if type(pitem2) == list: | |
| 297 if p_group_cols[0] == 6: | |
| 298 belems[p_group_cols[1]] = int(belems[p_group_cols[1]]) | |
| 299 if belems[p_group_cols[1]] in pitem2: | |
| 300 repeats2.append(int(belems[13])) | |
| 301 repeats1.append(int(belems[6])) | |
| 302 if belems[4] == 'mononucleotide': | |
| 303 thresholds.append(mono_threshold) | |
| 304 else: | |
| 305 thresholds.append(non_mono_threshold) | |
| 306 else: | |
| 307 if belems[p_group_cols[1]]==pitem2: | |
| 308 repeats2.append(int(belems[13])) | |
| 309 repeats1.append(int(belems[6])) | |
| 310 if belems[4] == 'mononucleotide': | |
| 311 thresholds.append(mono_threshold) | |
| 312 else: | |
| 313 thresholds.append(non_mono_threshold) | |
| 314 mut2["%s" %(pitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) | |
| 315 if region == 'align': | |
| 316 count2["%s" %(pitem2)]=min(sum(repeats1),sum(repeats2)) | |
| 317 else: | |
| 318 if winspecies == 1: | |
| 319 count2["%s" %(pitem2)]=sum(repeats2) | |
| 320 elif winspecies == 2: | |
| 321 count2["%s" %(pitem2)]=sum(repeats1) | |
| 322 for key in mut1.keys(): | |
| 323 if key in mut2.keys(): | |
| 324 mut = (mut1[key][0]+mut2[key][0])/(mut1[key][1]+mut2[key][1]) | |
| 325 count = count1[key] | |
| 326 del mut2[key] | |
| 327 else: | |
| 328 unit_found = False | |
| 329 if p_group_cols[0] == 7 or s_group_cols[0] == 7: #if it is Repeat Unit (AG, GCT etc.) check for reverse-complements too | |
| 330 if p_group_cols[0] == 7: | |
| 331 this,other = 0,1 | |
| 332 else: | |
| 333 this,other = 1,0 | |
| 334 groups1 = key.split('\t') | |
| 335 mutn = mut1[key][0] | |
| 336 mutd = mut1[key][1] | |
| 337 count = 0 | |
| 338 for key2 in mut2.keys(): | |
| 339 groups2 = key2.split('\t') | |
| 340 if groups1[other] == groups2[other]: | |
| 341 if groups1[this] in groups2[this]*2 or reverse_complement(groups1[this]) in groups2[this]*2: | |
| 342 #mut = (mut1[key][0]+mut2[key2][0])/(mut1[key][1]+mut2[key2][1]) | |
| 343 mutn += mut2[key2][0] | |
| 344 mutd += mut2[key2][1] | |
| 345 count += int(count2[key2]) | |
| 346 unit_found = True | |
| 347 del mut2[key2] | |
| 348 #break | |
| 349 if unit_found: | |
| 350 mut = mutn/mutd | |
| 351 else: | |
| 352 mut = mut1[key][0]/mut1[key][1] | |
| 353 count = count1[key] | |
| 354 mut = "%.2e" %(mut/num_generations) | |
| 355 if region == 'align': | |
| 356 print >>fout, str(blk) + '\t'+seq1 + '\t' + seq2 + '\t' +key.strip()+ '\t'+str(mut) + '\t'+ str(count) | |
| 357 elif region == 'win': | |
| 358 fout.write("%s\t%s\t%s\t%s\n" %(blk,key.strip(),mut,count)) | |
| 359 fout.flush() | |
| 360 | |
| 361 #catch any remaining repeats, for instance if the orthologous position contained different repeat units | |
| 362 for remaining_key in mut2.keys(): | |
| 363 mut = mut2[remaining_key][0]/mut2[remaining_key][1] | |
| 364 mut = "%.2e" %(mut/num_generations) | |
| 365 count = count2[remaining_key] | |
| 366 if region == 'align': | |
| 367 print >>fout, str(blk) + '\t'+seq1 + '\t'+seq2 + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count) | |
| 368 elif region == 'win': | |
| 369 fout.write("%s\t%s\t%s\t%s\n" %(blk,remaining_key.strip(),mut,count)) | |
| 370 fout.flush() | |
| 371 #print >>fout, blk + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count) | |
| 372 | |
| 373 def counter(node, start, end, report_func): | |
| 374 if start <= node.start < end and start < node.end <= end: | |
| 375 report_func(node) | |
| 376 if node.right: | |
| 377 counter(node.right, start, end, report_func) | |
| 378 if node.left: | |
| 379 counter(node.left, start, end, report_func) | |
| 380 elif node.start < start and node.right: | |
| 381 counter(node.right, start, end, report_func) | |
| 382 elif node.start >= end and node.left and node.left.maxend > start: | |
| 383 counter(node.left, start, end, report_func) | |
| 384 | |
| 385 | |
| 386 def main(): | |
| 387 infile = sys.argv[1] | |
| 388 | |
| 389 for i, line in enumerate( file ( infile )): | |
| 390 line = line.rstrip('\r\n') | |
| 391 if len( line )>0 and not line.startswith( '#' ): | |
| 392 elems = line.split( '\t' ) | |
| 393 break | |
| 394 if i == 30: | |
| 395 break # Hopefully we'll never get here... | |
| 396 | |
| 397 if len( elems ) != 15: | |
| 398 stop_err( "This tool only works on tabular data output by 'Extract Orthologous Microsatellites from pair-wise alignments' tool. The data in your input dataset is either missing or not formatted properly." ) | |
| 399 global winspecies, speciesind | |
| 400 if region == 'win': | |
| 401 if dbkey_i in elems[1]: | |
| 402 winspecies = 1 | |
| 403 speciesind = 1 | |
| 404 elif dbkey_i in elems[8]: | |
| 405 winspecies = 2 | |
| 406 speciesind = 8 | |
| 407 else: | |
| 408 stop_err("The species build corresponding to your interval file is not present in the Microsatellite file.") | |
| 409 | |
| 410 fin = open(infile, 'r') | |
| 411 skipped = 0 | |
| 412 blk=0 | |
| 413 win=0 | |
| 414 linestr="" | |
| 415 | |
| 416 if region == 'win': | |
| 417 | |
| 418 msats = NiceReaderWrapper( fileinput.FileInput( infile ), | |
| 419 chrom_col = speciesind, | |
| 420 start_col = speciesind+1, | |
| 421 end_col = speciesind+2, | |
| 422 strand_col = -1, | |
| 423 fix_strand = True) | |
| 424 msatTree = quicksect.IntervalTree() | |
| 425 for item in msats: | |
| 426 if type( item ) is GenomicInterval: | |
| 427 msatTree.insert( item, msats.linenum, item.fields ) | |
| 428 | |
| 429 for iline in fint: | |
| 430 try: | |
| 431 iline = iline.rstrip('\r\n') | |
| 432 if not(iline) or iline == "": | |
| 433 continue | |
| 434 ielems = iline.strip("\r\n").split('\t') | |
| 435 ichr = ielems[chr_col_i] | |
| 436 istart = int(ielems[start_col_i]) | |
| 437 iend = int(ielems[end_col_i]) | |
| 438 isrc = "%s.%s" %(dbkey_i,ichr) | |
| 439 if isrc not in msatTree.chroms: | |
| 440 continue | |
| 441 result = [] | |
| 442 root = msatTree.chroms[isrc] #root node for the chrom | |
| 443 counter(root, istart, iend, lambda node: result.append( node )) | |
| 444 if not(result): | |
| 445 continue | |
| 446 tmpfile1 = tempfile.NamedTemporaryFile('wb+') | |
| 447 for node in result: | |
| 448 tmpfile1.write("%s\n" % "\t".join( node.other )) | |
| 449 | |
| 450 tmpfile1.seek(0) | |
| 451 output_writer(iline, tmpfile1.readlines()) | |
| 452 except: | |
| 453 skipped+=1 | |
| 454 if skipped: | |
| 455 print "Skipped %d intervals as invalid." %(skipped) | |
| 456 elif region == 'align': | |
| 457 if s_group_cols[0] != -1: | |
| 458 print >>fout, "#Window\tSpecies_1\tSpecies_2\tGroupby_Feature\tSubGroupby_Feature\tMutability\tCount" | |
| 459 else: | |
| 460 print >>fout, "#Window\tSpecies_1\tWindow_Start\tWindow_End\tSpecies_2\tGroupby_Feature\tMutability\tCount" | |
| 461 prev_bnum = -1 | |
| 462 try: | |
| 463 for line in fin: | |
| 464 line = line.strip("\r\n") | |
| 465 if not(line) or line == "": | |
| 466 continue | |
| 467 elems = line.split('\t') | |
| 468 try: | |
| 469 assert int(elems[0]) | |
| 470 assert len(elems) == 15 | |
| 471 except: | |
| 472 continue | |
| 473 new_bnum = int(elems[0]) | |
| 474 if new_bnum != prev_bnum: | |
| 475 if prev_bnum != -1: | |
| 476 output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n')) | |
| 477 linestr = line + "\n" | |
| 478 else: | |
| 479 linestr += line | |
| 480 linestr += "\n" | |
| 481 prev_bnum = new_bnum | |
| 482 output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n')) | |
| 483 except Exception, ea: | |
| 484 print >>sys.stderr, ea | |
| 485 skipped += 1 | |
| 486 if skipped: | |
| 487 print "Skipped %d lines as invalid." %(skipped) | |
| 488 if __name__ == "__main__": | |
| 489 main() |
