Mercurial > repos > devteam > microsats_mutability
comparison microsats_mutability.py @ 0:4aa1ee5d8510 draft
Imported from capsule None
| author | devteam |
|---|---|
| date | Tue, 01 Apr 2014 10:52:29 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4aa1ee5d8510 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 #Guruprasad Ananda | |
| 3 """ | |
| 4 This tool computes microsatellite mutability for the orthologous microsatellites fetched from 'Extract Orthologous Microsatellites from pair-wise alignments' tool. | |
| 5 """ | |
| 6 import fileinput | |
| 7 import string | |
| 8 import sys | |
| 9 import tempfile | |
| 10 from galaxy.tools.util.galaxyops import * | |
| 11 from bx.intervals.io import * | |
| 12 from bx.intervals.operations import quicksect | |
| 13 | |
| 14 fout = open(sys.argv[2],'w') | |
| 15 p_group = int(sys.argv[3]) #primary "group-by" feature | |
| 16 p_bin_size = int(sys.argv[4]) | |
| 17 s_group = int(sys.argv[5]) #sub-group by feature | |
| 18 s_bin_size = int(sys.argv[6]) | |
| 19 mono_threshold = 9 | |
| 20 non_mono_threshold = 4 | |
| 21 p_group_cols = [p_group, p_group+7] | |
| 22 s_group_cols = [s_group, s_group+7] | |
| 23 num_generations = int(sys.argv[7]) | |
| 24 region = sys.argv[8] | |
| 25 int_file = sys.argv[9] | |
| 26 if int_file != "None": #User has specified an interval file | |
| 27 try: | |
| 28 fint = open(int_file, 'r') | |
| 29 dbkey_i = sys.argv[10] | |
| 30 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[11] ) | |
| 31 except: | |
| 32 stop_err("Unable to open input Interval file") | |
| 33 | |
| 34 | |
| 35 def stop_err(msg): | |
| 36 sys.stderr.write(msg) | |
| 37 sys.exit() | |
| 38 | |
| 39 | |
| 40 def reverse_complement(text): | |
| 41 DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) | |
| 42 comp = [ch for ch in text.translate(DNA_COMP)] | |
| 43 comp.reverse() | |
| 44 return "".join(comp) | |
| 45 | |
| 46 | |
| 47 def get_unique_elems(elems): | |
| 48 seen = set() | |
| 49 return[x for x in elems if x not in seen and not seen.add(x)] | |
| 50 | |
| 51 | |
| 52 def get_binned_lists(uniqlist, binsize): | |
| 53 binnedlist = [] | |
| 54 uniqlist.sort() | |
| 55 start = int(uniqlist[0]) | |
| 56 bin_ind = 0 | |
| 57 l_ind = 0 | |
| 58 binnedlist.append([]) | |
| 59 while l_ind < len(uniqlist): | |
| 60 elem = int(uniqlist[l_ind]) | |
| 61 if elem in range(start, start+binsize): | |
| 62 binnedlist[bin_ind].append(elem) | |
| 63 else: | |
| 64 start += binsize | |
| 65 bin_ind += 1 | |
| 66 binnedlist.append([]) | |
| 67 binnedlist[bin_ind].append(elem) | |
| 68 l_ind += 1 | |
| 69 return binnedlist | |
| 70 | |
| 71 | |
| 72 def fetch_weight(H, C, t): | |
| 73 if (H-(C-H)) < t: | |
| 74 return 2.0 | |
| 75 else: | |
| 76 return 1.0 | |
| 77 | |
| 78 | |
| 79 def mutabilityEstimator(repeats1, repeats2, thresholds): | |
| 80 mut_num = 0.0 #Mutability Numerator | |
| 81 mut_den = 0.0 #Mutability denominator | |
| 82 for ind, H in enumerate(repeats1): | |
| 83 C = repeats2[ind] | |
| 84 t = thresholds[ind] | |
| 85 w = fetch_weight(H, C, t) | |
| 86 mut_num += ((H-C)*(H-C)*w) | |
| 87 mut_den += w | |
| 88 return [mut_num, mut_den] | |
| 89 | |
| 90 | |
| 91 def output_writer(blk, blk_lines): | |
| 92 global winspecies, speciesind | |
| 93 all_elems_1 = [] | |
| 94 all_elems_2 = [] | |
| 95 all_s_elems_1 = [] | |
| 96 all_s_elems_2 = [] | |
| 97 for bline in blk_lines: | |
| 98 if not(bline): | |
| 99 continue | |
| 100 items = bline.split('\t') | |
| 101 seq1 = items[1] | |
| 102 seq2 = items[8] | |
| 103 if p_group_cols[0] == 6: | |
| 104 items[p_group_cols[0]] = int(items[p_group_cols[0]]) | |
| 105 items[p_group_cols[1]] = int(items[p_group_cols[1]]) | |
| 106 if s_group_cols[0] == 6: | |
| 107 items[s_group_cols[0]] = int(items[s_group_cols[0]]) | |
| 108 items[s_group_cols[1]] = int(items[s_group_cols[1]]) | |
| 109 all_elems_1.append(items[p_group_cols[0]]) #primary col elements for species 1 | |
| 110 all_elems_2.append(items[p_group_cols[1]]) #primary col elements for species 2 | |
| 111 if s_group_cols[0] != -1: #sub-group is not None | |
| 112 all_s_elems_1.append(items[s_group_cols[0]]) #secondary col elements for species 1 | |
| 113 all_s_elems_2.append(items[s_group_cols[1]]) #secondary col elements for species 2 | |
| 114 uniq_elems_1 = get_unique_elems(all_elems_1) | |
| 115 uniq_elems_2 = get_unique_elems(all_elems_2) | |
| 116 if s_group_cols[0] != -1: | |
| 117 uniq_s_elems_1 = get_unique_elems(all_s_elems_1) | |
| 118 uniq_s_elems_2 = get_unique_elems(all_s_elems_2) | |
| 119 mut1 = {} | |
| 120 mut2 = {} | |
| 121 count1 = {} | |
| 122 count2 = {} | |
| 123 """ | |
| 124 if p_group_cols[0] == 7: #i.e. the option chosen is group-by unit(AG, GTC, etc) | |
| 125 uniq_elems_1 = get_unique_units(j.sort(lambda x, y: len(x)-len(y))) | |
| 126 """ | |
| 127 if p_group_cols[0] == 6: #i.e. the option chosen is group-by repeat number. | |
| 128 uniq_elems_1 = get_binned_lists( uniq_elems_1, p_bin_size ) | |
| 129 uniq_elems_2 = get_binned_lists( uniq_elems_2, p_bin_size ) | |
| 130 | |
| 131 if s_group_cols[0] == 6: #i.e. the option chosen is subgroup-by repeat number. | |
| 132 uniq_s_elems_1 = get_binned_lists( uniq_s_elems_1, s_bin_size ) | |
| 133 uniq_s_elems_2 = get_binned_lists( uniq_s_elems_2, s_bin_size ) | |
| 134 | |
| 135 for pitem1 in uniq_elems_1: | |
| 136 #repeats1 = [] | |
| 137 #repeats2 = [] | |
| 138 thresholds = [] | |
| 139 if s_group_cols[0] != -1: #Sub-group by feature is not None | |
| 140 for sitem1 in uniq_s_elems_1: | |
| 141 repeats1 = [] | |
| 142 repeats2 = [] | |
| 143 if type(sitem1) == type(''): | |
| 144 sitem1 = sitem1.strip() | |
| 145 for bline in blk_lines: | |
| 146 belems = bline.split('\t') | |
| 147 if type(pitem1) == list: | |
| 148 if p_group_cols[0] == 6: | |
| 149 belems[p_group_cols[0]] = int(belems[p_group_cols[0]]) | |
| 150 if belems[p_group_cols[0]] in pitem1: | |
| 151 if belems[s_group_cols[0]] == sitem1: | |
| 152 repeats1.append(int(belems[6])) | |
| 153 repeats2.append(int(belems[13])) | |
| 154 if belems[4] == 'mononucleotide': | |
| 155 thresholds.append(mono_threshold) | |
| 156 else: | |
| 157 thresholds.append(non_mono_threshold) | |
| 158 mut1[str(pitem1)+'\t'+str(sitem1)] = mutabilityEstimator( repeats1, repeats2, thresholds ) | |
| 159 if region == 'align': | |
| 160 count1[str(pitem1)+'\t'+str(sitem1)] = min( sum(repeats1), sum(repeats2) ) | |
| 161 else: | |
| 162 if winspecies == 1: | |
| 163 count1["%s\t%s" % ( pitem1, sitem1 )] = sum(repeats1) | |
| 164 elif winspecies == 2: | |
| 165 count1["%s\t%s" % ( pitem1, sitem1 )] = sum(repeats2) | |
| 166 else: | |
| 167 if type(sitem1) == list: | |
| 168 if s_group_cols[0] == 6: | |
| 169 belems[s_group_cols[0]] = int(belems[s_group_cols[0]]) | |
| 170 if belems[p_group_cols[0]] == pitem1 and belems[s_group_cols[0]] in sitem1: | |
| 171 repeats1.append(int(belems[6])) | |
| 172 repeats2.append(int(belems[13])) | |
| 173 if belems[4] == 'mononucleotide': | |
| 174 thresholds.append(mono_threshold) | |
| 175 else: | |
| 176 thresholds.append(non_mono_threshold) | |
| 177 mut1["%s\t%s" % ( pitem1, sitem1 )] = mutabilityEstimator( repeats1, repeats2, thresholds ) | |
| 178 if region == 'align': | |
| 179 count1[str(pitem1)+'\t'+str(sitem1)] = min( sum(repeats1), sum(repeats2) ) | |
| 180 else: | |
| 181 if winspecies == 1: | |
| 182 count1[str(pitem1)+'\t'+str(sitem1)] = sum(repeats1) | |
| 183 elif winspecies == 2: | |
| 184 count1[str(pitem1)+'\t'+str(sitem1)] = sum(repeats2) | |
| 185 else: | |
| 186 if belems[p_group_cols[0]] == pitem1 and belems[s_group_cols[0]] == sitem1: | |
| 187 repeats1.append(int(belems[6])) | |
| 188 repeats2.append(int(belems[13])) | |
| 189 if belems[4] == 'mononucleotide': | |
| 190 thresholds.append(mono_threshold) | |
| 191 else: | |
| 192 thresholds.append(non_mono_threshold) | |
| 193 mut1["%s\t%s" % ( pitem1, sitem1 )] = mutabilityEstimator( repeats1, repeats2, thresholds ) | |
| 194 if region == 'align': | |
| 195 count1[str(pitem1)+'\t'+str(sitem1)] = min( sum(repeats1), sum(repeats2) ) | |
| 196 else: | |
| 197 if winspecies == 1: | |
| 198 count1["%s\t%s" % ( pitem1, sitem1 )] = sum(repeats1) | |
| 199 elif winspecies == 2: | |
| 200 count1["%s\t%s" % ( pitem1, sitem1 )] = sum(repeats2) | |
| 201 else: #Sub-group by feature is None | |
| 202 for bline in blk_lines: | |
| 203 belems = bline.split('\t') | |
| 204 if type(pitem1) == list: | |
| 205 #print >> sys.stderr, "item: " + str(item1) | |
| 206 if p_group_cols[0] == 6: | |
| 207 belems[p_group_cols[0]] = int(belems[p_group_cols[0]]) | |
| 208 if belems[p_group_cols[0]] in pitem1: | |
| 209 repeats1.append(int(belems[6])) | |
| 210 repeats2.append(int(belems[13])) | |
| 211 if belems[4] == 'mononucleotide': | |
| 212 thresholds.append(mono_threshold) | |
| 213 else: | |
| 214 thresholds.append(non_mono_threshold) | |
| 215 else: | |
| 216 if belems[p_group_cols[0]] == pitem1: | |
| 217 repeats1.append(int(belems[6])) | |
| 218 repeats2.append(int(belems[13])) | |
| 219 if belems[4] == 'mononucleotide': | |
| 220 thresholds.append(mono_threshold) | |
| 221 else: | |
| 222 thresholds.append(non_mono_threshold) | |
| 223 mut1["%s" % (pitem1)] = mutabilityEstimator( repeats1, repeats2, thresholds ) | |
| 224 if region == 'align': | |
| 225 count1["%s" % (pitem1)] = min( sum(repeats1), sum(repeats2) ) | |
| 226 else: | |
| 227 if winspecies == 1: | |
| 228 count1[str(pitem1)] = sum(repeats1) | |
| 229 elif winspecies == 2: | |
| 230 count1[str(pitem1)] = sum(repeats2) | |
| 231 | |
| 232 for pitem2 in uniq_elems_2: | |
| 233 #repeats1 = [] | |
| 234 #repeats2 = [] | |
| 235 thresholds = [] | |
| 236 if s_group_cols[0] != -1: #Sub-group by feature is not None | |
| 237 for sitem2 in uniq_s_elems_2: | |
| 238 repeats1 = [] | |
| 239 repeats2 = [] | |
| 240 if type(sitem2)==type(''): | |
| 241 sitem2 = sitem2.strip() | |
| 242 for bline in blk_lines: | |
| 243 belems = bline.split('\t') | |
| 244 if type(pitem2) == list: | |
| 245 if p_group_cols[0] == 6: | |
| 246 belems[p_group_cols[1]] = int(belems[p_group_cols[1]]) | |
| 247 if belems[p_group_cols[1]] in pitem2 and belems[s_group_cols[1]] == sitem2: | |
| 248 repeats2.append(int(belems[13])) | |
| 249 repeats1.append(int(belems[6])) | |
| 250 if belems[4] == 'mononucleotide': | |
| 251 thresholds.append(mono_threshold) | |
| 252 else: | |
| 253 thresholds.append(non_mono_threshold) | |
| 254 mut2["%s\t%s" % ( pitem2, sitem2 )] = mutabilityEstimator( repeats2, repeats1, thresholds ) | |
| 255 #count2[str(pitem2)+'\t'+str(sitem2)]=len(repeats2) | |
| 256 if region == 'align': | |
| 257 count2["%s\t%s" % ( pitem2, sitem2 )] = min( sum(repeats1), sum(repeats2) ) | |
| 258 else: | |
| 259 if winspecies == 1: | |
| 260 count2["%s\t%s" % ( pitem2, sitem2 )] = len(repeats2) | |
| 261 elif winspecies == 2: | |
| 262 count2["%s\t%s" % ( pitem2, sitem2 )] = len(repeats1) | |
| 263 else: | |
| 264 if type(sitem2) == list: | |
| 265 if s_group_cols[0] == 6: | |
| 266 belems[s_group_cols[1]] = int(belems[s_group_cols[1]]) | |
| 267 if belems[p_group_cols[1]] == pitem2 and belems[s_group_cols[1]] in sitem2: | |
| 268 repeats2.append(int(belems[13])) | |
| 269 repeats1.append(int(belems[6])) | |
| 270 if belems[4] == 'mononucleotide': | |
| 271 thresholds.append(mono_threshold) | |
| 272 else: | |
| 273 thresholds.append(non_mono_threshold) | |
| 274 mut2["%s\t%s" % ( pitem2, sitem2 )] = mutabilityEstimator( repeats2, repeats1, thresholds ) | |
| 275 if region == 'align': | |
| 276 count2["%s\t%s" % ( pitem2, sitem2 )] = min( sum(repeats1), sum(repeats2) ) | |
| 277 else: | |
| 278 if winspecies == 1: | |
| 279 count2["%s\t%s" % ( pitem2, sitem2 )] = len(repeats2) | |
| 280 elif winspecies == 2: | |
| 281 count2["%s\t%s" % ( pitem2, sitem2 )] = len(repeats1) | |
| 282 else: | |
| 283 if belems[p_group_cols[1]] == pitem2 and belems[s_group_cols[1]] == sitem2: | |
| 284 repeats1.append(int(belems[13])) | |
| 285 repeats2.append(int(belems[6])) | |
| 286 if belems[4] == 'mononucleotide': | |
| 287 thresholds.append(mono_threshold) | |
| 288 else: | |
| 289 thresholds.append(non_mono_threshold) | |
| 290 mut2["%s\t%s" % ( pitem2, sitem2 )] = mutabilityEstimator( repeats2, repeats1, thresholds ) | |
| 291 if region == 'align': | |
| 292 count2["%s\t%s" % ( pitem2, sitem2 )] = min( sum(repeats1), sum(repeats2) ) | |
| 293 else: | |
| 294 if winspecies == 1: | |
| 295 count2["%s\t%s" % ( pitem2, sitem2 )] = len(repeats2) | |
| 296 elif winspecies == 2: | |
| 297 count2["%s\t%s" % ( pitem2, sitem2 )] = len(repeats1) | |
| 298 else: #Sub-group by feature is None | |
| 299 for bline in blk_lines: | |
| 300 belems = bline.split('\t') | |
| 301 if type(pitem2) == list: | |
| 302 if p_group_cols[0] == 6: | |
| 303 belems[p_group_cols[1]] = int(belems[p_group_cols[1]]) | |
| 304 if belems[p_group_cols[1]] in pitem2: | |
| 305 repeats2.append(int(belems[13])) | |
| 306 repeats1.append(int(belems[6])) | |
| 307 if belems[4] == 'mononucleotide': | |
| 308 thresholds.append(mono_threshold) | |
| 309 else: | |
| 310 thresholds.append(non_mono_threshold) | |
| 311 else: | |
| 312 if belems[p_group_cols[1]] == pitem2: | |
| 313 repeats2.append(int(belems[13])) | |
| 314 repeats1.append(int(belems[6])) | |
| 315 if belems[4] == 'mononucleotide': | |
| 316 thresholds.append(mono_threshold) | |
| 317 else: | |
| 318 thresholds.append(non_mono_threshold) | |
| 319 mut2["%s" % (pitem2)] = mutabilityEstimator( repeats2, repeats1, thresholds ) | |
| 320 if region == 'align': | |
| 321 count2["%s" % (pitem2)] = min( sum(repeats1), sum(repeats2) ) | |
| 322 else: | |
| 323 if winspecies == 1: | |
| 324 count2["%s" % (pitem2)] = sum(repeats2) | |
| 325 elif winspecies == 2: | |
| 326 count2["%s" % (pitem2)] = sum(repeats1) | |
| 327 for key in mut1.keys(): | |
| 328 if key in mut2.keys(): | |
| 329 mut = (mut1[key][0]+mut2[key][0])/(mut1[key][1]+mut2[key][1]) | |
| 330 count = count1[key] | |
| 331 del mut2[key] | |
| 332 else: | |
| 333 unit_found = False | |
| 334 if p_group_cols[0] == 7 or s_group_cols[0] == 7: #if it is Repeat Unit (AG, GCT etc.) check for reverse-complements too | |
| 335 if p_group_cols[0] == 7: | |
| 336 this, other = 0, 1 | |
| 337 else: | |
| 338 this, other = 1, 0 | |
| 339 groups1 = key.split('\t') | |
| 340 mutn = mut1[key][0] | |
| 341 mutd = mut1[key][1] | |
| 342 count = 0 | |
| 343 for key2 in mut2.keys(): | |
| 344 groups2 = key2.split('\t') | |
| 345 if groups1[other] == groups2[other]: | |
| 346 if groups1[this] in groups2[this]*2 or reverse_complement(groups1[this]) in groups2[this]*2: | |
| 347 #mut = (mut1[key][0]+mut2[key2][0])/(mut1[key][1]+mut2[key2][1]) | |
| 348 mutn += mut2[key2][0] | |
| 349 mutd += mut2[key2][1] | |
| 350 count += int(count2[key2]) | |
| 351 unit_found = True | |
| 352 del mut2[key2] | |
| 353 #break | |
| 354 if unit_found: | |
| 355 mut = mutn/mutd | |
| 356 else: | |
| 357 mut = mut1[key][0]/mut1[key][1] | |
| 358 count = count1[key] | |
| 359 mut = "%.2e" % (mut/num_generations) | |
| 360 if region == 'align': | |
| 361 print >> fout, str(blk) + '\t'+seq1 + '\t' + seq2 + '\t' +key.strip()+ '\t'+str(mut) + '\t'+ str(count) | |
| 362 elif region == 'win': | |
| 363 fout.write("%s\t%s\t%s\t%s\n" % ( blk, key.strip(), mut, count )) | |
| 364 fout.flush() | |
| 365 | |
| 366 #catch any remaining repeats, for instance if the orthologous position contained different repeat units | |
| 367 for remaining_key in mut2.keys(): | |
| 368 mut = mut2[remaining_key][0]/mut2[remaining_key][1] | |
| 369 mut = "%.2e" % (mut/num_generations) | |
| 370 count = count2[remaining_key] | |
| 371 if region == 'align': | |
| 372 print >> fout, str(blk) + '\t'+seq1 + '\t'+seq2 + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count) | |
| 373 elif region == 'win': | |
| 374 fout.write("%s\t%s\t%s\t%s\n" % ( blk, remaining_key.strip(), mut, count )) | |
| 375 fout.flush() | |
| 376 #print >> fout, blk + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count) | |
| 377 | |
| 378 | |
| 379 def counter(node, start, end, report_func): | |
| 380 if start <= node.start < end and start < node.end <= end: | |
| 381 report_func(node) | |
| 382 if node.right: | |
| 383 counter(node.right, start, end, report_func) | |
| 384 if node.left: | |
| 385 counter(node.left, start, end, report_func) | |
| 386 elif node.start < start and node.right: | |
| 387 counter(node.right, start, end, report_func) | |
| 388 elif node.start >= end and node.left and node.left.maxend > start: | |
| 389 counter(node.left, start, end, report_func) | |
| 390 | |
| 391 | |
| 392 def main(): | |
| 393 infile = sys.argv[1] | |
| 394 | |
| 395 for i, line in enumerate( file ( infile )): | |
| 396 line = line.rstrip('\r\n') | |
| 397 if len( line )>0 and not line.startswith( '#' ): | |
| 398 elems = line.split( '\t' ) | |
| 399 break | |
| 400 if i == 30: | |
| 401 break # Hopefully we'll never get here... | |
| 402 | |
| 403 if len( elems ) != 15: | |
| 404 stop_err( "This tool only works on tabular data output by 'Extract Orthologous Microsatellites from pair-wise alignments' tool. The data in your input dataset is either missing or not formatted properly." ) | |
| 405 global winspecies, speciesind | |
| 406 if region == 'win': | |
| 407 if dbkey_i in elems[1]: | |
| 408 winspecies = 1 | |
| 409 speciesind = 1 | |
| 410 elif dbkey_i in elems[8]: | |
| 411 winspecies = 2 | |
| 412 speciesind = 8 | |
| 413 else: | |
| 414 stop_err("The species build corresponding to your interval file is not present in the Microsatellite file.") | |
| 415 | |
| 416 fin = open(infile, 'r') | |
| 417 skipped = 0 | |
| 418 linestr = "" | |
| 419 | |
| 420 if region == 'win': | |
| 421 msats = NiceReaderWrapper( fileinput.FileInput( infile ), | |
| 422 chrom_col = speciesind, | |
| 423 start_col = speciesind+1, | |
| 424 end_col = speciesind+2, | |
| 425 strand_col = -1, | |
| 426 fix_strand = True) | |
| 427 msatTree = quicksect.IntervalTree() | |
| 428 for item in msats: | |
| 429 if type( item ) is GenomicInterval: | |
| 430 msatTree.insert( item, msats.linenum, item.fields ) | |
| 431 | |
| 432 for iline in fint: | |
| 433 try: | |
| 434 iline = iline.rstrip('\r\n') | |
| 435 if not(iline) or iline == "": | |
| 436 continue | |
| 437 ielems = iline.strip("\r\n").split('\t') | |
| 438 ichr = ielems[chr_col_i] | |
| 439 istart = int(ielems[start_col_i]) | |
| 440 iend = int(ielems[end_col_i]) | |
| 441 isrc = "%s.%s" % ( dbkey_i, ichr ) | |
| 442 if isrc not in msatTree.chroms: | |
| 443 continue | |
| 444 result = [] | |
| 445 root = msatTree.chroms[isrc] #root node for the chrom | |
| 446 counter(root, istart, iend, lambda node: result.append( node )) | |
| 447 if not(result): | |
| 448 continue | |
| 449 tmpfile1 = tempfile.NamedTemporaryFile('wb+') | |
| 450 for node in result: | |
| 451 tmpfile1.write("%s\n" % "\t".join( node.other )) | |
| 452 | |
| 453 tmpfile1.seek(0) | |
| 454 output_writer(iline, tmpfile1.readlines()) | |
| 455 except: | |
| 456 skipped += 1 | |
| 457 if skipped: | |
| 458 print "Skipped %d intervals as invalid." % (skipped) | |
| 459 elif region == 'align': | |
| 460 if s_group_cols[0] != -1: | |
| 461 print >> fout, "#Window\tSpecies_1\tSpecies_2\tGroupby_Feature\tSubGroupby_Feature\tMutability\tCount" | |
| 462 else: | |
| 463 print >> fout, "#Window\tSpecies_1\tWindow_Start\tWindow_End\tSpecies_2\tGroupby_Feature\tMutability\tCount" | |
| 464 prev_bnum = -1 | |
| 465 try: | |
| 466 for line in fin: | |
| 467 line = line.strip("\r\n") | |
| 468 if not(line) or line == "": | |
| 469 continue | |
| 470 elems = line.split('\t') | |
| 471 try: | |
| 472 assert int(elems[0]) | |
| 473 assert len(elems) == 15 | |
| 474 except: | |
| 475 continue | |
| 476 new_bnum = int(elems[0]) | |
| 477 if new_bnum != prev_bnum: | |
| 478 if prev_bnum != -1: | |
| 479 output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n')) | |
| 480 linestr = line + "\n" | |
| 481 else: | |
| 482 linestr += line | |
| 483 linestr += "\n" | |
| 484 prev_bnum = new_bnum | |
| 485 output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n')) | |
| 486 except Exception, ea: | |
| 487 print >> sys.stderr, ea | |
| 488 skipped += 1 | |
| 489 if skipped: | |
| 490 print "Skipped %d lines as invalid." % (skipped) | |
| 491 | |
| 492 | |
| 493 if __name__ == "__main__": | |
| 494 main() |
