Mercurial > repos > devteam > quality_filter
comparison quality_filter.py @ 0:8d65bbc52dfe draft
Imported from capsule None
| author | devteam |
|---|---|
| date | Tue, 01 Apr 2014 10:52:42 -0400 |
| parents | |
| children | 6c6f15373f96 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:8d65bbc52dfe |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 #Guruprasad Ananda | |
| 3 """ | |
| 4 Filter based on nucleotide quality (PHRED score). | |
| 5 | |
| 6 usage: %prog input out_file primary_species mask_species score mask_char mask_region mask_region_length | |
| 7 """ | |
| 8 | |
| 9 | |
| 10 from __future__ import division | |
| 11 from galaxy import eggs | |
| 12 import pkg_resources | |
| 13 pkg_resources.require( "lrucache" ) | |
| 14 import numpy | |
| 15 | |
| 16 import sys | |
| 17 import os, os.path | |
| 18 from UserDict import DictMixin | |
| 19 from bx.binned_array import FileBinnedArray | |
| 20 from bx.bitset import * | |
| 21 from bx.bitset_builders import * | |
| 22 from bx.cookbook import doc_optparse | |
| 23 from galaxy.tools.exception_handling import * | |
| 24 import bx.align.maf | |
| 25 | |
| 26 class FileBinnedArrayDir( DictMixin ): | |
| 27 """ | |
| 28 Adapter that makes a directory of FileBinnedArray files look like | |
| 29 a regular dict of BinnedArray objects. | |
| 30 """ | |
| 31 def __init__( self, dir ): | |
| 32 self.dir = dir | |
| 33 self.cache = dict() | |
| 34 def __getitem__( self, key ): | |
| 35 value = None | |
| 36 if key in self.cache: | |
| 37 value = self.cache[key] | |
| 38 else: | |
| 39 fname = os.path.join( self.dir, "%s.qa.bqv" % key ) | |
| 40 if os.path.exists( fname ): | |
| 41 value = FileBinnedArray( open( fname ) ) | |
| 42 self.cache[key] = value | |
| 43 if value is None: | |
| 44 raise KeyError( "File does not exist: " + fname ) | |
| 45 return value | |
| 46 | |
| 47 def stop_err(msg): | |
| 48 sys.stderr.write(msg) | |
| 49 sys.exit() | |
| 50 | |
| 51 def load_scores_ba_dir( dir ): | |
| 52 """ | |
| 53 Return a dict-like object (keyed by chromosome) that returns | |
| 54 FileBinnedArray objects created from "key.ba" files in `dir` | |
| 55 """ | |
| 56 return FileBinnedArrayDir( dir ) | |
| 57 | |
| 58 def bitwise_and ( string1, string2, maskch ): | |
| 59 result = [] | |
| 60 for i, ch in enumerate(string1): | |
| 61 try: | |
| 62 ch = int(ch) | |
| 63 except: | |
| 64 pass | |
| 65 if string2[i] == '-': | |
| 66 ch = 1 | |
| 67 if ch and string2[i]: | |
| 68 result.append(string2[i]) | |
| 69 else: | |
| 70 result.append(maskch) | |
| 71 return ''.join(result) | |
| 72 | |
| 73 def main(): | |
| 74 # Parsing Command Line here | |
| 75 options, args = doc_optparse.parse( __doc__ ) | |
| 76 | |
| 77 try: | |
| 78 #chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols ) | |
| 79 inp_file, out_file, pri_species, mask_species, qual_cutoff, mask_chr, mask_region, mask_length, loc_file = args | |
| 80 qual_cutoff = int(qual_cutoff) | |
| 81 mask_chr = int(mask_chr) | |
| 82 mask_region = int(mask_region) | |
| 83 if mask_region != 3: | |
| 84 mask_length = int(mask_length) | |
| 85 else: | |
| 86 mask_length_r = int(mask_length.split(',')[0]) | |
| 87 mask_length_l = int(mask_length.split(',')[1]) | |
| 88 except: | |
| 89 stop_err( "Data issue, click the pencil icon in the history item to correct the metadata attributes of the input dataset." ) | |
| 90 | |
| 91 if pri_species == 'None': | |
| 92 stop_err( "No primary species selected, try again by selecting at least one primary species." ) | |
| 93 if mask_species == 'None': | |
| 94 stop_err( "No mask species selected, try again by selecting at least one species to mask." ) | |
| 95 | |
| 96 mask_chr_count = 0 | |
| 97 mask_chr_dict = {0:'#', 1:'$', 2:'^', 3:'*', 4:'?', 5:'N'} | |
| 98 mask_reg_dict = {0:'Current pos', 1:'Current+Downstream', 2:'Current+Upstream', 3:'Current+Both sides'} | |
| 99 | |
| 100 #ensure dbkey is present in the twobit loc file | |
| 101 try: | |
| 102 pspecies_all = pri_species.split(',') | |
| 103 pspecies_all2 = pri_species.split(',') | |
| 104 pspecies = [] | |
| 105 filepaths = [] | |
| 106 for line in open(loc_file): | |
| 107 if pspecies_all2 == []: | |
| 108 break | |
| 109 if line[0:1] == "#": | |
| 110 continue | |
| 111 fields = line.split('\t') | |
| 112 try: | |
| 113 build = fields[0] | |
| 114 for i, dbkey in enumerate(pspecies_all2): | |
| 115 if dbkey == build: | |
| 116 pspecies.append(build) | |
| 117 filepaths.append(fields[1]) | |
| 118 del pspecies_all2[i] | |
| 119 else: | |
| 120 continue | |
| 121 except: | |
| 122 pass | |
| 123 except Exception, exc: | |
| 124 stop_err( 'Initialization errorL %s' % str( exc ) ) | |
| 125 | |
| 126 if len(pspecies) == 0: | |
| 127 stop_err( "Quality scores are not available for the following genome builds: %s" % ( pspecies_all2 ) ) | |
| 128 if len(pspecies) < len(pspecies_all): | |
| 129 print "Quality scores are not available for the following genome builds: %s" % (pspecies_all2) | |
| 130 | |
| 131 scores_by_chrom = [] | |
| 132 #Get scores for all the primary species | |
| 133 for file in filepaths: | |
| 134 scores_by_chrom.append(load_scores_ba_dir( file.strip() )) | |
| 135 | |
| 136 try: | |
| 137 maf_reader = bx.align.maf.Reader( open(inp_file, 'r') ) | |
| 138 maf_writer = bx.align.maf.Writer( open(out_file,'w') ) | |
| 139 except Exception, e: | |
| 140 stop_err( "Your MAF file appears to be malformed: %s" % str( e ) ) | |
| 141 | |
| 142 maf_count = 0 | |
| 143 for block in maf_reader: | |
| 144 status_strings = [] | |
| 145 for seq in range (len(block.components)): | |
| 146 src = block.components[seq].src | |
| 147 dbkey = src.split('.')[0] | |
| 148 chr = src.split('.')[1] | |
| 149 if not (dbkey in pspecies): | |
| 150 continue | |
| 151 else: #enter if the species is a primary species | |
| 152 index = pspecies.index(dbkey) | |
| 153 sequence = block.components[seq].text | |
| 154 s_start = block.components[seq].start | |
| 155 size = len(sequence) #this includes the gaps too | |
| 156 status_str = '1'*size | |
| 157 status_list = list(status_str) | |
| 158 if status_strings == []: | |
| 159 status_strings.append(status_str) | |
| 160 ind = 0 | |
| 161 s_end = block.components[seq].end | |
| 162 #Get scores for the entire sequence | |
| 163 try: | |
| 164 scores = scores_by_chrom[index][chr][s_start:s_end] | |
| 165 except: | |
| 166 continue | |
| 167 pos = 0 | |
| 168 while pos < (s_end-s_start): | |
| 169 if sequence[ind] == '-': #No score for GAPS | |
| 170 ind += 1 | |
| 171 continue | |
| 172 score = scores[pos] | |
| 173 if score < qual_cutoff: | |
| 174 score = 0 | |
| 175 | |
| 176 if not(score): | |
| 177 if mask_region == 0: #Mask Corresponding position only | |
| 178 status_list[ind] = '0' | |
| 179 ind += 1 | |
| 180 pos += 1 | |
| 181 elif mask_region == 1: #Mask Corresponding position + downstream neighbors | |
| 182 for n in range(mask_length+1): | |
| 183 try: | |
| 184 status_list[ind+n] = '0' | |
| 185 except: | |
| 186 pass | |
| 187 ind = ind + mask_length + 1 | |
| 188 pos = pos + mask_length + 1 | |
| 189 elif mask_region == 2: #Mask Corresponding position + upstream neighbors | |
| 190 for n in range(mask_length+1): | |
| 191 try: | |
| 192 status_list[ind-n] = '0' | |
| 193 except: | |
| 194 pass | |
| 195 ind += 1 | |
| 196 pos += 1 | |
| 197 elif mask_region == 3: #Mask Corresponding position + neighbors on both sides | |
| 198 for n in range(-mask_length_l, mask_length_r+1): | |
| 199 try: | |
| 200 status_list[ind+n] = '0' | |
| 201 except: | |
| 202 pass | |
| 203 ind = ind + mask_length_r + 1 | |
| 204 pos = pos + mask_length_r + 1 | |
| 205 else: | |
| 206 pos += 1 | |
| 207 ind += 1 | |
| 208 | |
| 209 status_strings.append(''.join(status_list)) | |
| 210 | |
| 211 if status_strings == []: #this block has no primary species | |
| 212 continue | |
| 213 output_status_str = status_strings[0] | |
| 214 for stat in status_strings[1:]: | |
| 215 try: | |
| 216 output_status_str = bitwise_and (status_strings[0], stat, '0') | |
| 217 except Exception, e: | |
| 218 break | |
| 219 | |
| 220 for seq in range (len(block.components)): | |
| 221 src = block.components[seq].src | |
| 222 dbkey = src.split('.')[0] | |
| 223 if dbkey not in mask_species.split(','): | |
| 224 continue | |
| 225 sequence = block.components[seq].text | |
| 226 sequence = bitwise_and (output_status_str, sequence, mask_chr_dict[mask_chr]) | |
| 227 block.components[seq].text = sequence | |
| 228 mask_chr_count += output_status_str.count('0') | |
| 229 maf_writer.write(block) | |
| 230 maf_count += 1 | |
| 231 | |
| 232 maf_reader.close() | |
| 233 maf_writer.close() | |
| 234 print "No. of blocks = %d; No. of masked nucleotides = %s; Mask character = %s; Mask region = %s; Cutoff used = %d" % (maf_count, mask_chr_count, mask_chr_dict[mask_chr], mask_reg_dict[mask_region], qual_cutoff) | |
| 235 | |
| 236 | |
| 237 if __name__ == "__main__": | |
| 238 main() |
