Mercurial > repos > jaredgk > ppp_vcfphase
comparison bcftools.py @ 5:86a9d8d5b291 draft default tip
Uploaded
| author | jaredgk |
|---|---|
| date | Wed, 17 Oct 2018 17:34:34 -0400 |
| parents | 3830d29fca6a |
| children |
comparison
equal
deleted
inserted
replaced
| 4:901857c9b24f | 5:86a9d8d5b291 |
|---|---|
| 5 | 5 |
| 6 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared'))) | 6 sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared'))) |
| 7 | 7 |
| 8 from vcf_reader_func import checkFormat | 8 from vcf_reader_func import checkFormat |
| 9 | 9 |
| 10 def return_output_format_args (output_format): | |
| 11 ''' | |
| 12 Return bcftools arguments for output format | |
| 13 | |
| 14 Parameters | |
| 15 ---------- | |
| 16 output_format : str | |
| 17 The specified output format | |
| 18 | |
| 19 Raises | |
| 20 ------ | |
| 21 Exception | |
| 22 If output format is unsupported by bcftools | |
| 23 ''' | |
| 24 | |
| 25 # Return the output format arguments | |
| 26 if output_format == 'vcf': | |
| 27 return ['-O', 'v'] | |
| 28 elif output_format == 'bcf': | |
| 29 return ['-O', 'b'] | |
| 30 elif output_format == 'vcf.gz': | |
| 31 return ['-O', 'z'] | |
| 32 else: | |
| 33 raise Exception('Unsupported file format') | |
| 34 | |
| 10 def check_bcftools_for_errors (bcftools_stderr): | 35 def check_bcftools_for_errors (bcftools_stderr): |
| 11 ''' | 36 ''' |
| 12 Checks the bgzip stderr for errors | 37 Checks the bgzip stderr for errors |
| 13 | 38 |
| 14 Parameters | 39 Parameters |
| 16 bcftools_stderr : str | 41 bcftools_stderr : str |
| 17 bcftools stderr | 42 bcftools stderr |
| 18 | 43 |
| 19 Raises | 44 Raises |
| 20 ------ | 45 ------ |
| 21 IOError | 46 Exception |
| 22 If bcftools stderr returns an error | 47 If bcftools stderr returns an error |
| 23 ''' | 48 ''' |
| 24 | 49 |
| 25 # Expand as errors are discovered | 50 # Expand as errors are discovered |
| 26 if bcftools_stderr: | 51 |
| 27 logging.error(vcftools_stderr) | 52 # Log warning messages |
| 28 raise Exception(vcftools_stderr) | 53 if 'W::' in bcftools_stderr: |
| 29 | 54 logging.warning(bcftools_stderr.strip()) |
| 30 def call_bcftools (bcftools_call_args): | 55 |
| 56 # Report errors that are not warnings | |
| 57 elif bcftools_stderr: | |
| 58 raise Exception(bcftools_stderr) | |
| 59 | |
| 60 def pipe_bcftools (bcftools_call_args): | |
| 61 ''' | |
| 62 Calls bcftools with pipe output | |
| 63 | |
| 64 The output of this function is the stdout and stderr of bcftools. This | |
| 65 function should only be used if bcftools is being used as the stdin of | |
| 66 another function. Please note that this function does not check the for | |
| 67 errors in the bcftools call. Please check for errors after the call is | |
| 68 closed using check_bcftools_for_errors. | |
| 69 | |
| 70 Parameters | |
| 71 ---------- | |
| 72 bcftools_stderr : str | |
| 73 bcftools stderr | |
| 74 | |
| 75 Returns | |
| 76 ------- | |
| 77 bcftools_call : PIPE | |
| 78 Pipe of subprocess call, including both stdout and stderr | |
| 79 | |
| 80 ''' | |
| 31 | 81 |
| 32 # bcftools subprocess call | 82 # bcftools subprocess call |
| 33 bcftools_call = subprocess.Popen(['bcftools'] + list(map(str, bcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE) | 83 bcftools_call = subprocess.Popen(['bcftools'] + list(map(str, bcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE) |
| 34 | 84 |
| 85 return bcftools_call | |
| 86 | |
| 87 def pipe_bcftools_to_chr (vcf_filename): | |
| 88 ''' | |
| 89 Pipes chromosome and/or contig output of bcftools to a list of unique | |
| 90 entries | |
| 91 | |
| 92 The purpose of this function is to return a list of the unique | |
| 93 chromosomes and/or contigs for use in other functions. | |
| 94 | |
| 95 Parameters | |
| 96 ---------- | |
| 97 vcf_filename : str | |
| 98 VCF input | |
| 99 | |
| 100 Returns | |
| 101 ------- | |
| 102 chromosomes_to_return : list | |
| 103 Unique chromosomes and/or contigs within VCF input | |
| 104 ''' | |
| 105 | |
| 106 # Open bcftools pipe | |
| 107 bcftools_call = pipe_bcftools(['query', '-f', '%CHROM\n', vcf_filename]) | |
| 108 | |
| 109 # Create a set to hold unique chromosome | |
| 110 chromosomes_to_return = set() | |
| 111 | |
| 112 try: | |
| 113 | |
| 114 # Current chromosomes/contigs, reduces duplicates if VCF is sorted | |
| 115 previous_chr = None | |
| 116 | |
| 117 # Iterate the bcftools stdout unless error occurs | |
| 118 for bcftools_stdout_line in iter(bcftools_call.stdout.readline, b''): | |
| 119 # Remove the newline character | |
| 120 bcftools_line_chr = bcftools_stdout_line.strip() | |
| 121 # Check if the bcftools bcftools chr is different from stored chr | |
| 122 if bcftools_line_chr != previous_chr: | |
| 123 # Store the new chr for comparisons to reduce duplicates | |
| 124 previous_chr = bcftools_line_chr | |
| 125 # Save the chr | |
| 126 chromosomes_to_return.add(bcftools_line_chr) | |
| 127 | |
| 128 except: | |
| 129 raise Exception('bcftools call error') | |
| 130 | |
| 131 # Close the bcftools stdout | |
| 132 bcftools_call.stdout.close() | |
| 133 | |
| 134 # Wait for bctools to finish | |
| 135 bcftools_call.wait() | |
| 136 | |
| 137 # Read the bcftools stderr | |
| 138 bcftools_stderr = bcftools_call.stderr.read() | |
| 139 | |
| 140 # Check if code is running in python 3 | |
| 141 if sys.version_info[0] == 3: | |
| 142 # Convert bytes to string | |
| 143 bcftools_stderr = bcftools_stderr.decode() | |
| 144 | |
| 145 # Check that the log file was created correctly | |
| 146 check_bcftools_for_errors(bcftools_stderr) | |
| 147 | |
| 148 logging.info('bcftools call complete') | |
| 149 | |
| 150 return list(chromosomes_to_return) | |
| 151 | |
| 152 def call_bcftools (bcftools_call_args): | |
| 153 ''' | |
| 154 Calls bcftools | |
| 155 | |
| 156 The function calls bcftools. | |
| 157 | |
| 158 Parameters | |
| 159 ---------- | |
| 160 bcftools_call_args : list | |
| 161 bcftools arguments | |
| 162 | |
| 163 Returns | |
| 164 ------- | |
| 165 vcftools_err : str | |
| 166 vcftools log output | |
| 167 | |
| 168 Raises | |
| 169 ------ | |
| 170 Exception | |
| 171 If bcftools stderr returns an error | |
| 172 ''' | |
| 173 | |
| 174 # bcftools subprocess call | |
| 175 bcftools_call = subprocess.Popen(['bcftools'] + list(map(str, bcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
| 176 | |
| 35 # Wait for bcftools to finish | 177 # Wait for bcftools to finish |
| 36 bcftools_out, bcftools_err = bcftools_call.communicate() | 178 bcftools_stdout, bcftools_stderr = bcftools_call.communicate() |
| 37 | 179 |
| 38 check_bcftools_for_errors(bcftools_err) | 180 # Check if code is running in python 3 |
| 181 if sys.version_info[0] == 3: | |
| 182 # Convert bytes to string | |
| 183 bcftools_stderr = bcftools_stderr.decode() | |
| 184 | |
| 185 check_bcftools_for_errors(bcftools_stderr) | |
| 39 | 186 |
| 40 logging.info('bcftools call complete') | 187 logging.info('bcftools call complete') |
| 41 | 188 |
| 189 return bcftools_stderr | |
| 190 | |
| 42 def check_for_index (filename): | 191 def check_for_index (filename): |
| 192 ''' | |
| 193 Checks for index file | |
| 194 | |
| 195 If the file is capable of having an index (i.e. bgzipped-VCF or BCF) the | |
| 196 function will return either True (i.e. index found) or False. However, | |
| 197 if the file is a VCF the function will return None (as VCF files cannot | |
| 198 have an index). An error is returned if the file is either a | |
| 199 gzipped-VCF file or not a VCF-based format. | |
| 200 | |
| 201 Parameters | |
| 202 ---------- | |
| 203 filename : str | |
| 204 Filename of VCF-formatted file | |
| 205 | |
| 206 Returns | |
| 207 ------- | |
| 208 bool, None | |
| 209 Returns bool for VCF.GZ and BCF files. Returns None for VCF files | |
| 210 | |
| 211 Raises | |
| 212 ------ | |
| 213 Exception | |
| 214 If the file is a gzipped-VCF or of an unknown format | |
| 215 ''' | |
| 43 | 216 |
| 44 # Assign the file format | 217 # Assign the file format |
| 45 file_format = checkFormat(filename) | 218 file_format = checkFormat(filename) |
| 46 | 219 |
| 47 # Check if the file to be indexed is a vcf.gz | 220 # Check if the file to be indexed is a vcf.gz |
| 54 elif file_format == 'bcf': | 227 elif file_format == 'bcf': |
| 55 # Check if the index (.csi) exists | 228 # Check if the index (.csi) exists |
| 56 if os.path.isfile(filename + '.csi'): | 229 if os.path.isfile(filename + '.csi'): |
| 57 return True | 230 return True |
| 58 | 231 |
| 232 # Check if the file is vcf (does not need an index) | |
| 233 elif file_format == 'vcf': | |
| 234 return None | |
| 235 | |
| 236 # Check if the file is gzip-compressed vcf (cannot have an index) | |
| 237 elif file_format == 'gzip': | |
| 238 raise Exception('GZIP-compressed VCF files do not support index files.') | |
| 239 | |
| 240 # Check if the file is an unknown format | |
| 241 else: | |
| 242 raise Exception('Unknown file format') | |
| 243 | |
| 59 # Return false if no index is found | 244 # Return false if no index is found |
| 60 return False | 245 return False |
| 61 | 246 |
| 247 def delete_index (filename): | |
| 248 ''' | |
| 249 Deletes an index file | |
| 250 | |
| 251 If the file is capable of having an index (i.e. bgzipped-VCF or BCF) | |
| 252 this function will delete the index. However, if the file is either a | |
| 253 VCF or a gzip-compressed VCF the function will return an error. The | |
| 254 function also results in an error if the index cannot be found. This | |
| 255 function should be used following check_for_index. | |
| 256 | |
| 257 Parameters | |
| 258 ---------- | |
| 259 filename : str | |
| 260 Filename of VCF-formatted file | |
| 261 | |
| 262 Raises | |
| 263 ------ | |
| 264 Exception | |
| 265 No index file could be found | |
| 266 Exception | |
| 267 If the file is a gzipped-VCF or a VCF | |
| 268 ''' | |
| 269 | |
| 270 # Assign the file format | |
| 271 file_format = checkFormat(filename) | |
| 272 | |
| 273 # Check if the file to be indexed is a vcf.gz | |
| 274 if file_format == 'bgzip': | |
| 275 # Check if the index (.tbi) exists | |
| 276 if os.path.isfile(filename + '.tbi'): | |
| 277 # Delete the index | |
| 278 os.remove(filename + '.tbi') | |
| 279 return | |
| 280 | |
| 281 # Check if the file to be indexed is a bcf | |
| 282 elif file_format == 'bcf': | |
| 283 # Check if the index (.csi) exists | |
| 284 if os.path.isfile(filename + '.csi'): | |
| 285 # Delete the index | |
| 286 os.remove(filename + '.csi') | |
| 287 return | |
| 288 | |
| 289 # Check if the file is vcf (cannot have an index) | |
| 290 elif file_format == 'vcf': | |
| 291 raise Exception('VCF format does not support index files.') | |
| 292 | |
| 293 # Check if the file is gzip-compressed vcf (cannot have an index) | |
| 294 elif file_format == 'gzip': | |
| 295 raise Exception('GZIP-compressed VCF files do not support index files.') | |
| 296 | |
| 297 # Return error if no index is found | |
| 298 raise Exception('No index file found.') | |
| 299 | |
| 62 def create_index (filename): | 300 def create_index (filename): |
| 301 ''' | |
| 302 Creates an index file | |
| 303 | |
| 304 If the file is capable of having an index (i.e. bgzipped-VCF or BCF) | |
| 305 this function will create an index file. However, if the file is a | |
| 306 different format the function will return an error. | |
| 307 | |
| 308 Parameters | |
| 309 ---------- | |
| 310 filename : str | |
| 311 Filename of VCF-formatted file | |
| 312 | |
| 313 Raises | |
| 314 ------ | |
| 315 Exception | |
| 316 If the file is not a bgzipped-VCF or BCF | |
| 317 ''' | |
| 63 | 318 |
| 64 # Assign the file format | 319 # Assign the file format |
| 65 file_format = checkFormat(filename) | 320 file_format = checkFormat(filename) |
| 66 | 321 |
| 67 # Check if the file to be indexed is a vcf.gz | 322 # Check if the file to be indexed is a vcf.gz |
| 76 | 331 |
| 77 # Report if file cannot be indexed | 332 # Report if file cannot be indexed |
| 78 else: | 333 else: |
| 79 raise Exception('Error creating index for: %s. Only .bcf and .vcf.gz (bgzip) files are supported.' % filename) | 334 raise Exception('Error creating index for: %s. Only .bcf and .vcf.gz (bgzip) files are supported.' % filename) |
| 80 | 335 |
| 81 def convert_to_bcf (filename, output_prefix): | 336 def chr_subset_file (filename, chromosome, output_prefix, output_format, from_bp = None, to_bp = None): |
| 337 ''' | |
| 338 Creates chromosome subset | |
| 339 | |
| 340 This function is used to create a VCF-formatted subset with only | |
| 341 the data from a single chromosome. | |
| 342 | |
| 343 Parameters | |
| 344 ---------- | |
| 345 filename : str | |
| 346 Filename of VCF-formatted input | |
| 347 chromosome : str | |
| 348 Chromosome to subset | |
| 349 output_prefix : str | |
| 350 Prefix of the VCF-formatted output (i.e. without file extension) | |
| 351 output_format : str | |
| 352 The format of the output (e.g. vcf, bcf, vcf.gz) | |
| 353 from_bp : int, optional | |
| 354 Lower bound of sites to include | |
| 355 to_bp : int, optional | |
| 356 Upper bound of sites to include | |
| 357 ''' | |
| 358 | |
| 359 # Creates a list to the arguments and store the bcftools call | |
| 360 subset_args = ['view'] | |
| 361 | |
| 362 # Assign the output format arguments | |
| 363 output_format_args = return_output_format_args(output_format) | |
| 364 | |
| 365 # Store the output format arguments | |
| 366 subset_args.extend(output_format_args) | |
| 367 | |
| 368 # Stores the specified output filename | |
| 369 vcf_output = '%s.%s' % (output_prefix, output_format) | |
| 370 | |
| 371 # Assigns the output file to the arguments | |
| 372 subset_args.extend(['-o', vcf_output]) | |
| 373 | |
| 374 # Holds the subset argument | |
| 375 chr_subet_arg = chromosome | |
| 376 | |
| 377 # Check if either bp position arguments were specified | |
| 378 if from_bp or to_bp: | |
| 379 | |
| 380 # List of the position arguments, in their required order | |
| 381 position_args = [':', from_bp, '-', to_bp] | |
| 382 | |
| 383 # Filter the position arguments to remove empty values | |
| 384 filttered_position_args = filter(None, position_args) | |
| 385 | |
| 386 # Map the arguments to str and add them to the chromosome argument | |
| 387 chr_subet_arg += ''.join(map(str, filttered_position_args)) | |
| 388 | |
| 389 # Checks if the input file has an index, then subset to the arguments | |
| 390 if check_for_index(filename): | |
| 391 # Subsets using the index | |
| 392 subset_args.extend(['-r', chr_subet_arg]) | |
| 393 else: | |
| 394 # Subsets using the stdout | |
| 395 subset_args.extend(['-t', chr_subet_arg]) | |
| 396 | |
| 397 # Assigns the input file to the arguments | |
| 398 subset_args.append(filename) | |
| 399 | |
| 400 # Call bcftools | |
| 401 call_bcftools(subset_args) | |
| 402 | |
| 403 def concatenate (filenames, output_prefix, output_format, keep_original = False): | |
| 404 ''' | |
| 405 Concatenate multiple VCF-formatted files | |
| 406 | |
| 407 This function will concatenate multiple VCF-formatted files into a | |
| 408 single VCF-formatted file of the specifed format. | |
| 409 | |
| 410 Parameters | |
| 411 ---------- | |
| 412 filenames : list | |
| 413 List of VCF-formatted input filenames | |
| 414 output_prefix : str | |
| 415 Prefix of the VCF-formatted output (i.e. without file extension) | |
| 416 output_format : str | |
| 417 The format of the output (e.g. vcf, bcf, vcf.gz) | |
| 418 ''' | |
| 419 | |
| 420 # Holds the arguments to convert to VCF format | |
| 421 concat_args = ['concat'] | |
| 422 | |
| 423 # Assign the output format arguments | |
| 424 output_format_args = return_output_format_args(output_format) | |
| 425 | |
| 426 # Store the output format arguments | |
| 427 concat_args.extend(output_format_args) | |
| 428 | |
| 429 # Stores the specified output filename | |
| 430 vcf_output = '%s.%s' % (output_prefix, output_format) | |
| 431 | |
| 432 # Assigns the output file to the arguments | |
| 433 concat_args.extend(['-o', vcf_output]) | |
| 434 | |
| 435 # Assigns the input files to merge | |
| 436 concat_args.extend(filenames) | |
| 437 | |
| 438 # Call bcftools | |
| 439 call_bcftools(concat_args) | |
| 440 | |
| 441 # Delete the original files once the merged file is created | |
| 442 if not keep_original: | |
| 443 for filename in filenames: | |
| 444 if check_for_index(filename) == True: | |
| 445 delete_index(filename) | |
| 446 os.remove(filename) | |
| 447 | |
| 448 def convert_to_bcf (filename, output_prefix, keep_original = False): | |
| 449 ''' | |
| 450 Converts a VCF-formatted file to BCF | |
| 451 | |
| 452 This function will convert a VCF-formatted file to BCF with the | |
| 453 specified filename prefix. The function also has the option to keep or | |
| 454 delete the input file once the BCF file has been created. | |
| 455 | |
| 456 Parameters | |
| 457 ---------- | |
| 458 filename : str | |
| 459 Filename of VCF-formatted input | |
| 460 output_prefix : str | |
| 461 Prefix of the BCF output (i.e. without file extension) | |
| 462 keep_original : bool, optional | |
| 463 If the input file should be kept once converted | |
| 464 ''' | |
| 82 | 465 |
| 83 # Holds the arguments to convert to BCF format | 466 # Holds the arguments to convert to BCF format |
| 84 convert_args = ['convert', '-O', 'b'] | 467 convert_args = ['convert', '-O', 'b'] |
| 85 | 468 |
| 86 # Stores the specified output_prefix to the BCF file | 469 # Stores the specified output_prefix to the BCF file |
| 93 convert_args.append(filename) | 476 convert_args.append(filename) |
| 94 | 477 |
| 95 # Call bcftools | 478 # Call bcftools |
| 96 call_bcftools(convert_args) | 479 call_bcftools(convert_args) |
| 97 | 480 |
| 98 | 481 # Delete the original file once the bcf file is created |
| 99 def convert_to_vcf (filename, output_prefix): | 482 if not keep_original: |
| 483 if check_for_index(filename) == True: | |
| 484 delete_index(filename) | |
| 485 os.remove(filename) | |
| 486 | |
| 487 def convert_to_vcf (filename, output_prefix, keep_original = False): | |
| 488 ''' | |
| 489 Converts a VCF-formatted file to VCF | |
| 490 | |
| 491 This function will convert a VCF-formatted file to VCF with the | |
| 492 specified filename prefix. The function also has the option to keep or | |
| 493 delete the input file once the VCF file has been created. | |
| 494 | |
| 495 Parameters | |
| 496 ---------- | |
| 497 filename : str | |
| 498 Filename of VCF-formatted input | |
| 499 output_prefix : str | |
| 500 Prefix of the VCF output (i.e. without file extension) | |
| 501 keep_original : bool, optional | |
| 502 If the input file should be kept once converted | |
| 503 ''' | |
| 100 | 504 |
| 101 # Holds the arguments to convert to VCF format | 505 # Holds the arguments to convert to VCF format |
| 102 convert_args = ['view', '-O', 'v'] | 506 convert_args = ['view', '-O', 'v'] |
| 103 | 507 |
| 104 # Stores the specified output_prefix to the VCF file | 508 # Stores the specified output_prefix to the VCF file |
| 111 convert_args.append(filename) | 515 convert_args.append(filename) |
| 112 | 516 |
| 113 # Call bcftools | 517 # Call bcftools |
| 114 call_bcftools(convert_args) | 518 call_bcftools(convert_args) |
| 115 | 519 |
| 116 def convert_to_vcfgz (filename, output_prefix): | 520 # Delete the original file once the vcf file is created |
| 521 if not keep_original: | |
| 522 if check_for_index(filename) == True: | |
| 523 delete_index(filename) | |
| 524 os.remove(filename) | |
| 525 | |
| 526 def convert_to_vcfgz (filename, output_prefix, keep_original = False): | |
| 527 ''' | |
| 528 Converts a VCF-formatted file to bgzipped-VCF | |
| 529 | |
| 530 This function will convert a VCF-formatted file to bgzipped-VCF with the | |
| 531 specified filename prefix. The function also has the option to keep or | |
| 532 delete the input file once the bgzipped-VCF file has been created. | |
| 533 | |
| 534 Parameters | |
| 535 ---------- | |
| 536 filename : str | |
| 537 Filename of VCF-formatted input | |
| 538 output_prefix : str | |
| 539 Prefix of the bgzipped-VCF output (i.e. without file extension) | |
| 540 keep_original : bool, optional | |
| 541 If the input file should be kept once converted | |
| 542 ''' | |
| 117 | 543 |
| 118 # Holds the arguments to convert to VCFGZ format | 544 # Holds the arguments to convert to VCFGZ format |
| 119 convert_args = ['view', '-O', 'z'] | 545 convert_args = ['view', '-O', 'z'] |
| 120 | 546 |
| 121 # Stores the specified output_prefix to the VCFGZ file | 547 # Stores the specified output_prefix to the VCFGZ file |
| 127 # Assigns the specified input to the arguments | 553 # Assigns the specified input to the arguments |
| 128 convert_args.append(filename) | 554 convert_args.append(filename) |
| 129 | 555 |
| 130 # Call bcftools | 556 # Call bcftools |
| 131 call_bcftools(convert_args) | 557 call_bcftools(convert_args) |
| 558 | |
| 559 # Delete the original file once the vcfgz file is created | |
| 560 if not keep_original: | |
| 561 if check_for_index(filename) == True: | |
| 562 delete_index(filename) | |
| 563 os.remove(filename) |
