Mercurial > repos > in_silico > cravat_vcf_convert
view cravat_convert/cravat_convert.py @ 20:a0b39b857c7a draft
Uploaded
author | in_silico |
---|---|
date | Wed, 18 Jul 2018 10:17:38 -0400 |
parents | |
children |
line wrap: on
line source
''' Convert a VCF format file to Cravat format file '''

# __future__ imports must precede every other statement except the module
# docstring; placing them after regular imports is a SyntaxError.
from __future__ import print_function

import argparse
import os

from vcf_converter import CravatConverter


def get_vcf_mapping():
    """ : VCF Headers mapped to their index position in a row of VCF values.
        : These are only the mandatory columns, per the VCF spec.
    """
    return {
        'CHROM': 0,
        'POS': 1,
        'ID': 2,
        'REF': 3,
        'ALT': 4,
        'QUAL': 5,
        'FILTER': 6,
        'INFO': 7
    }


def get_args():
    """ : Parse the command line arguments.
        : Returns:
        :   the argparse namespace, with attributes 'input' (required) and
        :   'output' (None when not given).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        '-i',
                        required=True,
                        help='Input path to a VCF file for conversion',)
    parser.add_argument('--output',
                        '-o',
                        default=None,
                        help='Output path to write the cravat file to')
    return parser.parse_args()


def convert(in_path, out_path=None, cr_sep='\t', cr_newline='\n'):
    """ : Convert a VCF file to a Cravat file.
        : Arguments:
        :   in_path: <str> path to input vcf file
        :   out_path: <str> path to output cravat file. Will default to
        :       cravat_converted.txt in the input directory.
        :   cr_sep: <str> the value delimiter for the output cravat file.
        :       Default value of '\\t'.
        :   cr_newline: <str> the newline delimiter in the output cravat file.
        :       Default of '\\n'
        : One cravat row is written per ALT allele of every VCF data line.
        : Lines starting with '#' and blank lines are skipped.
    """
    if not out_path:
        out_path = os.path.join(os.path.dirname(in_path),
                                "cravat_converted.txt")

    with open(in_path, 'r') as in_file, \
            open(out_path, 'w') as out_file:

        # cr_count will be used to generate the 'TR' field of the cravat rows
        # (first header)
        cr_count = 0
        # VCF lines are always assumed to be '+' strand, as VCF doesn't
        # specify that attribute
        strand = '+'
        # VCF converter. Adjusts position, reference, and alternate for
        # Cravat formatting.
        converter = CravatConverter()
        # A dictionary of mandatory vcf headers mapped to their row indices
        vcf_mapping = get_vcf_mapping()

        for line in in_file:
            if line.startswith("#"):
                continue
            fields = line.strip().split()
            if not fields:
                # Blank (or whitespace-only) line, e.g. a trailing newline at
                # the end of the file; indexing into it would raise
                # IndexError.
                continue
            # row is dict of VCF headers mapped to corresponding values of
            # this line
            row = {header: fields[index]
                   for header, index in vcf_mapping.items()}
            # A single VCF line may list several comma-separated alternate
            # alleles; each one becomes its own cravat row.
            for alt in row["ALT"].split(","):
                new_pos, new_ref, new_alt = converter.extract_vcf_variant(
                    strand, row["POS"], row["REF"], alt)
                cr_line = cr_sep.join([
                    'TR' + str(cr_count),
                    row['CHROM'],
                    str(new_pos),
                    strand,
                    str(new_ref),
                    str(new_alt),
                    row['ID']
                ])
                out_file.write(cr_line + cr_newline)
                cr_count += 1


def main():
    """ : Command line entry point: parse arguments and run the conversion. """
    cli_args = get_args()
    # convert() already falls back to cravat_converted.txt next to the input
    # file when no output path is given, so the value is passed straight on.
    convert(in_path=cli_args.input, out_path=cli_args.output)


if __name__ == "__main__":
    main()