Mercurial > repos > in_silico > cravat_vcf_convert
view cravat_convert/cravat_convert.py @ 5:7048ccf0ff7b draft
Uploaded
author | in_silico |
---|---|
date | Tue, 12 Jun 2018 11:27:06 -0400 |
parents | b968ba302ba6 |
children |
line wrap: on
line source
''' Convert a VCF format file to Cravat format file '''

import os
import argparse
from vcf_converter import CravatConverter

# File read/write configuration variables
vcf_sep = '\t'
cr_sep = '\t'
cr_newline = '\n'

# VCF Headers mapped to their index position in a row of VCF values
vcf_mapping = {
    'CHROM': 0,
    'POS': 1,
    'ID': 2,
    'REF': 3,
    'ALT': 4,
    'QUAL': 5,
    'FILTER': 6,
    'INFO': 7,
    'FORMAT': 8,
    'NA00001': 9,
    'NA00002': 10,
    'NA00003': 11
}

# The only VCF columns that convert() reads when building a Cravat row.
# Every other entry in vcf_mapping is optional for the conversion.
_required_headers = ('CHROM', 'POS', 'ID', 'REF', 'ALT')


def get_args():
    ''' Parse the command line arguments.

    Returns the argparse namespace with two attributes:
      input  -- path of the VCF file to convert (required)
      output -- path of the Cravat file to write; defaults to
                "cravat_converted.txt" in the current working directory
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        '-i',
                        required = True,
                        help='Input path to a VCF file for conversion',)
    parser.add_argument('--output',
                        '-o',
                        default = os.path.join(os.getcwd(), "cravat_converted.txt"),
                        help = 'Output path to write the cravat file to')
    return parser.parse_args()


def convert(in_path, out_path=None):
    ''' Convert the VCF file at in_path into a Cravat file at out_path.

    in_path  -- path of the VCF file to read
    out_path -- path of the Cravat file to write; when empty or None the
                file "cravat_converted.txt" is written next to in_path

    Lines starting with "#" and blank lines are skipped. One Cravat row is
    written per alternate allele, with the tab separated columns:
    'TR' + running count, chromosome, position, strand, reference,
    alternate, VCF ID.

    Only the CHROM, POS, ID, REF and ALT columns are needed, so files that
    have no FORMAT / sample columns are accepted.

    Raises IndexError when a data line does not contain all of the
    CHROM, POS, ID, REF and ALT columns.
    '''
    if not out_path:
        base, _ = os.path.split(in_path)
        out_path = os.path.join(base, "cravat_converted.txt")

    with open(in_path, 'r') as in_file, \
            open(out_path, 'w') as out_file:

        # cr_count will be used to generate the 'TR' field of the cravat rows (first header)
        cr_count = 0
        # VCF lines are always assumed to be '+' strand, as VCF doesn't specify that attribute
        strand = '+'
        # VCF converter. Adjusts position, reference, and alternate for Cravat formatting.
        converter = CravatConverter()

        for line_number, line in enumerate(in_file, 1):
            if line.startswith("#"):
                continue
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no variant; indexing them would fail.
                continue
            fields = stripped.split(vcf_sep)

            # row is dict of VCF headers mapped to corresponding values of this line.
            # Columns that are absent from this line (e.g. FORMAT and the
            # sample columns) are simply left out instead of raising.
            row = {
                header: fields[index]
                for header, index in vcf_mapping.items()
                if index < len(fields)
            }

            missing = [header for header in _required_headers if header not in row]
            if missing:
                # IndexError keeps the exception type a too-short line
                # has always produced, but now says what is wrong.
                raise IndexError(
                    "VCF line %d has %d column(s); missing required column(s): %s"
                    % (line_number, len(fields), ", ".join(missing))
                )

            # A VCF line may list several comma separated alternates;
            # each one becomes its own Cravat row.
            for alt in row["ALT"].split(","):
                new_pos, new_ref, new_alt = converter.extract_vcf_variant(
                    strand, row["POS"], row["REF"], alt)
                new_pos, new_ref, new_alt = str(new_pos), str(new_ref), str(new_alt)
                cr_line = cr_sep.join([
                    'TR' + str(cr_count),
                    row['CHROM'],
                    new_pos,
                    strand,
                    new_ref,
                    new_alt,
                    row['ID']
                ])
                out_file.write(cr_line + cr_newline)
                cr_count += 1


if __name__ == "__main__":
    cli_args = get_args()
    convert(cli_args.input, cli_args.output)