Mercurial > repos > john-mccallum > pcr_markers
comparison vcf2gvf.py @ 5:b321e0517be3 draft
Uploaded
author | ben-warren |
---|---|
date | Thu, 22 May 2014 20:30:19 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
4:be070a68521e | 5:b321e0517be3 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Usage vcf2gvf.py <vcf_file input> <gff_file output> | |
4 """ | |
5 import sys,re | |
6 | |
7 ##simplify the variant descriptor for GVF | |
8 def get_vartype(type_str): | |
9 type_str=type_str.upper() | |
10 for V in ['COMPLEX', 'MNP', 'DEL', 'INDEL','INS','SNP']: | |
11 if V in type_str: | |
12 return V | |
13 break | |
14 | |
15 | |
16 | |
17 in_vcf_file = open(sys.argv[1], 'r') | |
18 out_gff_file = open(sys.argv[2], 'w') | |
19 out_gff_file.write("#gff-version 3\n") ####gvf-version 1.06 | |
20 | |
21 ##only deal with tidy source lines, samtools is special needs | |
22 source_pat = re.compile('##source=(\S+)', re.IGNORECASE) | |
23 | |
24 | |
25 while True: | |
26 line=in_vcf_file.readline() | |
27 if line[0]!='#': | |
28 break | |
29 elif line[2:10]=='samtools': | |
30 source_type='SAMTOOLS' | |
31 break | |
32 else: | |
33 m=source_pat.match(line) | |
34 if m: | |
35 source_type=m.group(1).upper() | |
36 break | |
37 | |
38 | |
39 ##now read the data | |
40 ##samtools only distinguishes SNPs and indels | |
41 ##This is much complicated in highly heterozygous systems | |
42 ##see http://www.sequenceontology.org/miso/current_release/term/SO:0001059 | |
43 ##GVF column 3 requires a type from this ontology | |
44 ###vcf type may be snp, mnp, ins, del, or complex. | |
45 ##for current purposes map multiple types to most complex state | |
46 ##ie SNP,SNP -> SNP; SNP,MNP-> MNP; SNP,complex | |
47 try: | |
48 for line in in_vcf_file: | |
49 if line[0] != '#': | |
50 var=line.split() ##see if var[7] has indel at start. If so add TYPE=INDEL for samtools case | |
51 info_field=var[7].split(';') | |
52 if info_field[0]=='INDEL': | |
53 info_field[0]="TYPE=INDEL" | |
54 info_dict=dict(X.split('=') for X in info_field) | |
55 ##if no TYPE key, then add TYPE=SNP for samtools case | |
56 if not info_dict.has_key('TYPE'): | |
57 info_dict['TYPE']='SNP'; | |
58 var_type=get_vartype(info_dict['TYPE']) | |
59 ID=":".join([var[0],source_type,var_type,var[1]]) | |
60 start=int(var[1]) | |
61 length=len(var[3]) ## using reference length in this case | |
62 attributes=";".join(['ID='+ID,'Reference_seq='+var[3],'Variant_seq='+var[4]]) | |
63 output_line=[var[0], source_type, var_type, str(start), str(start+length-1) ,'.','.','.',attributes,"\n"] | |
64 out_gff_file.write("\t".join([str(X) for X in output_line])) | |
65 finally: | |
66 in_vcf_file.close() | |
67 out_gff_file.close() | |
68 |