annotate vcf2gvf.py @ 6:f201e8c6e004 draft default tip

Uploaded
author ben-warren
date Mon, 07 Jul 2014 19:28:17 -0400
parents b321e0517be3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
1 #!/usr/bin/env python
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
2 """
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
3 Usage vcf2gvf.py <vcf_file input> <gff_file output>
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
4 """
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
5 import sys,re
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
6
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
7 ##simplify the variant descriptor for GVF
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
8 def get_vartype(type_str):
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
9 type_str=type_str.upper()
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
10 for V in ['COMPLEX', 'MNP', 'DEL', 'INDEL','INS','SNP']:
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
11 if V in type_str:
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
12 return V
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
13 break
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
14
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
15
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
16
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
17 in_vcf_file = open(sys.argv[1], 'r')
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
18 out_gff_file = open(sys.argv[2], 'w')
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
19 out_gff_file.write("#gff-version 3\n") ####gvf-version 1.06
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
20
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
21 ##only deal with tidy source lines, samtools is special needs
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
22 source_pat = re.compile('##source=(\S+)', re.IGNORECASE)
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
23
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
24
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
25 while True:
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
26 line=in_vcf_file.readline()
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
27 if line[0]!='#':
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
28 break
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
29 elif line[2:10]=='samtools':
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
30 source_type='SAMTOOLS'
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
31 break
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
32 else:
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
33 m=source_pat.match(line)
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
34 if m:
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
35 source_type=m.group(1).upper()
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
36 break
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
37
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
38
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
39 ##now read the data
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
40 ##samtools only distinguishes SNPs and indels
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
41 ##This is much complicated in highly heterozygous systems
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
42 ##see http://www.sequenceontology.org/miso/current_release/term/SO:0001059
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
43 ##GVF column 3 requires a type from this ontology
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
44 ###vcf type may be snp, mnp, ins, del, or complex.
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
45 ##for current purposes map multiple types to most complex state
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
46 ##ie SNP,SNP -> SNP; SNP,MNP-> MNP; SNP,complex
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
47 try:
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
48 for line in in_vcf_file:
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
49 if line[0] != '#':
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
50 var=line.split() ##see if var[7] has indel at start. If so add TYPE=INDEL for samtools case
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
51 info_field=var[7].split(';')
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
52 if info_field[0]=='INDEL':
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
53 info_field[0]="TYPE=INDEL"
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
54 info_dict=dict(X.split('=') for X in info_field)
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
55 ##if no TYPE key, then add TYPE=SNP for samtools case
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
56 if not info_dict.has_key('TYPE'):
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
57 info_dict['TYPE']='SNP';
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
58 var_type=get_vartype(info_dict['TYPE'])
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
59 ID=":".join([var[0],source_type,var_type,var[1]])
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
60 start=int(var[1])
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
61 length=len(var[3]) ## using reference length in this case
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
62 attributes=";".join(['ID='+ID,'Reference_seq='+var[3],'Variant_seq='+var[4]])
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
63 output_line=[var[0], source_type, var_type, str(start), str(start+length-1) ,'.','.','.',attributes,"\n"]
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
64 out_gff_file.write("\t".join([str(X) for X in output_line]))
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
65 finally:
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
66 in_vcf_file.close()
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
67 out_gff_file.close()
b321e0517be3 Uploaded
ben-warren
parents:
diff changeset
68