annotate cravat/cravat.py @ 0:9e29dd2972ab draft

Uploaded
author insilicosolutions
date Wed, 13 May 2015 15:24:18 -0400
parents
children c13857bac2c4
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
1 import sys
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
2 import re
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
3 import requests
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
4
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
# Pattern for one input line: an ID, a chromosome (chr1-chr22, chrX or chrY),
# a numeric position, a strand (+ or -), then reference and alternate bases
# (a run of A/T/G/C, or '-'), separated by whitespace.  Matching is
# case-insensitive.  A raw string keeps the regex escapes (\s) out of
# Python's own string-escape processing.
chromosome_re = re.compile(r'[0-9a-zA-Z_:]+\s+(chr[1-9]|chr1[0-9]|chr2[0-2]|chr[XY])\s+[0-9]+\s+[+-]\s+([ATGC]+|-)\s+([ATGC]+|-)', re.IGNORECASE)


def is_correct_input_line(line):
    """Return True if ``line`` starts with a well-formed mutation record.

    The check is anchored at the start of the line only, so additional
    trailing columns after the alternate base(s) are accepted.

    Args:
        line: One line of the input file, with or without its newline.

    Returns:
        True when the line matches ``chromosome_re``, otherwise False.
    """
    return chromosome_re.match(line) is not None
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
12
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
def query(line, timeout=None):
    """Fetch the annotation for one mutation from the query service.

    The whitespace-separated fields of ``line`` are joined with underscores
    and sent as the ``mutation`` parameter of a GET request to the
    module-level ``query_url``.

    Args:
        line: Whitespace-separated mutation fields.
        timeout: Optional timeout in seconds handed to ``requests.get``.
            The default ``None`` waits indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    url = query_url + '?mutation=' + '_'.join(line.split())
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of decoding an error page as if
    # it were an annotation.
    r.raise_for_status()
    return r.json()
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
18
9e29dd2972ab Uploaded
insilicosolutions
parents:
diff changeset
# URL of the REST query endpoint used by query() (hosted at staging.cravat.us).
query_url = 'http://staging.cravat.us/rest/service/query'

# Columns echoed from the input line; they lead every output row.
first_headers = ['ID',
                 'Chromosome',
                 'Position',
                 'Strand',
                 'Reference base(s)',
                 'Alternate base(s)',
                 'Sample']

# Command line: <script> input_file output_file
if len(sys.argv) < 3:
    sys.exit('Usage: ' + sys.argv[0] + ' input_file output_file')
input_filename = sys.argv[1]
output_filename = sys.argv[2]

headers = []
extra_headers = []
header_not_loaded = True

# The input is opened first so that a missing input file does not leave an
# empty output file behind.  The with-blocks close both files even when a
# query or a write raises.
with open(input_filename) as f:
    with open(output_filename, 'w') as wf:
        for line in f:
            # Remove only the line terminator; slicing off the last character
            # would eat real data when the final line has no newline.
            line = line.rstrip('\r\n')
            if not is_correct_input_line(line):
                print('Wrong format line:' + line)
                continue

            toks = line.split()
            # The seventh column, when present, is the sample identifier.
            sample_id = toks[6] if len(toks) >= 7 else 'Unknown'

            # NOTE(review): everything after the ID, including any sample
            # column, is forwarded to the service -- confirm this is intended.
            annot = query(' '.join(toks[1:]))

            if header_not_loaded:
                # The first successful answer defines the annotation columns:
                # its keys, sorted, minus those already in first_headers.
                headers = sorted(annot)
                extra_headers = [h for h in headers if h not in first_headers]
                wf.write('\t'.join(first_headers + extra_headers) + '\n')
                header_not_loaded = False

            cells = toks[:6] + [sample_id]
            for header in extra_headers:
                # Later answers may lack a key or carry non-string JSON
                # values; write an empty cell for missing/null and format
                # everything else as text so the columns stay aligned.
                value = annot.get(header)
                cells.append('' if value is None else '%s' % value)
            wf.write('\t'.join(cells) + '\n')