diff cravat/cravat.py @ 0:9e29dd2972ab draft

Uploaded
author insilicosolutions
date Wed, 13 May 2015 15:24:18 -0400
parents
children c13857bac2c4
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cravat/cravat.py	Wed May 13 15:24:18 2015 -0400
@@ -0,0 +1,104 @@
+"""Annotate variants through the CRAVAT REST query service.
+
+Usage: cravat.py INPUT_FILE OUTPUT_FILE
+
+Each usable line of INPUT_FILE holds whitespace-separated fields:
+
+    ID chromosome position strand reference-base(s) alternate-base(s) [sample]
+
+Each such line is sent to the service at ``query_url`` and one tab-separated
+row per variant is written to OUTPUT_FILE: the input columns first, then the
+fields of the JSON reply in sorted key order.  Lines that do not match
+``chromosome_re`` are reported on stdout and skipped.
+"""
+import sys
+import re
+import requests
+
+# Start of a valid record: ID, chromosome (chr1-chr22, chrX, chrY), position,
+# strand, then reference and alternate bases (A/T/G/C runs or a single '-').
+# Case-insensitive and anchored only at the start of the line, so trailing
+# columns such as the sample ID are accepted.
+chromosome_re = re.compile(r'[0-9a-zA-Z_:]+\s+(chr[1-9]|chr1[0-9]|chr2[0-2]|chr[XY])\s+[0-9]+\s+[+-]\s+([ATGC]+|-)\s+([ATGC]+|-)', re.IGNORECASE)
+
+query_url = 'http://staging.cravat.us/rest/service/query'
+
+first_headers = ['ID',
+                 'Chromosome',
+                 'Position',
+                 'Strand',
+                 'Reference base(s)',
+                 'Alternate base(s)',
+                 'Sample']
+
+
+def is_correct_input_line(line):
+    """Return True if ``line`` starts with a well-formed variant record."""
+    return chromosome_re.match(line) is not None
+
+
+def query(line):
+    """Return the service's decoded JSON reply for one variant.
+
+    ``line`` holds the whitespace-separated variant fields without the ID.
+    They are joined with '_' and sent as the ``mutation`` parameter.
+
+    Raises requests.HTTPError if the service answers with an error status,
+    and ValueError if the reply body is not JSON.
+    """
+    # Built by plain concatenation on purpose: going through ``params=`` would
+    # percent-encode the '+' strand sign and change what the service receives.
+    url = query_url + '?mutation=' + '_'.join(line.split())
+    r = requests.get(url)
+    # Fail loudly instead of writing an error payload out as annotation.
+    r.raise_for_status()
+    return r.json()
+
+
+def _format_value(value):
+    """Render one JSON value as the text of a tab-separated cell."""
+    if value is None:
+        return ''
+    return '%s' % (value,)
+
+
+def main(argv):
+    """Annotate the file named by argv[1] into the file named by argv[2]."""
+    if len(argv) < 3:
+        sys.exit('Usage: %s INPUT_FILE OUTPUT_FILE' % argv[0])
+    input_filename = argv[1]
+    output_filename = argv[2]
+
+    # Annotation columns; fixed by the first reply so every row has the same
+    # layout.  None until that reply has been seen.
+    extra_headers = None
+
+    # ``with`` closes both files even when a query raises.
+    with open(input_filename) as f:
+        with open(output_filename, 'w') as wf:
+            for line in f:
+                if not is_correct_input_line(line):
+                    print('Wrong format line:' + line.rstrip('\r\n'))
+                    continue
+
+                # split() already drops the line terminator.  Slicing off the
+                # last character instead would truncate a final line that has
+                # no trailing newline.
+                toks = line.split()
+                sample_id = toks[6] if len(toks) >= 7 else 'Unknown'
+                annot = query(' '.join(toks[1:]))
+
+                if extra_headers is None:
+                    extra_headers = [h for h in sorted(annot.keys())
+                                     if h not in first_headers]
+                    wf.write('\t'.join(first_headers + extra_headers) + '\n')
+
+                # A reply may lack a key or carry a non-string value; emit an
+                # empty or stringified cell rather than abort the whole run.
+                row = toks[:6] + [sample_id]
+                row.extend(_format_value(annot.get(h)) for h in extra_headers)
+                wf.write('\t'.join(row) + '\n')
+
+
+if __name__ == '__main__':
+    main(sys.argv)