Mercurial > repos > galaxyp > cravatool
comparison cravatp_submit.py @ 1:2c7bcc1219fc draft
Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
| author | galaxyp |
|---|---|
| date | Thu, 16 Aug 2018 12:27:35 -0400 |
| parents | |
| children | a018c44dc18b |
comparison
equal
deleted
inserted
replaced
| 0:83181dabeb90 | 1:2c7bcc1219fc |
|---|---|
| 1 # -*- coding: utf-8 -*- | |
| 2 # | |
| 3 # Author: Ray W. Sajulga | |
| 4 # | |
| 5 # | |
| 6 | |
| 7 import requests # pipenv requests | |
| 8 import json | |
| 9 import time | |
| 10 import urllib | |
| 11 import sys | |
| 12 import csv | |
| 13 import re | |
| 14 import math | |
| 15 import argparse | |
| 16 from xml.etree import ElementTree as ET | |
| 17 from zipfile import ZipFile | |
| 18 try: #Python 3 | |
| 19 from urllib.request import urlopen | |
| 20 except ImportError: #Python 2 | |
| 21 from urllib2 import urlopen | |
| 22 from io import BytesIO | |
| 23 | |
| 24 # initializes blank parameters | |
| 25 chasm_classifier = '' | |
| 26 probed_filename = None | |
| 27 intersected_only = False | |
| 28 vcf_output = None | |
| 29 analysis_type = None | |
| 30 | |
| 31 # # Testing Command | |
| 32 # python cravatp_submit.py test-data/Freebayes_two-variants.vcf GRCh38 | |
| 33 # test-data/variant.tsv test-data/gene.tsv test-data/noncoding.tsv | |
| 34 # test-data/error.tsv CHASM -—classifier Breast -—proBED | |
| 35 # test-data/MCF7_proBed.bed | |
| 36 parser = argparse.ArgumentParser() | |
| 37 parser.add_argument('cravatInput',help='The filename of the input ' | |
| 38 'CRAVAT-formatted tabular file ' | |
| 39 '(e.g., VCF)') | |
| 40 parser.add_argument('GRCh', help='The name of the human reference ' | |
| 41 'genome used for annotation: ' | |
| 42 'GRCh38/hg38 or GRCh37/hg19') | |
| 43 parser.add_argument('variant', help='The filename of the output ' | |
| 44 'variant file') | |
| 45 parser.add_argument('gene', help='The filename of the output gene ' | |
| 46 'variant report') | |
| 47 parser.add_argument('noncoding', help='The filename of the output ' | |
| 48 'non-coding variant report') | |
| 49 parser.add_argument('error', help='The filename of the output error ' | |
| 50 'file') | |
| 51 parser.add_argument('analysis', help='The machine-learning algorithm ' | |
| 52 'used for CRAVAT annotation (VEST' | |
| 53 ' and/or CHASM)') | |
| 54 parser.add_argument('--classifier', help='The cancer classifier for the' | |
| 55 ' CHASM algorithm') | |
| 56 parser.add_argument('--proBED', help='The filename of the proBED file ' | |
| 57 'containing peptides with genomic ' | |
| 58 'coordinates') | |
| 59 parser.add_argument('--intersectOnly', help='Specifies whether to ' | |
| 60 'analyze only variants ' | |
| 61 'intersected between the ' | |
| 62 'CRAVAT input and proBED ' | |
| 63 'file') | |
| 64 parser.add_argument('--vcfOutput', help='The output filename of the ' | |
| 65 'intersected VCF file') | |
| 66 | |
| 67 # assigns parsed arguments to appropriate variables | |
| 68 args = parser.parse_args() | |
| 69 input_filename = args.cravatInput | |
| 70 GRCh_build = args.GRCh | |
| 71 output_filename = args.variant | |
| 72 file_3 = args.gene | |
| 73 file_4 = args.noncoding | |
| 74 file_5 = args.error | |
| 75 if args.analysis != 'None': | |
| 76 analysis_type = args.analysis | |
| 77 if args.classifier: | |
| 78 chasm_classifier = args.classifier | |
| 79 if args.proBED: | |
| 80 probed_filename = args.proBED | |
| 81 if args.intersectOnly: | |
| 82 intersected_only = args.intersectOnly | |
| 83 if args.vcfOutput: | |
| 84 vcf_output = args.vcfOutput | |
| 85 | |
| 86 if analysis_type and '+' in analysis_type: | |
| 87 analysis_type = 'CHASM;VEST' | |
| 88 | |
| 89 # obtains the transcript's protein sequence using Ensembl API | |
| 90 def getSequence(transcript_id): | |
| 91 server = 'http://rest.ensembl.org' | |
| 92 ext = ('/sequence/id/' + transcript_id | |
| 93 + '?content-type=text/x-seqxml%2Bxml;' | |
| 94 'multiple_sequences=1;type=protein') | |
| 95 req = requests.get(server+ext, | |
| 96 headers={ "Content-Type" : "text/plain"}) | |
| 97 | |
| 98 if not req.ok: | |
| 99 return None | |
| 100 | |
| 101 root = ET.fromstring(req.content) | |
| 102 for child in root.iter('AAseq'): | |
| 103 return child.text | |
| 104 | |
| 105 # parses the proBED file as a list. | |
| 106 def loadProBED(): | |
| 107 proBED = [] | |
| 108 with open(probed_filename) as tsvin: | |
| 109 tsvreader = csv.reader(tsvin, delimiter='\t') | |
| 110 for i, row in enumerate(tsvreader): | |
| 111 proBED.append(row) | |
| 112 return proBED | |
| 113 | |
| 114 write_header = True | |
| 115 | |
| 116 | |
| 117 # Creates an VCF file that only contains variants that overlap with the | |
| 118 # proteogenomic input (proBED) file if the user specifies that they want | |
| 119 # to only include intersected variants or if they want to receive the | |
| 120 # intersected VCF as well. | |
| 121 if probed_filename and (vcf_output or intersected_only == 'true'): | |
| 122 proBED = loadProBED() | |
| 123 if not vcf_output: | |
| 124 vcf_output = 'intersected_input.vcf' | |
| 125 with open(input_filename) as tsvin, open(vcf_output, 'wb') as tsvout: | |
| 126 tsvreader = csv.reader(tsvin, delimiter='\t') | |
| 127 tsvout = csv.writer(tsvout, delimiter='\t', escapechar=' ', | |
| 128 quoting=csv.QUOTE_NONE) | |
| 129 | |
| 130 for row in tsvreader: | |
| 131 if row == [] or row[0][0] == '#': | |
| 132 tsvout.writerow(row) | |
| 133 else: | |
| 134 genchrom = row[0] | |
| 135 genpos = int(row[1]) | |
| 136 | |
| 137 for peptide in proBED: | |
| 138 pepchrom = peptide[0] | |
| 139 pepposA = int(peptide[1]) | |
| 140 pepposB = int(peptide[2]) | |
| 141 if (genchrom == pepchrom and | |
| 142 pepposA <= genpos and | |
| 143 genpos <= pepposB): | |
| 144 tsvout.writerow(row) | |
| 145 break | |
| 146 if intersected_only == 'true': | |
| 147 input_filename = vcf_output | |
| 148 | |
| 149 # sets up the parameters for submission to the CRAVAT API | |
| 150 parameters = {'email':'rsajulga@umn.edu', | |
| 151 'hg19': 'on' if GRCh_build == 'GRCh37' else 'off', | |
| 152 'functionalannotation': 'on', | |
| 153 'tsvreport' : 'on', | |
| 154 'mupitinput' : 'on'} | |
| 155 if analysis_type: | |
| 156 parameters['analyses'] = analysis_type | |
| 157 if chasm_classifier: | |
| 158 parameters['chasmclassifier'] = chasm_classifier | |
| 159 | |
| 160 # plugs in params to given URL | |
| 161 submit = requests.post('http://www.cravat.us/CRAVAT/rest/service/submit', | |
| 162 files = {'inputfile':open(input_filename)}, | |
| 163 data = parameters) | |
| 164 | |
| 165 # makes the data a json dictionary; takes out only the job ID | |
| 166 jobid = json.loads(submit.text)['jobid'] | |
| 167 | |
| 168 # loops until we find a status equal to Success, then breaks | |
| 169 while True: | |
| 170 check = requests.get( | |
| 171 'http://www.cravat.us/CRAVAT/rest/service/status', | |
| 172 params = {'jobid' : jobid}) | |
| 173 status = json.loads(check.text)['status'] | |
| 174 resultfileurl = json.loads(check.text)['resultfileurl'] | |
| 175 #out_file.write(str(status) + ', ') | |
| 176 if status == 'Success': | |
| 177 #out_file.write('\t' + resultfileurl) | |
| 178 break | |
| 179 else: | |
| 180 time.sleep(2) | |
| 181 | |
| 182 # obtains the zipfile created by CRAVAT and loads the variants and VAD | |
| 183 # file for processing | |
| 184 r = requests.get(resultfileurl, stream=True) | |
| 185 url = urlopen(resultfileurl) | |
| 186 zipfile = ZipFile(BytesIO(r.content)) | |
| 187 variants = zipfile.open(jobid + '/Variant.Result.tsv').readlines() | |
| 188 vad = zipfile.open(jobid + '/Variant_Additional_Details.Result.tsv').readlines() | |
| 189 | |
| 190 # reads and writes the gene, noncoding, and error files | |
| 191 open(file_3, 'wb').write(zipfile.read(jobid + '/Gene_Level_Analysis.Result.tsv')) | |
| 192 open(file_4, 'wb').write(zipfile.read(jobid + '/Variant_Non-coding.Result.tsv')) | |
| 193 open(file_5, 'wb').write(zipfile.read(jobid + '/Input_Errors.Result.tsv')) | |
| 194 | |
| 195 | |
| 196 | |
| 197 if probed_filename and not vcf_output: | |
| 198 proBED = loadProBED() | |
| 199 | |
| 200 if probed_filename: | |
| 201 with open(output_filename, 'w') as tsvout: | |
| 202 tsvout = csv.writer(tsvout, | |
| 203 delimiter='\t', | |
| 204 escapechar=' ', | |
| 205 quoting=csv.QUOTE_NONE) | |
| 206 n = 11 #Index for proteogenomic column start | |
| 207 reg_seq_change = re.compile('([A-Z]+)(\d+)([A-Z]+)') | |
| 208 SOtranscripts = re.compile('([A-Z]+[\d\.]+):([A-Z]+\d+[A-Z]+)') | |
| 209 pep_muts = {} | |
| 210 pep_map = {} | |
| 211 rows = [] | |
| 212 for row in vad: | |
| 213 row = row.decode().split('\t') | |
| 214 row[-1] = row[-1].replace('\n','') | |
| 215 if row and row[0] and not row[0].startswith('#'): | |
| 216 # checks if the row begins with input line | |
| 217 if row[0].startswith('Input line'): | |
| 218 vad_headers = row | |
| 219 | |
| 220 else: | |
| 221 # Initially screens through the output Variant | |
| 222 # Additional Details to catch mutations on | |
| 223 # same peptide region | |
| 224 genchrom = row[vad_headers.index('Chromosome')] | |
| 225 genpos = int(row[vad_headers.index('Position')]) | |
| 226 aa_change = row[vad_headers.index('Protein sequence change')] | |
| 227 input_line = row[vad_headers.index('Input line')] | |
| 228 | |
| 229 for peptide in proBED: | |
| 230 pepseq = peptide[3] | |
| 231 pepchrom = peptide[0] | |
| 232 pepposA = int(peptide[1]) | |
| 233 pepposB = int(peptide[2]) | |
| 234 if genchrom == pepchrom and pepposA <= genpos and genpos <= pepposB: | |
| 235 strand = row[vad_headers.index('Strand')] | |
| 236 transcript_strand = row[vad_headers.index('S.O. transcript strand')] | |
| 237 | |
| 238 # Calculates the position of the variant | |
| 239 # amino acid(s) on peptide | |
| 240 if transcript_strand == strand: | |
| 241 aa_peppos = int(math.ceil((genpos - pepposA)/3.0) - 1) | |
| 242 if (strand == '-' or transcript_strand == '-' | |
| 243 or aa_peppos >= len(pepseq)): | |
| 244 aa_peppos = int(math.floor((pepposB - genpos)/3.0)) | |
| 245 if pepseq in pep_muts: | |
| 246 if aa_change not in pep_muts[pepseq]: | |
| 247 pep_muts[pepseq][aa_change] = [aa_peppos] | |
| 248 else: | |
| 249 if aa_peppos not in pep_muts[pepseq][aa_change]: | |
| 250 pep_muts[pepseq][aa_change].append(aa_peppos) | |
| 251 else: | |
| 252 pep_muts[pepseq] = {aa_change : [aa_peppos]} | |
| 253 # Stores the intersect information by mapping | |
| 254 # Input Line (CRAVAT output) to peptide sequence. | |
| 255 if input_line in pep_map: | |
| 256 if pepseq not in pep_map[input_line]: | |
| 257 pep_map[input_line].append(pepseq) | |
| 258 else: | |
| 259 pep_map[input_line] = [pepseq] | |
| 260 # TODO: Need to obtain strand information as | |
| 261 # well i.e., positive (+) or negative (-) | |
| 262 | |
| 263 | |
| 264 with open(output_filename, 'w') as tsvout: | |
| 265 tsvout = csv.writer(tsvout, delimiter='\t', escapechar='', | |
| 266 quoting=csv.QUOTE_NONE) | |
| 267 headers = [] | |
| 268 duplicate_indices = [] | |
| 269 | |
| 270 # loops through each row in the Variant Additional Details (VAD) file | |
| 271 for x, row in enumerate(variants): | |
| 272 row = row.decode().split('\t') | |
| 273 row[-1] = row[-1].replace('\n','') | |
| 274 # sets row_2 equal to the same row in Variant Result (VR) file | |
| 275 row_2 = vad[x].decode().split('\t') | |
| 276 row_2[-1] = row_2[-1].replace('\n','') | |
| 277 | |
| 278 # checks if row is empty or if the first term contains '#' | |
| 279 if not row or not row[0] or row[0].startswith('#'): | |
| 280 if row[0]: | |
| 281 tsvout.writerow(row) | |
| 282 else: | |
| 283 if row[0].startswith('Input line'): | |
| 284 # goes through each value in the headers list in VAD | |
| 285 headers = row | |
| 286 # loops through the Keys in VR | |
| 287 for i,value in enumerate(row_2): | |
| 288 #Checks if the value is already in headers | |
| 289 if value in headers: | |
| 290 duplicate_indices.append(i) | |
| 291 continue | |
| 292 #else adds the header to headers | |
| 293 else: | |
| 294 headers.append(value) | |
| 295 # adds appropriate headers when proteomic input is supplied | |
| 296 if probed_filename: | |
| 297 headers.insert(n, 'Variant peptide') | |
| 298 headers.insert(n, 'Reference peptide') | |
| 299 tsvout.writerow(headers) | |
| 300 else: | |
| 301 cells = [] | |
| 302 # goes through each value in the next list | |
| 303 for value in row: | |
| 304 #adds it to cells | |
| 305 cells.append(value) | |
| 306 # goes through each value from the VR file after position | |
| 307 # 11 (After it is done repeating from VAD file) | |
| 308 for i,value in enumerate(row_2): | |
| 309 # adds in the rest of the values to cells | |
| 310 if i not in duplicate_indices: | |
| 311 # Skips the initial 11 columns and the | |
| 312 # VEST p-value (already in VR file) | |
| 313 cells.append(value) | |
| 314 | |
| 315 # Verifies the peptides intersected previously through | |
| 316 # sequences obtained from Ensembl's API | |
| 317 if probed_filename: | |
| 318 cells.insert(n,'') | |
| 319 cells.insert(n,'') | |
| 320 input_line = cells[headers.index('Input line')] | |
| 321 if input_line in pep_map: | |
| 322 pepseq = pep_map[input_line][0] | |
| 323 aa_changes = pep_muts[pepseq] | |
| 324 transcript_id = cells[headers.index('S.O. transcript')] | |
| 325 ref_fullseq = getSequence(transcript_id) | |
| 326 # Checks the other S.O. transcripts if the primary | |
| 327 # S.O. transcript has no sequence available | |
| 328 if not ref_fullseq: | |
| 329 transcripts = cells[headers.index('S.O. all transcripts')] | |
| 330 for transcript in transcripts.split(','): | |
| 331 if transcript: | |
| 332 mat = SOtranscripts.search(transcript) | |
| 333 ref_fullseq = getSequence(mat.group(1)) | |
| 334 if ref_fullseq: | |
| 335 aa_changes = {mat.group(2): [aa_changes.values()[0][0]]} | |
| 336 break | |
| 337 # Resubmits the previous transcripts without | |
| 338 # extensions if all S.O. transcripts fail to | |
| 339 # provide a sequence | |
| 340 if not ref_fullseq: | |
| 341 transcripts = cells[headers.index('S.O. all transcripts')] | |
| 342 for transcript in transcripts.split(','): | |
| 343 if transcript: | |
| 344 mat = SOtranscripts.search(transcript) | |
| 345 ref_fullseq = getSequence(mat.group(1).split('.')[0]) | |
| 346 if ref_fullseq: | |
| 347 aa_changes = {mat.group(2): [aa_changes.values()[0][0]]} | |
| 348 break | |
| 349 if ref_fullseq: | |
| 350 # Sorts the amino acid changes | |
| 351 positions = {} | |
| 352 for aa_change in aa_changes: | |
| 353 m = reg_seq_change.search(aa_change) | |
| 354 aa_protpos = int(m.group(2)) | |
| 355 aa_peppos = aa_changes[aa_change][0] | |
| 356 aa_startpos = aa_protpos - aa_peppos - 1 | |
| 357 if aa_startpos in positions: | |
| 358 positions[aa_startpos].append(aa_change) | |
| 359 else: | |
| 360 positions[aa_startpos] = [aa_change] | |
| 361 # Goes through the sorted categories to mutate the Ensembl peptide | |
| 362 # (uses proBED peptide as a reference) | |
| 363 for pep_protpos in positions: | |
| 364 ref_seq = ref_fullseq[pep_protpos:pep_protpos+len(pepseq)] | |
| 365 muts = positions[pep_protpos] | |
| 366 options = [] | |
| 367 mut_seq = ref_seq | |
| 368 for mut in muts: | |
| 369 m = reg_seq_change.search(mut) | |
| 370 ref_aa = m.group(1) | |
| 371 mut_pos = int(m.group(2)) | |
| 372 alt_aa = m.group(3) | |
| 373 pep_mutpos = mut_pos - pep_protpos - 1 | |
| 374 if (ref_seq[pep_mutpos] == ref_aa | |
| 375 and (pepseq[pep_mutpos] == alt_aa | |
| 376 or pepseq[pep_mutpos] == ref_aa)): | |
| 377 if pepseq[pep_mutpos] == ref_aa: | |
| 378 mut_seq = (mut_seq[:pep_mutpos] + ref_aa | |
| 379 + mut_seq[pep_mutpos+1:]) | |
| 380 else: | |
| 381 mut_seq = (mut_seq[:pep_mutpos] + alt_aa | |
| 382 + mut_seq[pep_mutpos+1:]) | |
| 383 else: | |
| 384 break | |
| 385 # Adds the mutated peptide and reference peptide if mutated correctly | |
| 386 if pepseq == mut_seq: | |
| 387 cells[n+1] = pepseq | |
| 388 cells[n] = ref_seq | |
| 389 tsvout.writerow(cells) | |
| 390 |
