annotate cravatp_submit.py @ 3:a018c44dc18b draft default tip

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/cravatp_score_and_annotate commit d80e60ce74aabe64e131d560085af099d52b81cf-dirty
author galaxyp
date Fri, 07 Sep 2018 16:53:05 -0400
parents 2c7bcc1219fc
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
1 # -*- coding: utf-8 -*-
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
2 #
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
3 # Author: Ray W. Sajulga
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
4 #
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
5 #
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
6
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
7 import requests # pipenv requests
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
8 import json
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
9 import time
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
10 import urllib
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
11 import sys
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
12 import csv
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
13 import re
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
14 import math
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
15 import argparse
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
16 from xml.etree import ElementTree as ET
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
17 from zipfile import ZipFile
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
18 try: #Python 3
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
19 from urllib.request import urlopen
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
20 except ImportError: #Python 2
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
21 from urllib2 import urlopen
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
22 from io import BytesIO
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
23
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# Defaults for the optional settings. They keep these values unless the
# corresponding command-line argument is supplied.
chasm_classifier = ''
all_intersect = False
probed_filename = vcf_output = analysis_type = None
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
30
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
31 # # Testing Command
3
a018c44dc18b planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/cravatp_score_and_annotate commit d80e60ce74aabe64e131d560085af099d52b81cf-dirty
galaxyp
parents: 1
diff changeset
32 # python cravatp_submit.py test-data/Freebayes_two-variants.vcf GRCh38 test-data/variant.tsv test-data/gene.tsv test-data/noncoding.tsv test-data/error.tsv CHASM --classifier Breast --proBED test-data/MCF7_proBed.bed
1
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# Command-line interface: seven required positional arguments followed by
# four optional flags.  Each entry pairs the argument name with its help text.
parser = argparse.ArgumentParser()
_ARGUMENT_TABLE = (
    ('cravatInput',
     'The filename of the input CRAVAT-formatted tabular file (e.g., VCF)'),
    ('GRCh',
     'The name of the human reference genome used for annotation: '
     'GRCh38/hg38 or GRCh37/hg19'),
    ('variant', 'The filename of the output variant file'),
    ('gene', 'The filename of the output gene variant report'),
    ('noncoding', 'The filename of the output non-coding variant report'),
    ('error', 'The filename of the output error file'),
    ('analysis',
     'The machine-learning algorithm used for CRAVAT annotation '
     '(VEST and/or CHASM)'),
    ('--classifier', 'The cancer classifier for the CHASM algorithm'),
    ('--proBED',
     'The filename of the proBED file containing peptides with genomic '
     'coordinates'),
    ('--allIntersect', 'Specifies whether to analyze all variants'),
    ('--vcfOutput', 'The output filename of the intersected VCF file'),
)
for _argument_name, _argument_help in _ARGUMENT_TABLE:
    parser.add_argument(_argument_name, help=_argument_help)
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
60
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# Read the command line, then copy every value into a module-level name.
args = parser.parse_args()

# Required positional arguments.
input_filename, GRCh_build = args.cravatInput, args.GRCh
output_filename = args.variant
file_3, file_4, file_5 = args.gene, args.noncoding, args.error

# The literal text 'None' leaves the analysis type at its default.
if args.analysis != 'None':
    analysis_type = args.analysis

# Optional flags replace the defaults only when they carry a truthy value.
chasm_classifier = args.classifier or chasm_classifier
probed_filename = args.proBED or probed_filename
all_intersect = args.allIntersect or all_intersect
vcf_output = args.vcfOutput or vcf_output

# Any analysis value containing '+' is rewritten to the 'CHASM;VEST' form.
if '+' in (analysis_type or ''):
    analysis_type = 'CHASM;VEST'
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
82
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# obtains the transcript's protein sequence using Ensembl API
def getSequence(transcript_id, timeout=30):
    """Return the first protein sequence Ensembl reports for a transcript.

    Args:
        transcript_id: Identifier appended to the Ensembl REST
            ``/sequence/id/`` endpoint.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout ``requests`` waits indefinitely on a stalled
            connection.

    Returns:
        The text of the first ``AAseq`` element in the seqXML response, or
        ``None`` when the request is not successful or the response holds
        no ``AAseq`` element.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
        xml.etree.ElementTree.ParseError: When the response is not valid XML.
    """
    server = 'http://rest.ensembl.org'
    ext = ('/sequence/id/' + transcript_id
           + '?content-type=text/x-seqxml%2Bxml;'
           'multiple_sequences=1;type=protein')
    req = requests.get(server + ext,
                       headers={"Content-Type": "text/plain"},
                       timeout=timeout)

    if not req.ok:
        return None

    root = ET.fromstring(req.content)
    # Only the first sequence is wanted even though several may be returned.
    for child in root.iter('AAseq'):
        return child.text
    return None
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
98
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# parses the proBED file as a list.
def loadProBED(filename=None):
    """Read a tab-separated proBED file into a list of rows.

    Args:
        filename: Path of the proBED file.  When omitted, the module-level
            ``probed_filename`` is used, which matches the original
            zero-argument call.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        tab-separated fields of that line, kept as strings.
    """
    if filename is None:
        filename = probed_filename
    with open(filename) as tsvin:
        # csv.reader yields [] for a blank line; such rows are dropped
        # because the rows are later indexed by column position.
        return [row for row in csv.reader(tsvin, delimiter='\t') if row]
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
107
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
write_header = True


# Creates a VCF file that only contains variants that overlap with the
# proteogenomic input (proBED) file if the user specifies that they want
# to only include intersected variants or if they want to receive the
# intersected VCF as well.
if probed_filename and (vcf_output or all_intersect == 'false'):
    proBED = loadProBED()
    if not vcf_output:
        vcf_output = 'intersected_input.vcf'
    # The output is opened in text mode: csv.writer emits str, so a binary
    # ('wb') handle raises TypeError on Python 3.
    with open(input_filename) as tsvin, open(vcf_output, 'w') as vcf_handle:
        tsvreader = csv.reader(tsvin, delimiter='\t')
        # NOTE(review): with QUOTE_NONE, escapechar=' ' makes the writer put
        # a space in front of special characters inside a field -- confirm
        # this is intended for VCF header lines.
        tsvout = csv.writer(vcf_handle, delimiter='\t', escapechar=' ',
                            quoting=csv.QUOTE_NONE)

        for row in tsvreader:
            # Blank lines and '#' header lines are copied through unchanged.
            # startswith() also copes with an empty first field, where
            # row[0][0] would raise IndexError.
            if not row or row[0].startswith('#'):
                tsvout.writerow(row)
                continue

            genchrom = row[0]
            genpos = int(row[1])

            # Keep the variant as soon as one peptide on the same chromosome
            # spans its position (both interval ends inclusive).
            for peptide in proBED:
                pepchrom = peptide[0]
                pepposA = int(peptide[1])
                pepposB = int(peptide[2])
                if genchrom == pepchrom and pepposA <= genpos <= pepposB:
                    tsvout.writerow(row)
                    break
    # When only intersected variants are wanted, the filtered VCF replaces
    # the original input for the rest of the script.
    if all_intersect == 'false':
        input_filename = vcf_output
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
142
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# Build the form fields that accompany the upload to the CRAVAT API.
parameters = {}
parameters['email'] = 'rsajulga@umn.edu'
# The hg19 switch is turned on only for the GRCh37 build.
if GRCh_build == 'GRCh37':
    parameters['hg19'] = 'on'
else:
    parameters['hg19'] = 'off'
parameters['functionalannotation'] = 'on'
parameters['tsvreport'] = 'on'
parameters['mupitinput'] = 'on'
# Optional fields are included only when a value was chosen.
if analysis_type:
    parameters['analyses'] = analysis_type
if chasm_classifier:
    parameters['chasmclassifier'] = chasm_classifier
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
153
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# plugs in params to given URL
# The input file is opened in binary mode for the multipart upload and is
# closed as soon as the request has been sent.
with open(input_filename, 'rb') as cravat_input:
    submit = requests.post('http://www.cravat.us/CRAVAT/rest/service/submit',
                           files={'inputfile': cravat_input},
                           data=parameters)

# makes the data a json dictionary; takes out only the job ID
jobid = json.loads(submit.text)['jobid']
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
161
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# Ask the CRAVAT status service about the job every two seconds and leave
# the loop once it reports 'Success'.
# NOTE(review): any other status keeps the loop running, so a job that never
# reaches 'Success' would poll forever -- confirm whether the service reports
# a terminal failure status that should end the loop.
while True:
    check = requests.get(
        'http://www.cravat.us/CRAVAT/rest/service/status',
        params={'jobid': jobid})
    job_state = json.loads(check.text)
    status = job_state['status']
    resultfileurl = job_state['resultfileurl']
    if status != 'Success':
        time.sleep(2)
        continue
    break
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
175
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# obtains the zipfile created by CRAVAT and loads the variants and VAD
# file for processing
r = requests.get(resultfileurl, stream=True)
# NOTE(review): the handle opened here is not used in the visible part of
# the script; it is kept in case later code reads `url` -- confirm and
# remove the extra request if nothing does.
url = urlopen(resultfileurl)
zipfile = ZipFile(BytesIO(r.content))
# Archive members live in a folder named after the job ID.  Member handles
# are closed once their lines have been read.
with zipfile.open(jobid + '/Variant.Result.tsv') as variant_member:
    variants = variant_member.readlines()
with zipfile.open(jobid + '/Variant_Additional_Details.Result.tsv') as vad_member:
    vad = vad_member.readlines()

# reads and writes the gene, noncoding, and error files
# Each destination is written inside a `with` block so the handle is flushed
# and closed instead of being left to the garbage collector.
for _member_name, _destination in (
        ('/Gene_Level_Analysis.Result.tsv', file_3),
        ('/Variant_Non-coding.Result.tsv', file_4),
        ('/Input_Errors.Result.tsv', file_5)):
    with open(_destination, 'wb') as _report:
        _report.write(zipfile.read(jobid + _member_name))
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
188
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
189
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
190
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# Loads the proBED peptides when a proBED file was supplied and no VCF output
# was requested.
# NOTE(review): when vcf_output is set, proBED is presumably defined earlier
# in the script -- confirm, since the loop below reads it either way.
if probed_filename and not vcf_output:
    proBED = loadProBED()

# First pass over the Variant Additional Details (VAD) rows: find every
# variant that falls inside a proBED peptide region and record
#   pep_muts: peptide sequence -> {protein sequence change -> [offsets]}
#   pep_map:  CRAVAT 'Input line' value -> [peptide sequences]
# Nothing is written here, so the output file is not opened in this pass.
if probed_filename:
    n = 11  # Index for proteogenomic column start
    # Raw strings: '\d' and '\.' are invalid escapes in a normal literal.
    reg_seq_change = re.compile(r'([A-Z]+)(\d+)([A-Z]+)')
    SOtranscripts = re.compile(r'([A-Z]+[\d\.]+):([A-Z]+\d+[A-Z]+)')
    pep_muts = {}
    pep_map = {}
    for row in vad:
        row = row.decode().split('\t')
        row[-1] = row[-1].replace('\n', '')
        # Skips blank lines and '#' comment lines.
        if not row[0] or row[0].startswith('#'):
            continue

        # The header row begins with 'Input line'; resolve the column
        # positions once instead of once per data row and peptide.
        if row[0].startswith('Input line'):
            vad_headers = row
            chrom_col = vad_headers.index('Chromosome')
            pos_col = vad_headers.index('Position')
            aa_change_col = vad_headers.index('Protein sequence change')
            input_line_col = vad_headers.index('Input line')
            strand_col = vad_headers.index('Strand')
            transcript_strand_col = vad_headers.index('S.O. transcript strand')
            continue

        # Screens the data row to catch mutations on the same peptide region.
        genchrom = row[chrom_col]
        genpos = int(row[pos_col])
        aa_change = row[aa_change_col]
        input_line = row[input_line_col]

        for peptide in proBED:
            pepseq = peptide[3]
            pepchrom = peptide[0]
            pepposA = int(peptide[1])
            pepposB = int(peptide[2])
            if genchrom != pepchrom or not pepposA <= genpos <= pepposB:
                continue

            strand = row[strand_col]
            transcript_strand = row[transcript_strand_col]

            # Calculates the position of the variant amino acid(s) on the
            # peptide: counted from the start coordinate when both strands
            # agree, and from the end coordinate when either strand is '-'
            # or the first estimate falls past the end of the peptide.
            # NOTE(review): the scraped source lost its indentation; the two
            # checks are treated as siblings here. If the strands differ and
            # neither is '-', aa_peppos keeps the value from a previous
            # iteration (or is undefined) -- confirm the intended handling.
            if transcript_strand == strand:
                aa_peppos = int(math.ceil((genpos - pepposA)/3.0) - 1)
            if (strand == '-' or transcript_strand == '-'
                    or aa_peppos >= len(pepseq)):
                aa_peppos = int(math.floor((pepposB - genpos)/3.0))

            known_offsets = pep_muts.setdefault(pepseq, {}).setdefault(aa_change, [])
            if aa_peppos not in known_offsets:
                known_offsets.append(aa_peppos)

            # Stores the intersect information by mapping
            # Input Line (CRAVAT output) to peptide sequence.
            mapped_peptides = pep_map.setdefault(input_line, [])
            if pepseq not in mapped_peptides:
                mapped_peptides.append(pepseq)
            # TODO: Need to obtain strand information as
            # well i.e., positive (+) or negative (-)
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
256
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
257
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
# Second pass: merge each Variant Result (VR) row with the Variant Additional
# Details (VAD) row at the same line index and write the combined table to
# output_filename. When a proBED file was supplied, two extra columns
# ('Reference peptide', 'Variant peptide') are inserted at column n and filled
# in for rows whose peptide could be verified against the sequence returned
# by getSequence().
with open(output_filename, 'w') as tsvout:
    # escapechar=None means "no escape character"; an empty string is
    # rejected by the csv module on Python 3.11 and later.
    tsvout = csv.writer(tsvout, delimiter='\t', escapechar=None,
                        quoting=csv.QUOTE_NONE)
    headers = []
    duplicate_indices = []

    # loops through each row in the Variant Result (VR) file
    for x, row in enumerate(variants):
        row = row.decode().split('\t')
        row[-1] = row[-1].replace('\n', '')
        # sets row_2 equal to the same row in the VAD file
        # NOTE(review): assumes both files have their rows in the same
        # order with the same number of lines -- confirm.
        row_2 = vad[x].decode().split('\t')
        row_2[-1] = row_2[-1].replace('\n', '')

        # drops blank rows and passes '#' comment rows through unchanged
        if not row[0]:
            continue
        if row[0].startswith('#'):
            tsvout.writerow(row)
            continue

        if row[0].startswith('Input line'):
            # starts from the VR headers and appends every VAD header that
            # is not already present, remembering which VAD columns were
            # duplicates so their values can be skipped in the data rows
            headers = row
            for i, value in enumerate(row_2):
                if value in headers:
                    duplicate_indices.append(i)
                else:
                    headers.append(value)
            # adds appropriate headers when proteomic input is supplied
            if probed_filename:
                headers.insert(n, 'Variant peptide')
                headers.insert(n, 'Reference peptide')
            tsvout.writerow(headers)
            continue

        # data row: VR values followed by the non-duplicated VAD values
        cells = list(row)
        cells.extend(value for i, value in enumerate(row_2)
                     if i not in duplicate_indices)

        # Verifies the peptides intersected previously through the
        # sequences returned by getSequence()
        if probed_filename:
            # placeholders for 'Reference peptide' and 'Variant peptide'
            cells.insert(n, '')
            cells.insert(n, '')
            input_line = cells[headers.index('Input line')]
            if input_line in pep_map:
                pepseq = pep_map[input_line][0]
                aa_changes = pep_muts[pepseq]
                transcript_id = cells[headers.index('S.O. transcript')]
                ref_fullseq = getSequence(transcript_id)

                # Checks the other S.O. transcripts if the primary
                # S.O. transcript has no sequence available; if they all
                # fail too, resubmits them without their '.N' extensions.
                if not ref_fullseq:
                    transcripts = cells[headers.index('S.O. all transcripts')]
                    for strip_extension in (False, True):
                        for transcript in transcripts.split(','):
                            if not transcript:
                                continue
                            mat = SOtranscripts.search(transcript)
                            if mat is None:
                                # entry is not of the form '<id>:<change>'
                                continue
                            candidate_id = mat.group(1)
                            if strip_extension:
                                candidate_id = candidate_id.split('.')[0]
                            ref_fullseq = getSequence(candidate_id)
                            if ref_fullseq:
                                # Keeps the first recorded peptide offset but
                                # switches to this transcript's change.
                                # dict.values() cannot be indexed on
                                # Python 3, hence next(iter(...)).
                                first_offsets = next(iter(aa_changes.values()))
                                aa_changes = {mat.group(2): [first_offsets[0]]}
                                break
                        if ref_fullseq:
                            break

                if ref_fullseq:
                    # Groups the amino acid changes by the protein position
                    # at which the peptide would start
                    positions = {}
                    for aa_change in aa_changes:
                        m = reg_seq_change.search(aa_change)
                        if m is None:
                            # change is not of the form <ref><pos><alt>
                            continue
                        aa_protpos = int(m.group(2))
                        aa_peppos = aa_changes[aa_change][0]
                        aa_startpos = aa_protpos - aa_peppos - 1
                        positions.setdefault(aa_startpos, []).append(aa_change)

                    # Goes through the groups to mutate the reference peptide
                    # (uses proBED peptide as a reference)
                    for pep_protpos in positions:
                        if pep_protpos < 0:
                            # a negative start would slice from the end of
                            # the sequence; no peptide can start there
                            continue
                        ref_seq = ref_fullseq[pep_protpos:pep_protpos+len(pepseq)]
                        mut_seq = ref_seq
                        for mut in positions[pep_protpos]:
                            m = reg_seq_change.search(mut)
                            ref_aa = m.group(1)
                            mut_pos = int(m.group(2))
                            alt_aa = m.group(3)
                            pep_mutpos = mut_pos - pep_protpos - 1
                            # an offset outside either peptide counts as a
                            # mismatch instead of raising or wrapping around
                            if not 0 <= pep_mutpos < min(len(ref_seq), len(pepseq)):
                                break
                            observed_aa = pepseq[pep_mutpos]
                            if (ref_seq[pep_mutpos] != ref_aa
                                    or observed_aa not in (ref_aa, alt_aa)):
                                break
                            # the proBED peptide carries either the reference
                            # or the alternate residue at this offset
                            mut_seq = (mut_seq[:pep_mutpos] + observed_aa
                                       + mut_seq[pep_mutpos+1:])
                        # Adds the mutated peptide and reference peptide if
                        # mutated correctly
                        if pepseq == mut_seq:
                            cells[n+1] = pepseq
                            cells[n] = ref_seq
        tsvout.writerow(cells)
2c7bcc1219fc Updated cravatool to version 1.0 with updated formatting and new CRAVAT target URL.
galaxyp
parents:
diff changeset
384