unipept.py @ 0:6430407e5869 draft

Uploaded

| author   | galaxyp                         |
|----------|---------------------------------|
| date     | Fri, 03 Apr 2015 14:55:49 -0400 |
| parents  |                                 |
| children | 0c1ee95282fa                    |
#!/usr/bin/env python
"""
#
#------------------------------------------------------------------------------
# University of Minnesota
# Copyright 2015, Regents of the University of Minnesota
#------------------------------------------------------------------------------
# Author:
#
#  James E Johnson
#
#------------------------------------------------------------------------------
"""

import json
import logging
import optparse
from optparse import OptionParser
import os
import sys
import re
import urllib
import urllib2
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

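# Example invocations (illustrative only; the file names are hypothetical):
#   python unipept.py --api pept2lca --tabular peptides.tab --column 0 --tsv lca.tsv
#   python unipept.py --api pept2prot --mzid identifications.mzid --json proteins.json
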
def warn_err(msg,exit_code=1):
    """Write msg to stderr and, if exit_code is truthy, exit with that code."""
    sys.stderr.write(msg)
    if exit_code:
        sys.exit(exit_code)
def read_fasta(fp):
    """Yield (header, sequence) tuples from an open FASTA file; the '>' is kept in the header."""
    name, seq = None, []
    for line in fp:
        line = line.rstrip()
        if line.startswith(">"):
            if name: yield (name, ''.join(seq))
            name, seq = line, []
        else:
            seq.append(line)
    if name: yield (name, ''.join(seq))

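# read_fasta example (illustrative): a record
#   >pep_1
#   MFLTRRA
#   LAGGE
# is yielded as ('>pep_1', 'MFLTRRALAGGE').
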
def read_mzid(fp):
    """Return the PeptideSequence values from an mzIdentML file (tags are matched by suffix to ignore XML namespaces)."""
    peptides = []
    for event, elem in ET.iterparse(fp):
        if event == 'end':
            if re.search('PeptideSequence',elem.tag):
                peptides.append(elem.text)
    return peptides

def read_pepxml(fp):
    """Return the peptide attribute of each search_hit element in a pepXML file."""
    peptides = []
    for event, elem in ET.iterparse(fp):
        if event == 'end':
            if re.search('search_hit',elem.tag):
                peptides.append(elem.get('peptide'))
    return peptides

def __main__():
    # Parse Command Line
    parser = optparse.OptionParser()
    # unipept API
    parser.add_option( '-A', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' )
    # files
    parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
    parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
    parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
    parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='An mzIdentML file containing peptide sequences' )
    parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepXML file containing peptide sequences' )
    # Unipept Flags
    parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='Isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
    parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='Return the complete lineage of the taxonomic lowest common ancestor' )
    parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='Return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
    # Warn vs Error Flag
    parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print an error and exit on an invalid peptide' )
    # outputs
    parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for JSON formatted results')
    parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
    parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
    parser.add_option( '-M', '--mismatch', dest='mismatch', default=None, help='Output file path for peptides with no matches' )
    (options, args) = parser.parse_args()
    invalid_ec = 2 if options.strict else None
    peptides = []
    pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
    ## Get peptide sequences
    if options.mzid:
        peptides += read_mzid(options.mzid)
    if options.pepxml:
        peptides += read_pepxml(options.pepxml)
    if options.tabular:
        with open(options.tabular) as fp:
            for i,line in enumerate(fp):
                if line.strip() == '' or line.startswith('#'):
                    continue
                fields = line.rstrip('\n').split('\t')
                peptide = fields[options.column]
                if not re.match(pep_pat,peptide):
                    warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,options.column,options.tabular),exit_code=invalid_ec)
                peptides.append(peptide)
    if options.fasta:
        with open(options.fasta) as fp:
            for id, peptide in read_fasta(fp):
                if not re.match(pep_pat,peptide):
                    warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,options.fasta),exit_code=invalid_ec)
                peptides.append(peptide)
    if args and len(args) > 0:
        for i,peptide in enumerate(args):
            if not re.match(pep_pat,peptide):
                warn_err('"%s" is not a peptide (arg %d)\n' % (peptide,i),exit_code=invalid_ec)
            peptides.append(peptide)
    if len(peptides) < 1:
        warn_err("No peptides input!\n",exit_code=1)
    ## unipept
    post_data = []
    if options.equate_il:
        post_data.append(("equate_il","true"))
    if options.names:
        post_data.append(("extra","true"))
        post_data.append(("names","true"))
    elif options.extra:
        post_data.append(("extra","true"))
    post_data += [('input[]', x) for x in peptides]
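    # Illustrative encoded form body (the peptide values are hypothetical):
    #   equate_il=true&input%5B%5D=AALTER&input%5B%5D=MDGTEYIIVK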
    headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}
    url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept
    req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
    resp = json.loads( urllib2.urlopen( req ).read() )
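    # The response is a JSON list with one object per matched peptide, always
    # keyed by 'peptide' (illustrative record; the remaining fields depend on
    # the chosen API and the --extra/--names flags):
    #   {"peptide": "AALTER", "taxon_id": 1, "taxon_name": "root", ...}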
    ## output results
    if not (options.mismatch or options.json or options.tsv or options.csv):
        print >> sys.stdout, str(resp)
    if options.mismatch:
        peptides_matched = []
        for i,pdict in enumerate(resp):
            peptides_matched.append(pdict['peptide'])
        with open(options.mismatch,'w') as outputFile:
            for peptide in peptides:
                if not peptide in peptides_matched:
                    outputFile.write("%s\n" % peptide)
    if options.json:
        # Write valid JSON rather than the Python repr of the response list.
        with open(options.json,'w') as outputFile:
            outputFile.write(json.dumps(resp))
    if options.tsv or options.csv:
        # 'pept2lca','pept2taxa','pept2prot'
        pept2lca_column_order = [ 'peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class_','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
        pept2prot_column_order = [ 'peptide','uniprot_id','taxon_id','taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
        column_order = pept2prot_column_order if options.unipept == 'pept2prot' else pept2lca_column_order
        found_keys = set()
        results = []
        for i,pdict in enumerate(resp):
            results.append(pdict)
            found_keys |= set(pdict.keys())
            # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys)
        column_names = []
        column_keys = []
        for col in column_order:
            if col in found_keys:
                column_names.append(col)
                column_keys.append(col)
            elif options.extra or options.names:
                col_id = col+'_id'
                col_name = col+'_name'
                if options.extra:
                    if col_id in found_keys:
                        column_names.append(col_id)
                        column_keys.append(col_id)
                if options.names:
                    if col_name in found_keys:
                        column_names.append(col)
                        column_keys.append(col_name)
            else:
                if col+'_name' in found_keys:
                    column_names.append(col)
                    column_keys.append(col+'_name')
                elif col+'_id' in found_keys:
                    column_names.append(col)
                    column_keys.append(col+'_id')
        # print >> sys.stderr, "%s\n%s" % (column_names,column_keys)
        taxa = []
        for i,pdict in enumerate(results):
            vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
            taxa.append(vals)
        if options.tsv:
            with open(options.tsv,'w') as outputFile:
                outputFile.write("#%s\n"% '\t'.join(column_names))
                for vals in taxa:
                    outputFile.write("%s\n"% '\t'.join(vals))
        if options.csv:
            with open(options.csv,'w') as outputFile:
                outputFile.write("%s\n"% ','.join(column_names))
                for vals in taxa:
                    outputFile.write("%s\n"% ','.join(['"%s"' % (v if v else '') for v in vals]))

if __name__ == "__main__" : __main__()