annotate unipept.py @ 1:0c1ee95282fa draft

Uploaded
author galaxyp
date Tue, 14 Apr 2015 16:44:22 -0400
parents 6430407e5869
children 34758ab8aaa4
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
1 #!/usr/bin/env python
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
2 """
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
3 #
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
4 #------------------------------------------------------------------------------
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
5 # University of Minnesota
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
6 # Copyright 2015, Regents of the University of Minnesota
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
7 #------------------------------------------------------------------------------
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
8 # Author:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
9 #
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
10 # James E Johnson
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
11 #
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
12 #------------------------------------------------------------------------------
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
13 """
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
14
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
15 import json
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
16 import logging
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
17 import optparse
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
18 from optparse import OptionParser
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
19 import os
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
20 import sys
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
21 import re
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
22 import urllib
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
23 import urllib2
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
24 try:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
25 import xml.etree.cElementTree as ET
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
26 except ImportError:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
27 import xml.etree.ElementTree as ET
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
28
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
29 def warn_err(msg,exit_code=1):
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
30 sys.stderr.write(msg)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
31 if exit_code:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
32 sys.exit(exit_code)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
33
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
34 pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name']
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
35 pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
36 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
37 pept2prot_column_order = ['peptide','uniprot_id','taxon_id']
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
38 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
39
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
40 def __main__():
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
41 version = '1.1'
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
42 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
43
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
44 def read_tabular(filepath,col):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
45 peptides = []
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
46 with open(filepath) as fp:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
47 for i,line in enumerate(fp):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
48 if line.strip() == '' or line.startswith('#'):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
49 continue
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
50 fields = line.rstrip('\n').split('\t')
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
51 peptide = fields[col]
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
52 if not re.match(pep_pat,peptide):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
53 warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,col,filepath),exit_code=invalid_ec)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
54 peptides.append(peptide)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
55 return peptides
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
56
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
57 def get_fasta_entries(fp):
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
58 name, seq = None, []
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
59 for line in fp:
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
60 line = line.rstrip()
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
61 if line.startswith(">"):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
62 if name: yield (name, ''.join(seq))
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
63 name, seq = line, []
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
64 else:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
65 seq.append(line)
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
66 if name: yield (name, ''.join(seq))
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
67
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
68 def read_fasta(filepath):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
69 peptides = []
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
70 with open(filepath) as fp:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
71 for id, peptide in get_fasta_entries(fp):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
72 if not re.match(pep_pat,peptide):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
73 warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,filepath),exit_code=invalid_ec)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
74 peptides.append(peptide)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
75 return peptides
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
76
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
77 def read_mzid(fp):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
78 peptides = []
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
79 for event, elem in ET.iterparse(fp):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
80 if event == 'end':
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
81 if re.search('PeptideSequence',elem.tag):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
82 peptides.append(elem.text)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
83 return peptides
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
84
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
85 def read_pepxml(fp):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
86 peptides = []
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
87 for event, elem in ET.iterparse(fp):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
88 if event == 'end':
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
89 if re.search('search_hit',elem.tag):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
90 peptides.append(elem.get('peptide'))
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
91 return peptides
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
92
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
93 def best_match(peptide,matches):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
94 if not matches:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
95 return None
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
96 elif len(matches) == 1:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
97 return matches[0].copy()
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
98 else:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
99 # find the most specific match (peptide is always the first column order field)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
100 for col in reversed(pept2lca_extra_column_order[1:]):
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
101 col_id = col+"_id" if options.extra else col
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
102 for match in matches:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
103 if 'taxon_rank' in match and match['taxon_rank'] == col:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
104 return match.copy()
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
105 if col_id in match and match[col_id]:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
106 return match.copy()
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
107 return None
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
108
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
109 #Parse Command Line
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
110 parser = optparse.OptionParser()
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
111 # unipept API choice
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
112 parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' )
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
113 # input files
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
114 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
115 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
116 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
117 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' )
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
118 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
119 # Unipept Flags
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
120 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
121 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' )
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
122 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
123 # output fields
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
124 parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' )
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
125 # Warn vs Error Flag
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
126 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' )
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
127 # output files
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
128 parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
129 parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
130 parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
131 parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' )
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
132 # debug
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
133 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' )
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
134 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' )
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
135 (options, args) = parser.parse_args()
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
136 if options.version:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
137 print >> sys.stdout,"%s" % version
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
138 sys.exit(0)
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
139 invalid_ec = 2 if options.strict else None
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
140 peptides = []
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
141 ## Get peptide sequences
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
142 if options.mzid:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
143 peptides += read_mzid(options.mzid)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
144 if options.pepxml:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
145 peptides += read_pepxml(options.pepxml)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
146 if options.tabular:
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
147 peptides += read_tabular(options.tabular,options.column)
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
148 if options.fasta:
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
149 peptides += read_fasta(options.fasta)
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
150 if args and len(args) > 0:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
151 for i,peptide in enumerate(args):
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
152 if not re.match(pep_pat,peptide):
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
153 warn_err('"%s" is not a peptide (arg %d)\n' % (peptide,i),exit_code=invalid_ec)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
154 peptides.append(peptide)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
155 if len(peptides) < 1:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
156 warn_err("No peptides input!",exit_code=1)
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
157 column_order = pept2lca_column_order
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
158 if options.unipept == 'pept2prot':
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
159 column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
160 else:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
161 if options.extra or options.names:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
162 column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
163 else:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
164 column_order = pept2lca_column_order
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
165 ## map to tryptic peptides
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
166 pepToParts = {p: re.split("\n", re.sub(r'(?<=[RK])(?=[^P])','\n', p)) for p in peptides}
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
167 partToPeps = {}
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
168 for peptide, parts in pepToParts.iteritems():
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
169 if options.debug: print >> sys.stdout, "peptide: %s\ttryptic: %s\n" % (peptide, parts)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
170 for part in parts:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
171 if len(part) > 50:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
172 warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
173 if 5 <= len(part) <= 50:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
174 partToPeps.setdefault(part,[]).append(peptide)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
175 trypticPeptides = partToPeps.keys()
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
176 ## unipept
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
177 post_data = []
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
178 if options.equate_il:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
179 post_data.append(("equate_il","true"))
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
180 if options.names:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
181 post_data.append(("extra","true"))
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
182 post_data.append(("names","true"))
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
183 elif options.extra:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
184 post_data.append(("extra","true"))
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
185 post_data += [('input[]', x) for x in trypticPeptides]
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
186 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
187 url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
188 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
189 unipept_resp = json.loads( urllib2.urlopen( req ).read() )
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
190 unmatched_peptides = []
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
191 peptideMatches = []
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
192 if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
193 if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa':
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
194 dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
195 ## multiple entries per trypticPeptide for pep2prot or pep2taxa
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
196 mapping = {}
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
197 for match in unipept_resp:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
198 mapping.setdefault(match['peptide'],[]).append(match)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
199 for peptide in peptides:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
200 # Get the intersection of matches to the tryptic parts
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
201 keyToMatch = None
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
202 for part in pepToParts[peptide]:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
203 if part in mapping:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
204 temp = {match[dupkey] : match for match in mapping[part]}
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
205 if keyToMatch:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
206 dkeys = set(keyToMatch.keys()) - set(temp.keys())
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
207 for k in dkeys:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
208 del keyToMatch[k]
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
209 else:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
210 keyToMatch = temp
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
211 ## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
212 if not keyToMatch:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
213 unmatched_peptides.append(peptide)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
214 else:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
215 for key,match in keyToMatch.iteritems():
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
216 match['tryptic_peptide'] = match['peptide']
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
217 match['peptide'] = peptide
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
218 peptideMatches.append(match)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
219 else:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
220 ## should be one response per trypticPeptide for pep2lca
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
221 respMap = {v['peptide']:v for v in unipept_resp}
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
222 ## map resp back to peptides
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
223 for peptide in peptides:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
224 matches = list()
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
225 for part in pepToParts[peptide]:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
226 if part in respMap:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
227 matches.append(respMap[part])
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
228 match = best_match(peptide,matches)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
229 if not match:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
230 unmatched_peptides.append(peptide)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
231 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
232 match = {'peptide' : longest_tryptic_peptide}
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
233 match['tryptic_peptide'] = match['peptide']
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
234 match['peptide'] = peptide
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
235 peptideMatches.append(match)
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
236 resp = peptideMatches
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
237 if options.debug: print >> sys.stdout,"\nmapped response: %s\n" % str(resp)
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
238 ## output results
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
239 if not (options.unmatched or options.json or options.tsv or options.csv):
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
240 print >> sys.stdout, str(resp)
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
241 if options.unmatched:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
242 with open(options.unmatched,'w') as outputFile:
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
243 for peptide in peptides:
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
244 if peptide in unmatched_peptides:
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
245 outputFile.write("%s\n" % peptide)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
246 if options.json:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
247 with open(options.json,'w') as outputFile:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
248 outputFile.write(str(resp))
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
249 if options.tsv or options.csv:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
250 # 'pept2lca','pept2taxa','pept2prot'
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
251 found_keys = set()
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
252 results = []
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
253 for i,pdict in enumerate(resp):
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
254 results.append(pdict)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
255 found_keys |= set(pdict.keys())
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
256 # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
257 column_names = []
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
258 column_keys = []
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
259 for col in column_order:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
260 if col in found_keys:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
261 column_names.append(col)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
262 column_keys.append(col)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
263 elif options.extra or options.names:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
264 col_id = col+'_id'
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
265 col_name = col+'_name'
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
266 if options.extra:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
267 if col_id in found_keys:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
268 column_names.append(col_id)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
269 column_keys.append(col_id)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
270 if options.names:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
271 if col_name in found_keys:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
272 column_names.append(col)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
273 column_keys.append(col_name)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
274 else:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
275 if col+'_name' in found_keys:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
276 column_names.append(col)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
277 column_keys.append(col+'_name')
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
278 elif col+'_id' in found_keys:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
279 column_names.append(col)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
280 column_keys.append(col+'_id')
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
281 # print >> sys.stderr, "%s\n%s" % (column_names,column_keys)
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
282 taxa = []
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
283 for i,pdict in enumerate(results):
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
284 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
1
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
285 if vals not in taxa:
0c1ee95282fa Uploaded
galaxyp
parents: 0
diff changeset
286 taxa.append(vals)
0
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
287 if options.tsv:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
288 with open(options.tsv,'w') as outputFile:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
289 outputFile.write("#%s\n"% '\t'.join(column_names))
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
290 for vals in taxa:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
291 outputFile.write("%s\n"% '\t'.join(vals))
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
292 if options.csv:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
293 with open(options.csv,'w') as outputFile:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
294 outputFile.write("%s\n"% ','.join(column_names))
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
295 for vals in taxa:
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
296 outputFile.write("%s\n"% ','.join(['"%s"' % (v if v else '') for v in vals]))
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
297
6430407e5869 Uploaded
galaxyp
parents:
diff changeset
298 if __name__ == "__main__" : __main__()