comparison unipept.py @ 1:0c1ee95282fa draft

Uploaded
author galaxyp
date Tue, 14 Apr 2015 16:44:22 -0400
parents 6430407e5869
children 34758ab8aaa4
comparison
equal deleted inserted replaced
0:6430407e5869 1:0c1ee95282fa
29 def warn_err(msg,exit_code=1): 29 def warn_err(msg,exit_code=1):
30 sys.stderr.write(msg) 30 sys.stderr.write(msg)
31 if exit_code: 31 if exit_code:
32 sys.exit(exit_code) 32 sys.exit(exit_code)
33 33
34 def read_fasta(fp): 34 pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name']
35 pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
36 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
37 pept2prot_column_order = ['peptide','uniprot_id','taxon_id']
38 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
39
40 def __main__():
41 version = '1.1'
42 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
43
44 def read_tabular(filepath,col):
45 peptides = []
46 with open(filepath) as fp:
47 for i,line in enumerate(fp):
48 if line.strip() == '' or line.startswith('#'):
49 continue
50 fields = line.rstrip('\n').split('\t')
51 peptide = fields[col]
52 if not re.match(pep_pat,peptide):
53 warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,col,filepath),exit_code=invalid_ec)
54 peptides.append(peptide)
55 return peptides
56
57 def get_fasta_entries(fp):
35 name, seq = None, [] 58 name, seq = None, []
36 for line in fp: 59 for line in fp:
37 line = line.rstrip() 60 line = line.rstrip()
38 if line.startswith(">"): 61 if line.startswith(">"):
39 if name: yield (name, ''.join(seq)) 62 if name: yield (name, ''.join(seq))
40 name, seq = line, [] 63 name, seq = line, []
41 else: 64 else:
42 seq.append(line) 65 seq.append(line)
43 if name: yield (name, ''.join(seq)) 66 if name: yield (name, ''.join(seq))
44 67
45 def read_mzid(fp): 68 def read_fasta(filepath):
46 peptides = [] 69 peptides = []
47 for event, elem in ET.iterparse(fp): 70 with open(filepath) as fp:
48 if event == 'end': 71 for id, peptide in get_fasta_entries(fp):
49 if re.search('PeptideSequence',elem.tag): 72 if not re.match(pep_pat,peptide):
50 peptides.append(elem.text) 73 warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,filepath),exit_code=invalid_ec)
51 return peptides 74 peptides.append(peptide)
52 75 return peptides
53 def read_pepxml(fp): 76
54 peptides = [] 77 def read_mzid(fp):
55 for event, elem in ET.iterparse(fp): 78 peptides = []
56 if event == 'end': 79 for event, elem in ET.iterparse(fp):
57 if re.search('search_hit',elem.tag): 80 if event == 'end':
58 peptides.append(elem.get('peptide')) 81 if re.search('PeptideSequence',elem.tag):
59 return peptides 82 peptides.append(elem.text)
60 83 return peptides
61 def __main__(): 84
85 def read_pepxml(fp):
86 peptides = []
87 for event, elem in ET.iterparse(fp):
88 if event == 'end':
89 if re.search('search_hit',elem.tag):
90 peptides.append(elem.get('peptide'))
91 return peptides
92
93 def best_match(peptide,matches):
94 if not matches:
95 return None
96 elif len(matches) == 1:
97 return matches[0].copy()
98 else:
99 # find the most specific match (peptide is always the first column order field)
100 for col in reversed(pept2lca_extra_column_order[1:]):
101 col_id = col+"_id" if options.extra else col
102 for match in matches:
103 if 'taxon_rank' in match and match['taxon_rank'] == col:
104 return match.copy()
105 if col_id in match and match[col_id]:
106 return match.copy()
107 return None
108
62 #Parse Command Line 109 #Parse Command Line
63 parser = optparse.OptionParser() 110 parser = optparse.OptionParser()
64 # unipept API 111 # unipept API choice
65 parser.add_option( '-A', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' ) 112 parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' )
66 # files 113 # input files
67 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' ) 114 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
68 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' ) 115 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
69 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' ) 116 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
70 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' ) 117 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' )
71 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) 118 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
72 # Unipept Flags 119 # Unipept Flags
73 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' ) 120 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
74 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' ) 121 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' )
75 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' ) 122 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
123 # output fields
124 parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' )
76 # Warn vs Error Flag 125 # Warn vs Error Flag
77 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' ) 126 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' )
78 # outputs 127 # output files
79 parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results') 128 parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
80 parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') 129 parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
81 parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') 130 parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
82 parser.add_option( '-M', '--mismatch', dest='mismatch', default=None, help='Output file path for peptide with no matches' ) 131 parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' )
132 # debug
133 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' )
134 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' )
83 (options, args) = parser.parse_args() 135 (options, args) = parser.parse_args()
136 if options.version:
137 print >> sys.stdout,"%s" % version
138 sys.exit(0)
84 invalid_ec = 2 if options.strict else None 139 invalid_ec = 2 if options.strict else None
85 peptides = [] 140 peptides = []
86 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
87 ## Get peptide sequences 141 ## Get peptide sequences
88 if options.mzid: 142 if options.mzid:
89 peptides += read_mzid(options.mzid) 143 peptides += read_mzid(options.mzid)
90 if options.pepxml: 144 if options.pepxml:
91 peptides += read_pepxml(options.pepxml) 145 peptides += read_pepxml(options.pepxml)
92 if options.tabular: 146 if options.tabular:
93 with open(options.tabular) as fp: 147 peptides += read_tabular(options.tabular,options.column)
94 for i,line in enumerate(fp):
95 if line.strip() == '' or line.startswith('#'):
96 continue
97 fields = line.rstrip('\n').split('\t')
98 peptide = fields[options.column]
99 if not re.match(pep_pat,peptide):
100 warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,options.column,options.tabular),exit_code=invalid_ec)
101 peptides.append(peptide)
102 if options.fasta: 148 if options.fasta:
103 with open(options.fasta) as fp: 149 peptides += read_fasta(options.fasta)
104 for id, peptide in read_fasta(fp):
105 if not re.match(pep_pat,peptide):
106 warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,options.fasta),exit_code=invalid_ec)
107 peptides.append(peptide)
108 if args and len(args) > 0: 150 if args and len(args) > 0:
109 for i,peptide in enumerate(args): 151 for i,peptide in enumerate(args):
110 if not re.match(pep_pat,peptide): 152 if not re.match(pep_pat,peptide):
111 warn_err('"%s" is not a peptide (arg %d)\n' % (peptide,i),exit_code=invalid_ec) 153 warn_err('"%s" is not a peptide (arg %d)\n' % (peptide,i),exit_code=invalid_ec)
112 peptides.append(peptide) 154 peptides.append(peptide)
113 if len(peptides) < 1: 155 if len(peptides) < 1:
114 warn_err("No peptides input!",exit_code=1) 156 warn_err("No peptides input!",exit_code=1)
157 column_order = pept2lca_column_order
158 if options.unipept == 'pept2prot':
159 column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
160 else:
161 if options.extra or options.names:
162 column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
163 else:
164 column_order = pept2lca_column_order
165 ## map to tryptic peptides
166 pepToParts = {p: re.split("\n", re.sub(r'(?<=[RK])(?=[^P])','\n', p)) for p in peptides}
167 partToPeps = {}
168 for peptide, parts in pepToParts.iteritems():
169 if options.debug: print >> sys.stdout, "peptide: %s\ttryptic: %s\n" % (peptide, parts)
170 for part in parts:
171 if len(part) > 50:
172 warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None)
173 if 5 <= len(part) <= 50:
174 partToPeps.setdefault(part,[]).append(peptide)
175 trypticPeptides = partToPeps.keys()
115 ## unipept 176 ## unipept
116 post_data = [] 177 post_data = []
117 if options.equate_il: 178 if options.equate_il:
118 post_data.append(("equate_il","true")) 179 post_data.append(("equate_il","true"))
119 if options.names: 180 if options.names:
120 post_data.append(("extra","true")) 181 post_data.append(("extra","true"))
121 post_data.append(("names","true")) 182 post_data.append(("names","true"))
122 elif options.extra: 183 elif options.extra:
123 post_data.append(("extra","true")) 184 post_data.append(("extra","true"))
124 post_data += [('input[]', x) for x in peptides] 185 post_data += [('input[]', x) for x in trypticPeptides]
125 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} 186 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}
126 url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept 187 url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept
127 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) 188 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
128 resp = json.loads( urllib2.urlopen( req ).read() ) 189 unipept_resp = json.loads( urllib2.urlopen( req ).read() )
190 unmatched_peptides = []
191 peptideMatches = []
192 if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp)
193 if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa':
194 dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide
195 ## multiple entries per trypticPeptide for pep2prot or pep2taxa
196 mapping = {}
197 for match in unipept_resp:
198 mapping.setdefault(match['peptide'],[]).append(match)
199 for peptide in peptides:
200 # Get the intersection of matches to the tryptic parts
201 keyToMatch = None
202 for part in pepToParts[peptide]:
203 if part in mapping:
204 temp = {match[dupkey] : match for match in mapping[part]}
205 if keyToMatch:
206 dkeys = set(keyToMatch.keys()) - set(temp.keys())
207 for k in dkeys:
208 del keyToMatch[k]
209 else:
210 keyToMatch = temp
211 ## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp
212 if not keyToMatch:
213 unmatched_peptides.append(peptide)
214 else:
215 for key,match in keyToMatch.iteritems():
216 match['tryptic_peptide'] = match['peptide']
217 match['peptide'] = peptide
218 peptideMatches.append(match)
219 else:
220 ## should be one response per trypticPeptide for pep2lca
221 respMap = {v['peptide']:v for v in unipept_resp}
222 ## map resp back to peptides
223 for peptide in peptides:
224 matches = list()
225 for part in pepToParts[peptide]:
226 if part in respMap:
227 matches.append(respMap[part])
228 match = best_match(peptide,matches)
229 if not match:
230 unmatched_peptides.append(peptide)
231 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
232 match = {'peptide' : longest_tryptic_peptide}
233 match['tryptic_peptide'] = match['peptide']
234 match['peptide'] = peptide
235 peptideMatches.append(match)
236 resp = peptideMatches
237 if options.debug: print >> sys.stdout,"\nmapped response: %s\n" % str(resp)
129 ## output results 238 ## output results
130 if not (options.mismatch or options.json or options.tsv or options.csv): 239 if not (options.unmatched or options.json or options.tsv or options.csv):
131 print >> sys.stdout, str(resp) 240 print >> sys.stdout, str(resp)
132 if options.mismatch: 241 if options.unmatched:
133 peptides_matched = [] 242 with open(options.unmatched,'w') as outputFile:
134 for i,pdict in enumerate(resp):
135 peptides_matched.append(pdict['peptide'])
136 with open(options.mismatch,'w') as outputFile:
137 for peptide in peptides: 243 for peptide in peptides:
138 if not peptide in peptides_matched: 244 if peptide in unmatched_peptides:
139 outputFile.write("%s\n" % peptide) 245 outputFile.write("%s\n" % peptide)
140 if options.json: 246 if options.json:
141 with open(options.json,'w') as outputFile: 247 with open(options.json,'w') as outputFile:
142 outputFile.write(str(resp)) 248 outputFile.write(str(resp))
143 if options.tsv or options.csv: 249 if options.tsv or options.csv:
144 # 'pept2lca','pept2taxa','pept2prot' 250 # 'pept2lca','pept2taxa','pept2prot'
145 pept2lca_column_order = [ 'peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class_','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
146 pept2prot_column_order = [ 'peptide','uniprot_id','taxon_id','taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
147 column_order = pept2prot_column_order if options.unipept == 'pept2prot' else pept2lca_column_order
148 found_keys = set() 251 found_keys = set()
149 results = [] 252 results = []
150 for i,pdict in enumerate(resp): 253 for i,pdict in enumerate(resp):
151 results.append(pdict) 254 results.append(pdict)
152 found_keys |= set(pdict.keys()) 255 found_keys |= set(pdict.keys())
177 column_keys.append(col+'_id') 280 column_keys.append(col+'_id')
178 # print >> sys.stderr, "%s\n%s" % (column_names,column_keys) 281 # print >> sys.stderr, "%s\n%s" % (column_names,column_keys)
179 taxa = [] 282 taxa = []
180 for i,pdict in enumerate(results): 283 for i,pdict in enumerate(results):
181 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys] 284 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
182 taxa.append(vals) 285 if vals not in taxa:
286 taxa.append(vals)
183 if options.tsv: 287 if options.tsv:
184 with open(options.tsv,'w') as outputFile: 288 with open(options.tsv,'w') as outputFile:
185 outputFile.write("#%s\n"% '\t'.join(column_names)) 289 outputFile.write("#%s\n"% '\t'.join(column_names))
186 for vals in taxa: 290 for vals in taxa:
187 outputFile.write("%s\n"% '\t'.join(vals)) 291 outputFile.write("%s\n"% '\t'.join(vals))