Mercurial > repos > galaxyp > unipept
comparison unipept.py @ 1:0c1ee95282fa draft
Uploaded
author | galaxyp |
---|---|
date | Tue, 14 Apr 2015 16:44:22 -0400 |
parents | 6430407e5869 |
children | 34758ab8aaa4 |
comparison
equal
deleted
inserted
replaced
0:6430407e5869 | 1:0c1ee95282fa |
---|---|
29 def warn_err(msg,exit_code=1): | 29 def warn_err(msg,exit_code=1): |
30 sys.stderr.write(msg) | 30 sys.stderr.write(msg) |
31 if exit_code: | 31 if exit_code: |
32 sys.exit(exit_code) | 32 sys.exit(exit_code) |
33 | 33 |
34 def read_fasta(fp): | 34 pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name'] |
35 pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ] | |
36 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] | |
37 pept2prot_column_order = ['peptide','uniprot_id','taxon_id'] | |
38 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] | |
39 | |
40 def __main__(): | |
41 version = '1.1' | |
42 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' | |
43 | |
44 def read_tabular(filepath,col): | |
45 peptides = [] | |
46 with open(filepath) as fp: | |
47 for i,line in enumerate(fp): | |
48 if line.strip() == '' or line.startswith('#'): | |
49 continue | |
50 fields = line.rstrip('\n').split('\t') | |
51 peptide = fields[col] | |
52 if not re.match(pep_pat,peptide): | |
53 warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,col,filepath),exit_code=invalid_ec) | |
54 peptides.append(peptide) | |
55 return peptides | |
56 | |
57 def get_fasta_entries(fp): | |
35 name, seq = None, [] | 58 name, seq = None, [] |
36 for line in fp: | 59 for line in fp: |
37 line = line.rstrip() | 60 line = line.rstrip() |
38 if line.startswith(">"): | 61 if line.startswith(">"): |
39 if name: yield (name, ''.join(seq)) | 62 if name: yield (name, ''.join(seq)) |
40 name, seq = line, [] | 63 name, seq = line, [] |
41 else: | 64 else: |
42 seq.append(line) | 65 seq.append(line) |
43 if name: yield (name, ''.join(seq)) | 66 if name: yield (name, ''.join(seq)) |
44 | 67 |
45 def read_mzid(fp): | 68 def read_fasta(filepath): |
46 peptides = [] | 69 peptides = [] |
47 for event, elem in ET.iterparse(fp): | 70 with open(filepath) as fp: |
48 if event == 'end': | 71 for id, peptide in get_fasta_entries(fp): |
49 if re.search('PeptideSequence',elem.tag): | 72 if not re.match(pep_pat,peptide): |
50 peptides.append(elem.text) | 73 warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,filepath),exit_code=invalid_ec) |
51 return peptides | 74 peptides.append(peptide) |
52 | 75 return peptides |
53 def read_pepxml(fp): | 76 |
54 peptides = [] | 77 def read_mzid(fp): |
55 for event, elem in ET.iterparse(fp): | 78 peptides = [] |
56 if event == 'end': | 79 for event, elem in ET.iterparse(fp): |
57 if re.search('search_hit',elem.tag): | 80 if event == 'end': |
58 peptides.append(elem.get('peptide')) | 81 if re.search('PeptideSequence',elem.tag): |
59 return peptides | 82 peptides.append(elem.text) |
60 | 83 return peptides |
61 def __main__(): | 84 |
85 def read_pepxml(fp): | |
86 peptides = [] | |
87 for event, elem in ET.iterparse(fp): | |
88 if event == 'end': | |
89 if re.search('search_hit',elem.tag): | |
90 peptides.append(elem.get('peptide')) | |
91 return peptides | |
92 | |
93 def best_match(peptide,matches): | |
94 if not matches: | |
95 return None | |
96 elif len(matches) == 1: | |
97 return matches[0].copy() | |
98 else: | |
99 # find the most specific match (peptide is always the first column order field) | |
100 for col in reversed(pept2lca_extra_column_order[1:]): | |
101 col_id = col+"_id" if options.extra else col | |
102 for match in matches: | |
103 if 'taxon_rank' in match and match['taxon_rank'] == col: | |
104 return match.copy() | |
105 if col_id in match and match[col_id]: | |
106 return match.copy() | |
107 return None | |
108 | |
62 #Parse Command Line | 109 #Parse Command Line |
63 parser = optparse.OptionParser() | 110 parser = optparse.OptionParser() |
64 # unipept API | 111 # unipept API choice |
65 parser.add_option( '-A', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' ) | 112 parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' ) |
66 # files | 113 # input files |
67 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' ) | 114 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' ) |
68 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' ) | 115 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' ) |
69 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' ) | 116 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' ) |
70 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' ) | 117 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' ) |
71 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) | 118 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) |
72 # Unipept Flags | 119 # Unipept Flags |
73 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' ) | 120 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' ) |
74 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' ) | 121 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' ) |
75 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' ) | 122 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' ) |
123 # output fields | |
124 parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' ) | |
76 # Warn vs Error Flag | 125 # Warn vs Error Flag |
77 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' ) | 126 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' ) |
78 # outputs | 127 # output files |
79 parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results') | 128 parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results') |
80 parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') | 129 parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') |
81 parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') | 130 parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') |
82 parser.add_option( '-M', '--mismatch', dest='mismatch', default=None, help='Output file path for peptide with no matches' ) | 131 parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' ) |
132 # debug | |
133 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' ) | |
134 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' ) | |
83 (options, args) = parser.parse_args() | 135 (options, args) = parser.parse_args() |
136 if options.version: | |
137 print >> sys.stdout,"%s" % version | |
138 sys.exit(0) | |
84 invalid_ec = 2 if options.strict else None | 139 invalid_ec = 2 if options.strict else None |
85 peptides = [] | 140 peptides = [] |
86 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' | |
87 ## Get peptide sequences | 141 ## Get peptide sequences |
88 if options.mzid: | 142 if options.mzid: |
89 peptides += read_mzid(options.mzid) | 143 peptides += read_mzid(options.mzid) |
90 if options.pepxml: | 144 if options.pepxml: |
91 peptides += read_pepxml(options.pepxml) | 145 peptides += read_pepxml(options.pepxml) |
92 if options.tabular: | 146 if options.tabular: |
93 with open(options.tabular) as fp: | 147 peptides += read_tabular(options.tabular,options.column) |
94 for i,line in enumerate(fp): | |
95 if line.strip() == '' or line.startswith('#'): | |
96 continue | |
97 fields = line.rstrip('\n').split('\t') | |
98 peptide = fields[options.column] | |
99 if not re.match(pep_pat,peptide): | |
100 warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,options.column,options.tabular),exit_code=invalid_ec) | |
101 peptides.append(peptide) | |
102 if options.fasta: | 148 if options.fasta: |
103 with open(options.fasta) as fp: | 149 peptides += read_fasta(options.fasta) |
104 for id, peptide in read_fasta(fp): | |
105 if not re.match(pep_pat,peptide): | |
106 warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,options.fasta),exit_code=invalid_ec) | |
107 peptides.append(peptide) | |
108 if args and len(args) > 0: | 150 if args and len(args) > 0: |
109 for i,peptide in enumerate(args): | 151 for i,peptide in enumerate(args): |
110 if not re.match(pep_pat,peptide): | 152 if not re.match(pep_pat,peptide): |
111 warn_err('"%s" is not a peptide (arg %d)\n' % (peptide,i),exit_code=invalid_ec) | 153 warn_err('"%s" is not a peptide (arg %d)\n' % (peptide,i),exit_code=invalid_ec) |
112 peptides.append(peptide) | 154 peptides.append(peptide) |
113 if len(peptides) < 1: | 155 if len(peptides) < 1: |
114 warn_err("No peptides input!",exit_code=1) | 156 warn_err("No peptides input!",exit_code=1) |
157 column_order = pept2lca_column_order | |
158 if options.unipept == 'pept2prot': | |
159 column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order | |
160 else: | |
161 if options.extra or options.names: | |
162 column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order | |
163 else: | |
164 column_order = pept2lca_column_order | |
165 ## map to tryptic peptides | |
166 pepToParts = {p: re.split("\n", re.sub(r'(?<=[RK])(?=[^P])','\n', p)) for p in peptides} | |
167 partToPeps = {} | |
168 for peptide, parts in pepToParts.iteritems(): | |
169 if options.debug: print >> sys.stdout, "peptide: %s\ttryptic: %s\n" % (peptide, parts) | |
170 for part in parts: | |
171 if len(part) > 50: | |
172 warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None) | |
173 if 5 <= len(part) <= 50: | |
174 partToPeps.setdefault(part,[]).append(peptide) | |
175 trypticPeptides = partToPeps.keys() | |
115 ## unipept | 176 ## unipept |
116 post_data = [] | 177 post_data = [] |
117 if options.equate_il: | 178 if options.equate_il: |
118 post_data.append(("equate_il","true")) | 179 post_data.append(("equate_il","true")) |
119 if options.names: | 180 if options.names: |
120 post_data.append(("extra","true")) | 181 post_data.append(("extra","true")) |
121 post_data.append(("names","true")) | 182 post_data.append(("names","true")) |
122 elif options.extra: | 183 elif options.extra: |
123 post_data.append(("extra","true")) | 184 post_data.append(("extra","true")) |
124 post_data += [('input[]', x) for x in peptides] | 185 post_data += [('input[]', x) for x in trypticPeptides] |
125 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} | 186 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} |
126 url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept | 187 url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept |
127 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) | 188 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) |
128 resp = json.loads( urllib2.urlopen( req ).read() ) | 189 unipept_resp = json.loads( urllib2.urlopen( req ).read() ) |
190 unmatched_peptides = [] | |
191 peptideMatches = [] | |
192 if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp) | |
193 if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa': | |
194 dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide | |
195 ## multiple entries per trypticPeptide for pep2prot or pep2taxa | |
196 mapping = {} | |
197 for match in unipept_resp: | |
198 mapping.setdefault(match['peptide'],[]).append(match) | |
199 for peptide in peptides: | |
200 # Get the intersection of matches to the tryptic parts | |
201 keyToMatch = None | |
202 for part in pepToParts[peptide]: | |
203 if part in mapping: | |
204 temp = {match[dupkey] : match for match in mapping[part]} | |
205 if keyToMatch: | |
206 dkeys = set(keyToMatch.keys()) - set(temp.keys()) | |
207 for k in dkeys: | |
208 del keyToMatch[k] | |
209 else: | |
210 keyToMatch = temp | |
211 ## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp | |
212 if not keyToMatch: | |
213 unmatched_peptides.append(peptide) | |
214 else: | |
215 for key,match in keyToMatch.iteritems(): | |
216 match['tryptic_peptide'] = match['peptide'] | |
217 match['peptide'] = peptide | |
218 peptideMatches.append(match) | |
219 else: | |
220 ## should be one response per trypticPeptide for pep2lca | |
221 respMap = {v['peptide']:v for v in unipept_resp} | |
222 ## map resp back to peptides | |
223 for peptide in peptides: | |
224 matches = list() | |
225 for part in pepToParts[peptide]: | |
226 if part in respMap: | |
227 matches.append(respMap[part]) | |
228 match = best_match(peptide,matches) | |
229 if not match: | |
230 unmatched_peptides.append(peptide) | |
231 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] | |
232 match = {'peptide' : longest_tryptic_peptide} | |
233 match['tryptic_peptide'] = match['peptide'] | |
234 match['peptide'] = peptide | |
235 peptideMatches.append(match) | |
236 resp = peptideMatches | |
237 if options.debug: print >> sys.stdout,"\nmapped response: %s\n" % str(resp) | |
129 ## output results | 238 ## output results |
130 if not (options.mismatch or options.json or options.tsv or options.csv): | 239 if not (options.unmatched or options.json or options.tsv or options.csv): |
131 print >> sys.stdout, str(resp) | 240 print >> sys.stdout, str(resp) |
132 if options.mismatch: | 241 if options.unmatched: |
133 peptides_matched = [] | 242 with open(options.unmatched,'w') as outputFile: |
134 for i,pdict in enumerate(resp): | |
135 peptides_matched.append(pdict['peptide']) | |
136 with open(options.mismatch,'w') as outputFile: | |
137 for peptide in peptides: | 243 for peptide in peptides: |
138 if not peptide in peptides_matched: | 244 if peptide in unmatched_peptides: |
139 outputFile.write("%s\n" % peptide) | 245 outputFile.write("%s\n" % peptide) |
140 if options.json: | 246 if options.json: |
141 with open(options.json,'w') as outputFile: | 247 with open(options.json,'w') as outputFile: |
142 outputFile.write(str(resp)) | 248 outputFile.write(str(resp)) |
143 if options.tsv or options.csv: | 249 if options.tsv or options.csv: |
144 # 'pept2lca','pept2taxa','pept2prot' | 250 # 'pept2lca','pept2taxa','pept2prot' |
145 pept2lca_column_order = [ 'peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class_','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ] | |
146 pept2prot_column_order = [ 'peptide','uniprot_id','taxon_id','taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] | |
147 column_order = pept2prot_column_order if options.unipept == 'pept2prot' else pept2lca_column_order | |
148 found_keys = set() | 251 found_keys = set() |
149 results = [] | 252 results = [] |
150 for i,pdict in enumerate(resp): | 253 for i,pdict in enumerate(resp): |
151 results.append(pdict) | 254 results.append(pdict) |
152 found_keys |= set(pdict.keys()) | 255 found_keys |= set(pdict.keys()) |
177 column_keys.append(col+'_id') | 280 column_keys.append(col+'_id') |
178 # print >> sys.stderr, "%s\n%s" % (column_names,column_keys) | 281 # print >> sys.stderr, "%s\n%s" % (column_names,column_keys) |
179 taxa = [] | 282 taxa = [] |
180 for i,pdict in enumerate(results): | 283 for i,pdict in enumerate(results): |
181 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys] | 284 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys] |
182 taxa.append(vals) | 285 if vals not in taxa: |
286 taxa.append(vals) | |
183 if options.tsv: | 287 if options.tsv: |
184 with open(options.tsv,'w') as outputFile: | 288 with open(options.tsv,'w') as outputFile: |
185 outputFile.write("#%s\n"% '\t'.join(column_names)) | 289 outputFile.write("#%s\n"% '\t'.join(column_names)) |
186 for vals in taxa: | 290 for vals in taxa: |
187 outputFile.write("%s\n"% '\t'.join(vals)) | 291 outputFile.write("%s\n"% '\t'.join(vals)) |