searchFile.py @ 1:6e3a843b6304 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author: cpt
date: Mon, 05 Jun 2023 02:53:18 +0000
comparison: 0:bd2ff2c7e806 -> 1:6e3a843b6304

##### Searches user-input file(s) (BLAST XML, GFF3, FASTA, and/or GenBank) for user-designated terms

import argparse
import os
import re

import gffutils  # required dependency
from Bio import SeqIO
from Bio.Blast import NCBIXML

import explodeJSON as ej

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

####### TERM FUNCTIONS
def dbaseTerms(terms, galaxy=True):
    """Index into the JSON term database and retrieve all requested term lists"""
    # NOTE: the galaxy flag is currently unused; kept for interface compatibility
    db_path = os.path.join(SCRIPT_DIR, "data/lysis-family-v1.0.3.json")
    db = ej.explodeJSON(db_path)
    db = db.readJSON()
    dbase_terms = []
    if terms:
        for term in terms:
            for t in term.split(","):
                # "None" placeholders are skipped; previously they reset the
                # whole list, wiping terms accumulated from earlier keys
                if t != "None":
                    dbase_terms.extend(db[t])
    return dbase_terms

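# For illustration only: the database maps category keys to term lists (the
# keys below are hypothetical), so a --dbaseTerms value of "holin,endolysin"
# expands to db["holin"] + db["endolysin"], e.g.
#   dbaseTerms(["holin,endolysin"]) -> ["holin", "antiholin", "endolysin", ...]
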
def userTerms(file, text):
    """Collect search terms supplied by the user (term file and/or free text)"""
    user_terms = []
    if file:
        with open(file.name) as term_file:
            user_terms.extend(term_file.read().splitlines())
    if text:
        # Galaxy encodes newlines in text parameters as "__cn__"
        if "__cn__" in str(text[0]):
            user_terms.extend(text[0].split("__cn__"))
        else:
            user_terms.extend(text)

    return user_terms

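# For example, Galaxy passes a multi-line text box as one __cn__-separated
# string (the terms here are hypothetical):
#   userTerms(file=None, text=["holin__cn__spanin"]) -> ["holin", "spanin"]
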
def glueTerms(dbase_terms, user_terms):
    """Glue dbaseTerms and userTerms together into the final query list"""
    glued = []
    if dbase_terms:
        glued.extend(dbase_terms)
    if user_terms:
        glued.extend(user_terms)

    return glued

####### FILE FUNCTIONS
def glueFiles(gff, gbk, fa, blast):
    """Flatten each argparse file list (a list of lists) into one list per file type"""
    gffs = []
    gbks = []
    fas = []
    blasts = []
    if gff:
        for gff_file in gff:
            gffs.extend(gff_file)
    if gbk:
        for gbk_file in gbk:
            gbks.extend(gbk_file)
    if fa:
        for fa_file in fa:
            fas.extend(fa_file)
    if blast:
        for blast_file in blast:
            blasts.extend(blast_file)

    return [gffs, gbks, fas, blasts]

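# Because each file flag uses nargs="*" together with action="append", argparse
# yields a list of lists: two --gff3_files flags give [[<file a>, <file b>],
# [<file c>]], which glueFiles flattens to [<file a>, <file b>, <file c>].
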
######## PARSE FILE FUNCTIONS
def readGFF3(files, search_list):
    """Search through GFF3 file(s) and collect matching features"""
    if not files:
        return []
    gff3_matches = []
    for idx, file in enumerate(files):
        print("Parsing - " + file.name)
        # each input gets its own sqlite db so repeated files do not collide
        db_name = str(idx) + "_file.db"
        gffutils.create_db(file.name, dbfn=db_name, force=True, keep_order=False)
        db = gffutils.FeatureDB(db_name)
        for feature in db.all_features():
            gff3_matches.extend(searchInput(str(feature), search_list=search_list))
        gff3_matches = list(set(gff3_matches))  # make sure we don't fluff the list
    gff3_matches.sort()
    return gff3_matches

def readGBK(files, search_list):
    """Search through GenBank file(s) and collect features whose product, note,
    or db_xref qualifier matches a query term"""
    if not files:
        return []
    gbk_matches = []
    for file in files:
        print("Parsing - " + file.name)
        record = SeqIO.read(file.name, "genbank")
        for feature in record.features:
            # .get() avoids the KeyError that previously skipped a feature
            # entirely when any one qualifier was absent; "db_xref" is the
            # standard GenBank qualifier key (the original "dbxref" never matched)
            if (
                searchInput(
                    str(feature.qualifiers.get("product", "")),
                    search_list=search_list,
                )
                or searchInput(
                    str(feature.qualifiers.get("note", "")), search_list=search_list
                )
                or searchInput(
                    str(feature.qualifiers.get("db_xref", "")),
                    search_list=search_list,
                )
            ):
                gbk_matches.append(str(feature))
        gbk_matches = list(set(gbk_matches))
    gbk_matches.sort()
    return gbk_matches

def readFASTA(files, search_list):
    """Search through FASTA file(s) and collect matching record descriptions"""
    if not files:
        return []
    fa_matches = []
    for file in files:
        print("Parsing - " + file.name)
        records = SeqIO.parse(file.name, "fasta")
        for record in records:
            fa_matches.extend(
                searchInput(record.description, search_list=search_list)
            )
        fa_matches = list(set(fa_matches))
    fa_matches.sort()
    return fa_matches

def readBLAST(files, search_list):
    """Search through BLAST XML file(s) and collect matching hit descriptions,
    tagged with their query IDs"""
    if not files:
        return []
    blast_matches = []
    for file in files:
        print("Parsing - " + file.name)
        with open(file.name) as xml_handle:
            for blast_record in NCBIXML.parse(xml_handle):
                for desc in blast_record.descriptions:
                    # a description line can hold several ">"-concatenated deflines;
                    # the original second branch dropped this inner loop, leaving
                    # each_ret undefined (or stale) for files after the first
                    for each_ret in prettifyXML(str(desc)):
                        blast_matches.extend(
                            searchInput(
                                each_ret,
                                search_list=search_list,
                                blast=True,
                                q_id=blast_record.query,
                            )
                        )
        blast_matches = list(set(blast_matches))
    blast_matches.sort()
    return blast_matches

######## SEARCH FILE FUNCTIONS
def searchInput(text, search_list, blast=False, q_id=None):
    """Search a string for each query term and return the unique matches"""
    output = []
    for search_term in search_list:
        if re.search(re.escape(search_term), text):
            if blast:
                # tag BLAST hits with the query ID and the term that matched
                add_query = (
                    "QueryID: "
                    + str(q_id)
                    + "\nSearchQuery: "
                    + search_term
                    + "\nMatch: "
                    + text
                    + "\n"
                )
                output.append(add_query)
            else:
                output.append(text)
    return list(set(output))

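# For illustration (hypothetical values): in blast mode each hit becomes a
# small text block, e.g.
#   searchInput("holin protein", ["holin"], blast=True, q_id="query_1")
#   -> ['QueryID: query_1\nSearchQuery: holin\nMatch: holin protein\n']
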
######## prettify-XML function
def prettifyXML(text):
    """Split a BLAST XML description string on ">" into individual deflines"""
    return text.split(">")

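# For example, with hypothetical concatenated deflines:
#   prettifyXML("gi|123| holin >gi|456| antiholin")
#   -> ["gi|123| holin ", "gi|456| antiholin"]
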
########## Output File Writer
def writeResults(gffs, gbks, fas, blasts, outName="termHits.txt"):
    """Write each result list to the output file under its own section header"""
    # outName is expected to be an open file object from argparse.FileType;
    # its .name is reopened here
    with open(outName.name, "w+") as out_file:
        if gffs:
            out_file.write(
                "\n==================== GFF3 Term Hits ====================\n\n"
            )
            for gff_hit in gffs:
                out_file.write(gff_hit + "\n")
        else:
            gffs = []
        if gbks:
            out_file.write(
                "\n==================== GBK Term Hits ====================\n\n"
            )
            for gbk_hit in gbks:
                out_file.write(gbk_hit + "\n")
        else:
            gbks = []
        if fas:
            out_file.write(
                "\n==================== FASTA Term Hits ====================\n\n"
            )
            for fa_hit in fas:
                out_file.write(fa_hit + "\n")
        else:
            fas = []
        if blasts:
            out_file.write(
                "\n==================== BLAST Term Hits ====================\n\n"
            )
            for blast_hit in blasts:
                out_file.write(blast_hit + "\n")
        else:
            blasts = []
        if len(gffs) or len(gbks) or len(fas) or len(blasts):
            print("Terms Found")
        else:
            out_file.write("No query matches, try again with new terms!")
            print("No query matches, try again with new terms!")

def write_gff3(gffs, outName="proxHits.gff3"):
    """Write output to a GFF3 file for the prox2lysis pipeline"""

    with open(outName.name, "w+") as out_file:
        out_file.write("##gff-version 3\n")
        if gffs:
            for gff_hit in gffs:
                out_file.write(gff_hit + "\n")
        else:
            out_file.write("##No terms were found from query set\n")

if __name__ == "__main__":
    print(os.getcwd())
    parser = argparse.ArgumentParser(
        description="Uses a selection of terms to query an input file for matching cases"
    )
    parser.add_argument(
        "--dbaseTerms", nargs="*", help="database terms to search"
    )  # a select option in Galaxy, based on the keys within the JSON database
    parser.add_argument(
        "--custom_txt",
        nargs="*",
        help="custom user input terms; in Galaxy, terms are __cn__ separated, otherwise space separated",
    )
    parser.add_argument(
        "--custom_file",
        type=argparse.FileType("r"),
        help="custom newline-separated search term file",
    )
    parser.add_argument(
        "--gff3_files",
        type=argparse.FileType("r"),
        nargs="*",
        action="append",
        help="GFF3 file(s); repeat the flag for multiple files",
    )
    parser.add_argument(
        "--gbk_files",
        type=argparse.FileType("r"),
        nargs="*",
        action="append",
        help="GenBank file(s); repeat the flag for multiple files",
    )
    parser.add_argument(
        "--fa_files",
        type=argparse.FileType("r"),
        nargs="*",
        action="append",
        help="FASTA file(s); repeat the flag for multiple files",
    )
    parser.add_argument(
        "--blast_files",
        type=argparse.FileType("r"),
        nargs="*",
        action="append",
        help="BLAST XML file(s); repeat the flag for multiple files",
    )
    parser.add_argument(
        "--output", type=argparse.FileType("w+"), default="termHits.txt"
    )
    parser.add_argument(
        "--prox", action="store_true", help="Use when running the prox2lysis pipeline"
    )
    args = parser.parse_args()

    ############ STEP I
    ##### Determine the user's query terms
    dbase_terms = dbaseTerms(terms=args.dbaseTerms, galaxy=True)
    user_terms = userTerms(file=args.custom_file, text=args.custom_txt)
    glued_terms = glueTerms(dbase_terms=dbase_terms, user_terms=user_terms)

    ############ STEP II
    ##### Build the list of matches for each file type
    files = glueFiles(
        gff=args.gff3_files,
        gbk=args.gbk_files,
        fa=args.fa_files,
        blast=args.blast_files,
    )
    gffs = readGFF3(files=files[0], search_list=glued_terms)
    gbks = readGBK(files=files[1], search_list=glued_terms)
    fas = readFASTA(files=files[2], search_list=glued_terms)
    blasts = readBLAST(files=files[3], search_list=glued_terms)

    ############ STEP III
    ##### Write the results to a text file or GFF3
    if args.prox:
        write_gff3(gffs, outName=args.output)
    else:
        writeResults(gffs, gbks, fas, blasts, outName=args.output)
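
# Example invocation (a sketch; the file names and database keys below are
# hypothetical, not part of the tool):
#   python searchFile.py \
#       --dbaseTerms "holin,endolysin" \
#       --custom_txt lysozyme \
#       --gff3_files annotations.gff3 \
#       --gbk_files phage.gbk \
#       --output termHits.txt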