comparison searchFile.py @ 1:6e3a843b6304 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:53:18 +0000
##### Searches user-supplied BLAST XML, GFF3, FASTA, and/or GenBank files for user-designated terms

import argparse
import os
import re

import explodeJSON as ej
import gffutils  # required dependency
from Bio import SeqIO
from Bio.Blast import NCBIXML

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

####### TERM FUNCTIONS
def dbaseTerms(terms, galaxy=True):
    """Index into the term database and retrieve all desired terms"""
    db_path = os.path.join(SCRIPT_DIR, "data/lysis-family-v1.0.3.json")
    db = ej.explodeJSON(db_path)
    db = db.readJSON()
    dbase_terms = []
    if terms:
        for term in terms:
            for t in term.split(","):
                if t != "None":  # "None" is a placeholder selection; skip it
                    dbase_terms.extend(db[t])
    return dbase_terms

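# A minimal sketch of the expected database layout (hypothetical contents; the
# real data/lysis-family-v1.0.3.json ships with the tool):
#   {"holin": ["holin", "pinholin"], "endolysin": ["endolysin", "lysozyme"]}
# With that layout, dbaseTerms(terms=["holin,endolysin"]) would return
#   ["holin", "pinholin", "endolysin", "lysozyme"]
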
def userTerms(file, text):
    """Collect search terms supplied by the user via file and/or free text"""
    user_terms = []
    if file:
        user_terms.extend(open(file.name).read().splitlines())
    if text:
        if "__cn__" in str(text[0]):
            # Galaxy passes multi-line text parameters with newlines encoded as "__cn__"
            user_terms.extend(str(text[0]).split("__cn__"))
        else:
            user_terms.extend(text)

    return user_terms

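# Example, assuming Galaxy's "__cn__" newline encoding for text parameters:
#   userTerms(file=None, text=["holin__cn__endolysin"]) -> ["holin", "endolysin"]
#   userTerms(file=None, text=["holin", "endolysin"])   -> ["holin", "endolysin"]
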
def glueTerms(dbase_terms, user_terms):
    """Glue dbaseTerms and userTerms together into the final query list"""
    glued = []
    if dbase_terms:
        glued.extend(dbase_terms)
    if user_terms:
        glued.extend(user_terms)

    return glued


####### FILE FUNCTIONS
def glueFiles(gff, gbk, fa, blast):
    """Flatten each argparse file group (a list of lists) into one list per file type"""
    gffs = []
    gbks = []
    fas = []
    blasts = []
    if gff:
        for gff_file in gff:
            gffs.extend(gff_file)
    if gbk:
        for gbk_file in gbk:
            gbks.extend(gbk_file)
    if fa:
        for fa_file in fa:
            fas.extend(fa_file)
    if blast:
        for blast_file in blast:
            blasts.extend(blast_file)
    files = [gffs, gbks, fas, blasts]

    return files

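# The returned structure is one flattened list of open file handles per type
# (f1/f2 are illustrative handles; argparse's append + nargs="*" yields a list
# of lists per flag):
#   files = glueFiles(gff=[[f1], [f2]], gbk=None, fa=None, blast=None)
#   files == [[f1, f2], [], [], []]   # indices: 0=GFF3, 1=GBK, 2=FASTA, 3=BLAST
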
######## PARSE FILE FUNCTIONS
def readGFF3(files, search_list):
    """Search through GFF3 file(s) and collect features that match the term list"""
    if files:
        gff3_matches = []
        for idx, file in enumerate(files):
            print("Parsing - " + file.name)
            dbfn = str(idx) + "_file.db"
            gffutils.create_db(file.name, dbfn=dbfn, force=True, keep_order=False)
            db = gffutils.FeatureDB(dbfn)
            for feature in db.all_features():
                gff3_matches.extend(
                    searchInput(str(feature), search_list=search_list)
                )
            gff3_matches = list(set(gff3_matches))  # make sure we don't fluff the list
        gff3_matches.sort()
        return gff3_matches
    return []


def readGBK(files, search_list):
    """Search GenBank feature qualifiers for term matches"""
    if files:
        gbk_matches = []
        for file in files:
            print("Parsing - " + file.name)
            record = SeqIO.read(file.name, "genbank")
            for feature in record.features:
                try:
                    if (
                        searchInput(
                            str(feature.qualifiers["product"]),
                            search_list=search_list,
                        )
                        or searchInput(
                            str(feature.qualifiers["note"]), search_list=search_list
                        )
                        or searchInput(
                            str(feature.qualifiers["db_xref"]),  # standard GenBank qualifier key is "db_xref"
                            search_list=search_list,
                        )
                    ):
                        gbk_matches.extend([str(feature)])
                except KeyError:
                    # a feature missing any of these qualifiers is skipped
                    continue
            gbk_matches = list(set(gbk_matches))
        gbk_matches.sort()
        return gbk_matches
    return []


def readFASTA(files, search_list):
    """Search FASTA description lines for term matches"""
    if files:
        fa_matches = []
        for file in files:
            print("Parsing - " + file.name)
            records = SeqIO.parse(file.name, "fasta")
            for record in records:
                fa_matches.extend(
                    searchInput(record.description, search_list=search_list)
                )
            fa_matches = list(set(fa_matches))
        fa_matches.sort()
        return fa_matches
    return []


def readBLAST(files, search_list):
    """Search BLAST XML description lines for term matches"""
    if files:
        blast_matches = []
        for file in files:
            print("Parsing - " + file.name)
            blast_records = NCBIXML.parse(open(file.name))
            for blast_record in blast_records:
                for desc in blast_record.descriptions:
                    pretty = prettifyXML(str(desc))
                    for each_ret in pretty:
                        blast_matches.extend(
                            searchInput(
                                each_ret,
                                search_list=search_list,
                                blast=True,
                                q_id=blast_record.query,
                            )
                        )
            blast_matches = list(set(blast_matches))
        blast_matches.sort()
        return blast_matches
    return []


######## SEARCH FILE FUNCTIONS
def searchInput(input, search_list, blast=False, q_id=None):
    """Search an input string against every term; return the unique matches"""
    output = []
    for search_term in search_list:
        if not re.search(re.escape(search_term), input):
            continue
        if blast:
            add_query = (
                "QueryID: "
                + str(q_id)
                + "\nSearchQuery: "
                + search_term
                + "\nMatch: "
                + input
                + "\n"
            )
            output.extend([add_query])
        else:
            output.extend([input])
    return list(set(output))

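# Example of the two output shapes (illustrative values):
#   searchInput("putative holin", ["holin"])
#     -> ["putative holin"]
#   searchInput("putative holin", ["holin"], blast=True, q_id="query_1")
#     -> ["QueryID: query_1\nSearchQuery: holin\nMatch: putative holin\n"]
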
######## prettify-XML function
def prettifyXML(input):
    """Split a BLAST XML description string on the '>' that separates merged hit titles"""
    return input.split(">")

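# BLAST description lines concatenate multiple hit titles with ">", e.g.
# (illustrative input):
#   prettifyXML("gi|1|ref|A holin protein >gi|2|ref|B endolysin")
#     -> ["gi|1|ref|A holin protein ", "gi|2|ref|B endolysin"]
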
########## Output File Writer
def writeResults(gffs, gbks, fas, blasts, outName="termHits.txt"):
    """Takes an input list for each parameter and writes each result to the output file"""

    with open(outName.name, "w+") as out_file:
        if gffs:
            out_file.writelines(
                "\n==================== GFF3 Term Hits ====================\n\n"
            )
            for gff_hits in gffs:
                out_file.writelines(gff_hits + "\n")
        else:
            gffs = []
        if gbks:
            out_file.writelines(
                "\n==================== GBK Term Hits ====================\n\n"
            )
            for gbk_hits in gbks:
                out_file.writelines(gbk_hits + "\n")
        else:
            gbks = []
        if fas:
            out_file.writelines(
                "\n==================== FASTA Term Hits ====================\n\n"
            )
            for fa_hits in fas:
                out_file.writelines(fa_hits + "\n")
        else:
            fas = []
        if blasts:
            out_file.writelines(
                "\n==================== BLAST Term Hits ====================\n\n"
            )
            for blast_hits in blasts:
                out_file.writelines(blast_hits + "\n")
        else:
            blasts = []
        if len(gffs) or len(gbks) or len(fas) or len(blasts):
            print("Terms Found")
        else:
            out_file.writelines("No query matches, try again with new terms!")
            print("No query matches, try again with new terms!")


def write_gff3(gffs, outName="proxHits.gff3"):
    """Writes output to a GFF3 file for the prox2lysis pipeline"""

    with open(outName.name, "w+") as out_file:
        out_file.writelines("##gff-version 3\n")
        if gffs:
            for gff_hits in gffs:
                out_file.writelines(gff_hits + "\n")
        else:
            out_file.writelines("##No terms were found from query set\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Uses a selection of terms to query input files for matching cases"
    )
    parser.add_argument(
        "--dbaseTerms", nargs="*", help="database terms to search"
    )  # select options correspond to KEYs within the JSON database
    parser.add_argument(
        "--custom_txt",
        nargs="*",
        help="custom user input terms; in Galaxy terms are __cn__ separated, otherwise space separated",
    )
    parser.add_argument(
        "--custom_file",
        type=argparse.FileType("r"),
        help="custom newline-separated search term file",
    )
    parser.add_argument(
        "--gff3_files",
        type=argparse.FileType("r"),
        nargs="*",
        action="append",
        help="GFF3 file(s); repeat the flag for multiple files",
    )
    parser.add_argument(
        "--gbk_files",
        type=argparse.FileType("r"),
        nargs="*",
        action="append",
        help="GenBank file(s); repeat the flag for multiple files",
    )
    parser.add_argument(
        "--fa_files",
        type=argparse.FileType("r"),
        nargs="*",
        action="append",
        help="FASTA file(s); repeat the flag for multiple files",
    )
    parser.add_argument(
        "--blast_files",
        type=argparse.FileType("r"),
        nargs="*",
        action="append",
        help="BLAST XML file(s); repeat the flag for multiple files",
    )
    parser.add_argument(
        "--output", type=argparse.FileType("w+"), default="termHits.txt"
    )
    parser.add_argument(
        "--prox", action="store_true", help="use when running the prox2lysis pipeline"
    )
    args = parser.parse_args()

    ############ STEP I
    ##### Determine the user's query terms
    dbase_terms = dbaseTerms(terms=args.dbaseTerms, galaxy=True)
    user_terms = userTerms(file=args.custom_file, text=args.custom_txt)
    glued_terms = glueTerms(dbase_terms=dbase_terms, user_terms=user_terms)

    ############ STEP II
    ##### Collect matches from each input file type
    files = glueFiles(
        gff=args.gff3_files,
        gbk=args.gbk_files,
        fa=args.fa_files,
        blast=args.blast_files,
    )
    gffs = readGFF3(files=files[0], search_list=glued_terms)
    gbks = readGBK(files=files[1], search_list=glued_terms)
    fas = readFASTA(files=files[2], search_list=glued_terms)
    blasts = readBLAST(files=files[3], search_list=glued_terms)

    ############ STEP III
    ##### Write results to a text file or GFF3
    if args.prox:
        write_gff3(gffs, outName=args.output)
    else:
        writeResults(gffs, gbks, fas, blasts, outName=args.output)
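
# A typical invocation (illustrative paths and term keys):
#   python searchFile.py --dbaseTerms holin,endolysin \
#       --custom_txt spanin --gff3_files annotation.gff3 \
#       --gbk_files record.gbk --output termHits.txt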