Mercurial repo damion/blast_reporting: common.py @ 0:7db7ecc78ad6 (draft)
author: damion
date:   Mon, 02 Mar 2015 20:46:00 -0500
commit: Uploaded
import os.path
import sys
import re
import optparse
import subprocess
import difflib
import csv
from shutil import move

re_default_query_id = re.compile(r"^Query_\d+$")
#assert re_default_query_id.match("Query_101")
#assert not re_default_query_id.match("Query_101a")
#assert not re_default_query_id.match("MyQuery_101")
re_default_subject_id = re.compile(r"^(Subject_|gnl\|BL_ORD_ID\|)\d+$") #requires some kind of numeric id
#assert re_default_subject_id.match("gnl|BL_ORD_ID|221")
#assert re_default_subject_id.match("Subject_1")
#assert not re_default_subject_id.match("Subject_")
#assert not re_default_subject_id.match("Subject_12a")
#assert not re_default_subject_id.match("TheSubject_1")
# Spot sequence ids that have accession ids in them
re_default_ncbi_id = re.compile(r"^gi\|\d+\|[a-z]+\|[a-zA-Z0-9_]+(\.\d+)?\|")
re_default_ref_id = re.compile(r"^ref\|[a-zA-Z0-9_]+\|[a-zA-Z0-9_]+(\.\d+)?\|")


def stop_err( msg ):
    sys.stderr.write("%s\n" % msg)
    sys.exit(1)

class MyParser(optparse.OptionParser):
    """
    From http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output
    Improves optparse's display of the epilog portion of help info: the default
    formatter strips line breaks, so this returns the epilog verbatim to preserve them.
    """
    def format_epilog(self, formatter):
        return self.epilog


## *********************************** FieldFilter ****************************
class FieldFilter(object):

    def __init__(self, tagGroup, options):
        """ Creates a dictionary of fields that are to be filtered, and an array of comparators and their values.
        Numeric filters have a single numeric value.
        Each text filter is a string of phrases separated by "|".
        Fields are separated by ";"; a field's constraints are separated by ",".

        e.g. filters = "pident: gte 97;score: gt 37;sallseqdescr: includes what|have you"

        @param filters string e.g. "[field name]: [comparator] [value][,[comparator] [value]]*[;...]*"
        @result .dict dictionary contains field name keys and arrays of [comparator, filterValue]

        """
        self.dict = {}
        self.comparators = {
            '==': lambda x,y: float(x) == float(y),
            '!=': lambda x,y: float(x) != float(y),
            'gt': lambda x,y: float(x) > float(y),
            'gte': lambda x,y: float(x) >= float(y),
            'lt': lambda x,y: float(x) < float(y),
            'lte': lambda x,y: float(x) <= float(y),
            'includes': self.includesPhrase,
            'excludes': self.excludesPhrase
        }
        self.matches = {}
        self.drop_redundant_hits = options.drop_redundant_hits

        if options.filters is not None:
            cleaned_filters = []
            for filterParam in options.filters.strip().strip(';').split(';'):
                filterSpec = filterParam.strip().split(":")
                filterField = filterSpec[0].strip()
                if len(filterField) > 0:
                    if filterField in self.dict:
                        stop_err("Filter field listed twice: \"" + filterField + "\". Please move constraints up to first use of field!")
                    field_name = cleanField(tagGroup.columns_in, filterField, 'Invalid field for filtering')
                    if len(filterSpec) > 1: #we have start of filter field defn. "[field]:[crit]+,"

                        self.dict[field_name] = [] #new entry for filter field

                        for criterion in filterSpec[1].strip().strip(',').split(','):
                            filterSpec2 = criterion.strip().split(' ')
                            comparator = filterSpec2[0]
                            if not comparator in self.comparators:
                                stop_err("Invalid comparator for field filter: \"" + comparator + "\"")
                            if len(filterSpec2) < 2:
                                stop_err("Missing value for field comparator: \"" + comparator + "\"")

                            #For text search, values are a trimmed array of phrases
                            if comparator in ['includes','excludes']:
                                filterValue = list(map(str.strip, ' '.join(filterSpec2[1:]).split('|')))
                                filterValue = [v for v in filterValue if v] #drop empty phrases
                            else:
                                filterValue = filterSpec2[1]

                            self.dict[field_name].append([comparator, filterValue])

                        cleaned_filters.append(field_name + ':' + filterSpec[1])

            options.filters = ';'.join(cleaned_filters)
            # Convert comparator words to symbols for display (e.g. "gte" -> ">=").
            words = {'gt':'>', 'gte':'>=', 'lt':'<', 'lte':'<=', ',':'', ':':' '}
            options.filters_HTML = word_replace_all(options.filters, words)
            options.filters = word_replace_all(options.filters, words)

        else:
            options.filters = None
            options.filters_HTML = ''

    def __str__(self):
        # FieldFilter defines no label attribute; show just the filter dict.
        return "dict: %s" % str(self.dict)

    def includesPhrase(self, source, filter_phrases):
        """ Test whether any of the given phrases occur in the source string.
        @param source string Text to search
        @param filter_phrases array of phrases

        """
        return any(x in source for x in filter_phrases)

    def excludesPhrase(self, source, filter_phrases):
        return not self.includesPhrase(source, filter_phrases)
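
    # Example (hypothetical input): includesPhrase("rhodopsin [Felis catus]", ['opsin', 'crystallin'])
    # returns True: matching is plain substring containment, not whole-word matching.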

    def process(self, record):
        """ For given record (an object) cycle through filters to see if any of record's attributes fail filter conditions.

        FUTURE: MAKE GENERIC SO PASSED record field function for unique test???

        @uses self.dict
        @uses self.drop_redundant_hits
        @uses self.matches
        @param record object An object containing fields & values read from a <hit> line.
        @return boolean True if all filter criteria succeed, false otherwise

        """

        # Block out repeated hits.
        # THIS ASSUMES the BLASTn XML file lists the BEST HIT FIRST. Only appropriate for searching for single hits within a reference sequence.
        if self.drop_redundant_hits:
            # parsing accession id from e.g. gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]
            #acc = str(record.sseqid.split('|')[3:4]).strip()
            key = record.qseqid + '-' + record.accessionid #acc
            if key in self.matches:
                return False
            self.matches[key] = True

        for key, constraints in self.dict.items():
            try:
                # The .loc table of fields has fieldnames without a leading underscore.
                # Such fields are assumed to be added by code;
                # leading-underscore fields are raw values read directly from the XML file.
                # cleanField() has already resolved each filter name to whichever variant exists.
                value = getattr(record, key)
                for comparator, userValue in constraints:
                    # print "constraint " + str(value) + comparator + str(userValue) + " -> " + \
                    #     str(self.comparators[comparator](value, userValue))
                    if not self.comparators[comparator](value, userValue):
                        return False #failed a constraint
            except AttributeError:
                print 'A filter on field [' + key + '] was requested, but this field does not exist.'
                raise KeyError

        return True

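# A minimal usage sketch (hypothetical names): records that fail any constraint,
# or that repeat a query/accession pair when drop_redundant_hits is set, are dropped:
#
#   field_filter = FieldFilter(tagGroup, options)
#   kept = [rec for rec in records if field_filter.process(rec)]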


class FieldSpec(object):

    def __init__(self, file_path, columns_in = []):
        """ READ FIELD SPECIFICATIONS of a particular galaxy tool form/process from a .loc 'tabular data' file

        Example blast_reporting_fields.tab file:

        #value   type     subtype  sort  filter  default  min  max  choose  name
        # Remember to edit tool_data_table_conf.xml for column spec!
        qseqid   numeric  int      1     1                          1       Query Seq-id
        sseqid   numeric  int      1     1                          1       Subject Seq-id
        pident   numeric  float    1     1       97       90   100  1       Percentage of identical matches

        - value is the name of the field: alphanumeric strings only.
        - type is 'text' or 'numeric' or 'bin'
        - subtype, where applicable, indicates a further validation function
        - sort indicates if the field should be provided in the sort menu
        - filter indicates if the field should be in the menu of fields that can be filtered
        - default is the default value the field should have if drawn on a form
        - min is the minimum of the field's range
        - max is the maximum of the field's range
        - choose indicates if the field can be chosen for an output column (some are mandatory / some are to be avoided?)
        - name is the textual name of the field as it should appear on pulldown menus

        @param file_path string full name and path of .loc file containing pertinent field names and their specifications.
        @result .dict dictionary

        """
        self.dict = {}
        self.columns_in = columns_in

        with open(file_path, 'rb') as f:
            reader = csv.DictReader(f, delimiter='\t') #1st row read as field name header by default
            try:
                for row in reader:
                    myKey = row['#value']
                    # Some lines begin with '#' for value. Ignore them.
                    # Also, reader has read column names from the first row; "#value" is the name of the first column.
                    if not myKey[0] == '#': # 1st character is not a hash
                        row.pop("#value", None)
                        self.dict[myKey] = row
                        # self.dict[myKey]['value']=row['#value'] # If we need this ever?

            except csv.Error as e:
                stop_err('file %s, line %d: %s' % (file_path, reader.line_num, e))

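    # A sketch of one parsed entry for the sample .loc row above (all values are
    # strings, keyed by the header row; assumes the header shown in the docstring):
    #   self.dict['pident'] == {'type': 'numeric', 'subtype': 'float', 'sort': '1',
    #       'filter': '1', 'default': '97', 'min': '90', 'max': '100',
    #       'choose': '1', 'name': 'Percentage of identical matches'}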
    def initColumns(self, columns_out, custom_columns):
        """
        Augment columns with the fieldSpec label and some sorting defaults.
        Default sorts: qseqid is marked as sorted asc; score as sorted desc.
        No need to move sorted fields around.
        This basically creates the spec used to generate the tab-delimited file.
        The only other calculation done for that is the row_limit cut, if any.
        """
        column_spec = list(columns_out)
        for (i, spec) in enumerate(column_spec):
            spec_field = spec.lstrip("_")

            if spec_field == 'qseqid':
                sort = 'asc'
                group = 'section'
            elif spec_field == 'score':
                sort = 'desc'
                group = 'column'
            else:
                sort = ''
                group = 'column'

            field = {
                'field': spec,
                'group': group,
                'sort': sort,
                'label': self.getAttribute(spec_field, 'short_name'),
                'type': self.getAttribute(spec_field, 'type')
            }
            column_spec[i] = field

248 """ | |
249 # For the HTML (OR XML) report we allow users to specify columns of data to represent sections of the report or table sections. | |
250 # Selected columns either enhance an existing column's info, or add a new column. | |
251 # If a selected column is sorted, it is inserted/moved to after last SORTED column in data. | |
252 # In other words, primary/secondary etc sorting is preserved. | |
253 """ | |
        if custom_columns is not None:

            custom_spec = [x.strip() for x in custom_columns.split(';')]
            for spec in custom_spec:
                params = [i.strip() for i in spec.rstrip(":").split(":")]
                parlen = len(params)
                if parlen > 0 and params[0] != '':
                    field_name = cleanField(self.columns_in, params[0]) # Halts if it finds a field mismatch

                    group = 'column'
                    sort = ''
                    if parlen > 1 and params[1] in ['column','hidden','table','section']: group = params[1]
                    if parlen > 2 and params[2] in ['asc','desc']: sort = params[2]

                    # Enforce sort on section and table items....

                    # All self.column_spec have a fieldspec entry. Get the default label from there.
                    # HOW TO HANDLE CALCULATED FIELD LABELS? ENSURE THEY HAVE ENTRIES?
                    spec_field = field_name.lstrip("_")
                    label = self.getAttribute(spec_field, 'short_name')
                    if parlen > 3: label = params[3]

                    field = {
                        'field': field_name,
                        'group': group,
                        'sort': sort,
                        'label': label,
                        'type': self.getAttribute(spec_field, 'type')
                    }

                    # If the field is a 'section', move it right after the last existing 'section' (if not matched);
                    # if it is a 'table', move it after the last existing 'table' (if not matched);
                    # otherwise append it to the column list (if not matched).

                    found = False # once found == True, the rest of the loop looks for an existing mention of the field, and removes it.
                    for (ptr, target) in enumerate(column_spec):

                        found_name = spec_field == target['field'].lstrip("_")
                        if found == True:
                            if found_name: # Found duplicate name
                                del column_spec[ptr]
                                break
                        elif found_name:
                            found = True
                            column_spec[ptr] = field # Overwrite spec.
                            break

                        elif field['group'] == 'section':
                            if target['group'] != 'section': # time to insert section
                                found = True
                                column_spec.insert(ptr, field)

                        elif field['group'] == 'table':
                            if target['group'] == 'column' or target['group'] == 'hidden':
                                found = True
                                column_spec.insert(ptr, field)

                    if found == False: # didn't find a place for the field above.
                        column_spec.append(field)
                    # print ("col spec: " + str(column_spec))

        return column_spec
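
    # A usage sketch (hypothetical field names): each custom_columns entry is
    # "[field]:[column|hidden|table|section]:[asc|desc]:[label]", ";"-separated, e.g.
    #   spec = field_spec.initColumns(['qseqid', 'pident', 'score'],
    #       'pident:column:desc:Identity %')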

    def getAttribute(self, fieldName, attribute):
        """ Retrieve attribute of a given field

        @param fieldName string
        @param attribute string
        @return string value of attribute

        """
        return self.dict[fieldName][attribute]


def word_replace_all(text, dictionary):
    textArray = re.split(r'(\W+)', text) #Workaround: split() alone won't separate words from adjacent punctuation.
    for ptr, w in enumerate(textArray):
        if w in dictionary: textArray[ptr] = dictionary[w]
    return ''.join(textArray)
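
# Example (hypothetical input): word_replace_all("pident gte 97", {'gte': '>='})
# returns "pident >= 97"; only whole tokens are replaced, so "gteq" stays as-is.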


def cleanField(columns_in, field_name, msg = 'Not a valid field name'):

    if not field_name.replace('_','').isalnum():
        stop_err(msg + ': [' + field_name + ']')
    if field_name in columns_in:
        clean = field_name
    elif '_' + field_name in columns_in: #passed from source file
        clean = '_' + field_name
    else: #column not found here
        stop_err(msg + ': ' + field_name + ' - no such field')
    return clean
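
# Example (hypothetical columns): with columns_in = ['qseqid', '_sseq'],
# cleanField(columns_in, 'sseq') returns '_sseq' (the raw-source variant), while
# cleanField(columns_in, 'qseqid') returns 'qseqid'; any other name halts with stop_err.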


def fileSort(out_file, fields):
    """
    fileSort() uses the linux "sort" command to handle the possibility of giant file sizes.
    The fields to sort on arrive in the fields parameter (one dict per column,
    with 'field', 'sort' and 'label' keys).

    Matches the sort-flagged fields to columns to produce the -k[col],[col] parameters that start and end sorting.
    Note that sort takes in columns with the primary listed first, then secondary, etc.
    Note that the file to be sorted can't have 1st-line column headers.

    sort attributes:

        -f ignore case
        -r reverse (i.e. descending)
        -n numeric
        -k[start col],[end col] range of text that sort will be performed on
        -s stabilize sort: "If checked, this will stabilize sort by disabling its last-resort comparison so that lines in which all fields compare equal are left in their original relative order." Note, this might not be available on all linux flavours?
        -V sorts numbers within text - if a number is leading then the field is essentially treated as numeric. This means we don't have to specify -n for numeric fields in particular.

    Note: some attention may need to be given to locale settings for command line sort.
    May need to set export LC_ALL=C or export LANG=C to ensure the same results on all systems.

    @param out_file string File path of file to resort
    @param fields array Column dicts; each entry's 'sort' value ('asc'/'desc'/'') decides whether and how its column is sorted. Each field name is validated as an alphanumeric word + underscores.
    """

    sortparam = []
    for colPtr, field in enumerate(fields):
        if field['sort']:
            field_name = field['field']
            if not field_name.replace('_','').isalnum():
                stop_err("Invalid field to sort on: " + field_name)

            #print "sort term:" + field_name + ":" + str(field)
            ordering = '' if field['sort'] == "asc" else 'r'
            column = str(colPtr+1)
            # V sorts numbers AND text (check the server's version of sort)
            sortparam.append('-k' + column + 'V' + ordering + ',' + column)

    if len(sortparam) > 0:
        args = ['sort','-s','-f','-V','-t\t'] + sortparam + ['-o' + out_file, out_file]
        subprocess.call(args)

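# For illustration (hypothetical columns): with fields for qseqid (asc), pident
# (no sort) and score (desc), the call built above is roughly:
#   sort -s -f -V -t'<tab>' -k1V,1 -k3Vr,3 -o out_file out_file
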

def fileTabular(in_file, tagGroup, options):
    """ Produces tabular report format. Takes in tabular data + metainformation about that file, and iterates through rows. Not a query-based approach.
    It trims off the sort-only columns (prelim - final).
    It optionally adds a column label header. (Not done in fileSort() because it would get mixed into the sort there.)
    NOTE: RUN THIS AFTER fileHTML() BECAUSE IT MAY TRIM FIELDS THAT THE HTML REPORT NEEDS.

    @param in_file string Full file path
    @param tagGroup object Includes prelim_columns, final_columns
    @param options object Includes label_flag and row_limit

    """
    fp_in = open(in_file, "rb")
    fp_out = open(in_file + '.tmp', 'wb')

    try:

        reader = csv.reader(fp_in, delimiter="\t")
        writer = csv.writer(fp_out, delimiter="\t")

        # WRITE TABULAR HEADER
        if options.column_labels: # options.column_labels in ['name','field']:
            if options.column_labels == 'label':
                tabHeader = [field['label'] for field in tagGroup.columns]
            else:
                # Tabular data header: strip leading underscores off of any labels...
                tabHeader = [field['field'].lstrip('_') for field in tagGroup.columns]

            writer.writerow(tabHeader)

        for row in reader:

            rowdata = []
            for (idx, field) in enumerate(tagGroup.columns): # Exclude hidden columns here?
                rowdata.append(row[idx])
            writer.writerow(rowdata)

        move(in_file + '.tmp', in_file) # Overwrites in_file

    except IOError as e:
        print 'Operation failed: %s' % e.strerror

    fp_in.close()
    fp_out.close()



def fileSelections(in_file, selection_file, tagGroup, options):
    """ Produces selection report format.
    For the selection file we need: qseqid, qseq, sseqid, sseq, and a grouping #.

    @param in_file string Full file path
    @param tagGroup object Includes prelim_columns, final_columns
    @param options object Includes label_flag and row_limit

    """
    fp_in = open(in_file, "rb")

    if selection_file == 'None': # Galaxy passes the literal string 'None' when no selection file is wanted
        fp_in.close()
        return
    fp_out = open(selection_file, 'w')

    try:

        reader = csv.reader(fp_in, delimiter="\t")
        writer = csv.writer(fp_out, delimiter="\t")

        for (idx, field) in enumerate(tagGroup.columns):
            fieldname = field['field']
            if fieldname == 'qseqid': qseqid_col = idx
            elif fieldname == '_qseq': qseq_col = idx
            elif fieldname == 'sseqid': sseqid_col = idx
            elif fieldname == '_sseq': sseq_col = idx

        selectrow_count = 0
        grouping = -1
        old_section = ''
        for row in reader:

            selectrow_count += 1
            if row[qseqid_col] != old_section:
                old_section = row[qseqid_col]
                grouping += 1
                writer.writerow([row[qseqid_col], row[qseq_col], grouping, selectrow_count])
                selectrow_count += 1

            writer.writerow([row[sseqid_col], row[sseq_col], grouping, selectrow_count])

    except IOError as e:
        print 'Operation failed: %s' % e.strerror

    fp_in.close()
    fp_out.close()
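
# Layout sketch of the selection file produced above: one
#   [qseqid, qseq, grouping, row#]
# line per new query section, then one
#   [sseqid, sseq, grouping, row#]
# line for every hit, so selections can be mapped back to their query.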

def testSuite(test_ids, tests, output_dir):

    if test_ids == 'all':
        test_ids = sorted(tests.keys())
    else:
        test_ids = test_ids.split(',')

    for test_id in test_ids:
        if test_id in tests:
            test = tests[test_id]
            test['base_dir'] = os.path.dirname(__file__)
            # Each output file has to be prefixed with the output folder
            test['tmp_output'] = (' ' + test['outputs']).replace(' ', ' ' + output_dir)
            # Note: output_dir output files don't get cleaned up after each test. Should they?!
            params = '%(base_dir)s/blast_reporting.py %(base_dir)s/test-data/%(input)s%(tmp_output)s %(options)s' % test
            print("Testing " + test_id + ': ' + params)
            os.system(params)
            for file in test['outputs'].split(' '):
                #print(os.system('diff --suppress-common-lines ./test-data/%s %s%s' % (file, output_dir, file)))
                f1 = open(test['base_dir'] + '/test-data/' + file)
                f2 = open(output_dir + file)
                diff = difflib.context_diff(f1.readlines(), f2.readlines(), lineterm='', n=0) # n = number of context lines
                # One Galaxy issue: it doesn't convert entities when the user downloads a file, BUT IT DOES when generating directly to the command line?
                print '\nCompare ' + file
                print '\n'.join(list(diff))

        else:
            stop_err("\nExpecting one or more test ids from " + str(sorted(tests.keys())))

    stop_err("\nTest finished.")
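
# A usage sketch (hypothetical test table): each test supplies 'input', 'outputs'
# and 'options'; testSuite() runs blast_reporting.py on it and context-diffs each
# produced file against its reference copy in test-data/, e.g.
#   tests = {'1': {'input': 'blastxml_sample.xml', 'outputs': 'report.tsv',
#                  'options': '-f "pident: gte 97"'}}
#   testSuite('1', tests, '/tmp/test_output/')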