Mercurial > repos > pieterlukasse > prims_metabolomics
changeset 21:19d8fd10248e
* Added interface to METEXP data store, including tool to fire queries in batch mode
* Improved quantification output files of MsClust, among other things
by sorting the mass list based on intensity (last two columns of the
quantification files)
* Added Molecular Mass calculation method
author | pieter.lukasse@wur.nl |
---|---|
date | Wed, 05 Mar 2014 17:20:11 +0100 |
parents | 24fb75fedee0 |
children | cd4f13119afa |
files | MsClust.jar README.rst combine_output.py export_to_metexp_tabular.py export_to_metexp_tabular.xml msclust.xml query_metexp.py query_metexp.xml rankfilterGCMS_tabular.xml static_resources/elements_and_masses.tab test/test_export_to_metexp_tabular.py test/test_query_metexp.py test/test_query_metexp_LARGE.py |
diffstat | 13 files changed, 826 insertions(+), 33 deletions(-) [+] |
line wrap: on
line diff
--- a/README.rst Tue Feb 11 12:29:50 2014 +0100 +++ b/README.rst Wed Mar 05 17:20:11 2014 +0100 @@ -19,6 +19,11 @@ ============== ====================================================================== Date Changes -------------- ---------------------------------------------------------------------- +March 2014 * Added interface to METEXP data store, including tool to fire + queries in batch mode + * Improved quantification output files of MsClust, a.o. sorting + mass list based on intensity (last two columns of quantification + files) January 2014 * first release via Tool Shed, combining the RIQC and MsClust in a single package (this package) * integration with METEXP software (data store for metabolomics
--- a/combine_output.py Tue Feb 11 12:29:50 2014 +0100 +++ b/combine_output.py Wed Mar 05 17:20:11 2014 +0100 @@ -155,12 +155,16 @@ @param data: dictionary containing merged dataset @param out_csv: output csv file ''' - header = ['Centrotype', + # Columns we don't repeat: + header_part1 = ['Centrotype', 'cent.Factor', 'scan nr.', 'R.T. (umin)', 'nr. Peaks', - 'R.T.', + 'R.T.'] + # These are the headers/columns we repeat in case of + # combining hits in one line (see alternative_headers method below): + header_part2 = [ 'Name', 'FORMULA', 'Library', @@ -190,13 +194,21 @@ output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t") # Write headers - output_single_handle.writerow(header) - output_multi_handle.writerow(header * nhits) + output_single_handle.writerow(header_part1 + header_part2) + output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1)) # Combine all hits for each centrotype into one line line = [] for centrotype_idx in xrange(len(data)): + i = 0 for hit in data[centrotype_idx]: - line.extend(hit) + if i==0: + line.extend(hit) + else: + line.extend(hit[6:]) + i = i+1 + # small validation (if error, it is a programming error): + if i > nhits: + raise Exception('Error: more hits than expected for centrotype_idx ' + centrotype_idx) output_multi_handle.writerow(line) line = [] @@ -205,6 +217,17 @@ for hit in data[centrotype_idx]: output_single_handle.writerow(hit) +def alternative_headers(header_part2, nr_alternative_hits): + ''' + This method will iterate over the header names and add the string 'ALT#_' before each, + where # is the number of the alternative, according to number of alternative hits we want to add + to final csv/tsv + ''' + result = [] + for i in xrange(nr_alternative_hits): + for header_name in header_part2: + result.append("ALT" + str(i+1) + "_" + header_name) + return result def main(): '''
--- a/export_to_metexp_tabular.py Tue Feb 11 12:29:50 2014 +0100 +++ b/export_to_metexp_tabular.py Wed Mar 05 17:20:11 2014 +0100 @@ -5,17 +5,18 @@ into a tabular file that can be uploaded to the MetExp database. RankFilter, CasLookup are already combined by combine_output.py so here we will use -this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust -quantification files are to be combined with combine_output.py result as well. +this result. Furthermore here one of the MsClust +quantification files containing the respective spectra details are to be combined as well. Extra calculations performed: - The column MW is also added here and is derived from the column FORMULA found - in combine_output.py result. + in RankFilter, CasLookup combined result. -So in total here we merge 3 files and calculate one new column. +So in total here we merge 2 files and calculate one new column. ''' - +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 import csv +import re import sys from collections import OrderedDict @@ -40,14 +41,15 @@ ONE_TO_ONE = 'one_to_one' N_TO_ONE = 'n_to_one' -def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, relation_type=ONE_TO_ONE): +def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE): ''' Merges data from both input dictionaries based on the link fields. This method will build up a new list containing the merged hits as the items. 
@param set1: dictionary holding set1 in the form of N lists (one list per attribute name) @param set2: dictionary holding set2 in the form of N lists (one list per attribute name) ''' - # TODO test for correct input files -> same link_field values should be there (test at least number of unique link_field values): + # TODO test for correct input files -> same link_field values should be there + # (test at least number of unique link_field values): # # if (len(set1[link_field_set1]) != len(set2[link_field_set2])): # raise Exception('input files should have the same nr of key values ') @@ -64,17 +66,23 @@ # Get the indices for current link_field_set1_value in both data-structures for proper matching set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value] set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ] - - + # Validation : + if len(set2index) == 0: + # means that corresponding data could not be found in set2, then throw error + raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" + + link_field_set1_value + " only found in first dataset. ") merged_hits = [] # Combine hits for hit in xrange(len(set1index)): # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its - # corresponding value in the rankfilter or caslookup tables; i.e. - # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute - # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index) + # corresponding value in the sets; i.e. + # set1[key] => returns the list/array with size = nrrows, with the values for the attribute + # represented by "key". 
+ # set1index[hit] => points to the row nr=hit (hit is a rownr/index) + # So set1[x][set1index[n]] = set1.attributeX.instanceN + # # It just ensures the entry is made available as a plain named array for easy access. rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()])) if relation_type == ONE_TO_ONE : @@ -83,7 +91,7 @@ # is N to 1: cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()])) - merged_hit = merge_function(rf_record, cl_record) + merged_hit = merge_function(rf_record, cl_record, metadata) merged_hits.append(merged_hit) merged.append(merged_hits) @@ -103,29 +111,62 @@ -def _merge_records(rank_caslookup_combi, msclust_quant_record): +def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata): ''' Combines single records from both the RankFilter+CasLookup combi file and from MsClust file @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py) @param msclust_quant_record: msclust quantification + spectrum record ''' - i = 0 record = [] for column in rank_caslookup_combi: record.append(rank_caslookup_combi[column]) - i += 1 for column in msclust_quant_record: record.append(msclust_quant_record[column]) - i += 1 + + for column in metadata: + record.append(metadata[column]) + + # add MOLECULAR MASS (MM) + molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA']) + # limit to two decimals: + record.append("{0:.2f}".format(molecular_mass)) + + # add MOLECULAR WEIGHT (MW) - TODO - calculate this + record.append('0.0') + + # level of identification and Location of reference standard + record.append('0') + record.append('') return record - +def get_molecular_mass(formula): + ''' + Calculates the molecular mass (MM). + E.g. 
MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O + ''' + + # Each element is represented by a capital letter, followed optionally by + # lower case, with one or more digits as for how many elements: + element_pattern = re.compile("([A-Z][a-z]?)(\d*)") -def _save_data(data, headers, nhits, out_csv): + total_mass = 0 + for (element_name, count) in element_pattern.findall(formula): + if count == "": + count = 1 + else: + count = int(count) + element_mass = float(elements_and_masses_map[element_name]) # "found: Python's built-in float type has double precision " (? check if really correct ?) + total_mass += element_mass * count + + return total_mass + + + +def _save_data(data, headers, out_csv): ''' Writes tab-separated data to file @param data: dictionary containing merged dataset @@ -139,12 +180,35 @@ # Write headers output_single_handle.writerow(headers) - # Write one line for each centrotype - for centrotype_idx in xrange(len(data)): - for hit in data[centrotype_idx]: + # Write + for item_idx in xrange(len(data)): + for hit in data[item_idx]: output_single_handle.writerow(hit) +def _get_map_for_elements_and_masses(elements_and_masses): + ''' + This method will read out the column 'Chemical symbol' and make a map + of this, storing the column 'Relative atomic mass' as its value + ''' + resultMap = {} + index = 0 + for entry in elements_and_masses['Chemical symbol']: + resultMap[entry] = elements_and_masses['Relative atomic mass'][index] + index += 1 + + return resultMap + + +def init_elements_and_masses_map(): + ''' + Initializes the lookup map containing the elements and their respective masses + ''' + elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab")) + global elements_and_masses_map + elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses) + + def main(): ''' Combine Output main function @@ -156,15 +220,27 @@ rankfilter_and_caslookup_combined_file = sys.argv[1] 
msclust_quantification_and_spectra_file = sys.argv[2] output_csv = sys.argv[3] + # metadata + metadata = OrderedDict() + metadata['organism'] = sys.argv[4] + metadata['tissue'] = sys.argv[5] + metadata['experiment_name'] = sys.argv[6] + metadata['user_name'] = sys.argv[7] + metadata['column_type'] = sys.argv[8] # Read RankFilter and CasLookup output files rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file) msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',') + # Read elements and masses to use for the MW/MM calculation : + init_elements_and_masses_map() + merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype', - msclust_quantification_and_spectra, 'centrotype', _compare_records, _merge_records, N_TO_ONE) - headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() - _save_data(merged, headers, nhits, output_csv) + msclust_quantification_and_spectra, 'centrotype', + _compare_records, _merge_records, metadata, + N_TO_ONE) + headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard'] + _save_data(merged, headers, output_csv) if __name__ == '__main__':
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/export_to_metexp_tabular.xml Wed Mar 05 17:20:11 2014 +0100 @@ -0,0 +1,57 @@ +<tool id="export_to_metexp_tabular" + name="METEXP - Tabular file" + version="0.1.0"> + <description>Create tabular file for loading into METabolomics EXPlorer database</description> + <command interpreter="python"> + export_to_metexp_tabular.py $rankfilter_and_caslookup_combi $msclust_quant_file $output_result + $organism $tissue $experiment_name $user_name $column_type + </command> + <inputs> + <param format="tabular" name="rankfilter_and_caslookup_combi" type="data" label="RIQC-Combine RankFilter and CasLookup output" + help="Select the (multi) output file from the 'Combine RankFilter and CasLookup' tool"/> + <param format="tabular" name="msclust_quant_file" type="data" label="MsClust quantification file output" + help="Select the output file from MsClust (centrotype, mic or sim) which also contains the respective spectrum details"/> + + + <param name="organism" type="text" size="80" + label="Organism(s) info" + help="Metadata information to accompany the results when stored in MetExp DB." /> + <param name="tissue" type="text" size="80" + label="Tissue(s) info" + help="Metadata information to accompany the results when stored in MetExp DB." /> + + <param name="experiment_name" type="text" size="80" + label="Experiment name/code" + help="Name or code to store the results under. This can help you find the results back in MetExpDB." /> + + <param name="user_name" type="text" size="80" + label="User name" + help="User name or code to store the results under. This can help you find the results back in MetExpDB." /> + + <param name="column_type" type="text" size="80" + label="Column type" + help="Column type to report with the results. This can help you find the results back in MetExpDB." /> + + </inputs> + <outputs> + <data format="tabular" label="${tool.name} on ${on_string}" name="output_result" /> + </outputs> + <help> +..
class:: infomark + +Tool to combine output from the tools RankFilter, CasLookup and MsClust +into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database. + +RankFilter, CasLookup are already combined by 'RIQC-Combine RankFilter and CasLookup' tool so here we will use +this result. + +**Notes** + +Extra calculations performed: +- The columns MM and MW are also added here and are derived from the column FORMULA found in RankFilter, CasLookup combined result. + +So in total here we merge 2 files and calculate one new column. + + + </help> +</tool>
--- a/msclust.xml Tue Feb 11 12:29:50 2014 +0100 +++ b/msclust.xml Wed Mar 05 17:20:11 2014 +0100 @@ -1,4 +1,4 @@ -<tool name="MsClust" id="msclust2" version="2.0.2"> +<tool name="MsClust" id="msclust2" version="2.0.3"> <description>Extracts fragmentation spectra from aligned data</description> <!-- For remote debugging start you listener on port 8000 and use the following as command interpreter:
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_metexp.py Wed Mar 05 17:20:11 2014 +0100 @@ -0,0 +1,273 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +Module to query a set of identifications against the METabolomics EXPlorer database. + +It will take the input file and for each record it will query the +molecular mass in the selected MetExp DB. If one or more compounds are found in the +MetExp DB then extra information regarding these compounds is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected MetExp DB. +''' +import csv +import sys +import fileinput +import urllib2 +from collections import OrderedDict + +__author__ = "Pieter Lukasse" +__contact__ = "pieter.lukasse@wur.nl" +__copyright__ = "Copyright, 2014, Plant Research International, WUR" +__license__ = "Apache v2" + +def _process_file(in_xsv, delim='\t'): + ''' + Generic method to parse a tab-separated file returning a dictionary with named columns + @param in_csv: input filename to be parsed + ''' + data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim)) + return _process_data(data) + +def _process_data(data): + + header = data.pop(0) + # Create dictionary with column name as key + output = OrderedDict() + for index in xrange(len(header)): + output[header[index]] = [row[index] for row in data] + return output + + +def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method): + ''' + This method will iterate over the record in the input_data and + will enrich them with the related information found (if any) in the + MetExp Database. 
+ ''' + merged = [] + + for i in xrange(len(input_data[input_data.keys()[0]])): + # Get the record in same dictionary format as input_data, but containing + # a value at each column instead of a list of all values of all records: + input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()])) + + # read the molecular mass and formula: + cas_id = input_data_record[casid_col] + formula = input_data_record[formula_col] + molecular_mass = input_data_record[molecular_mass_col] + + # search for related records in MetExp: + data_found = None + if cas_id != "undef": + # 1- search for other experiments where this CAS id has been found: + query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "CAS" + if data_found == None: + # 2- search for other experiments where this FORMULA has been found: + query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "FORMULA" + if data_found == None: + # 3- search for other experiments where this MM has been found: + query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "MM" + + if data_found == None: + # If still nothing found, just add empty columns + extra_cols = ['', '','','','','','',''] + else: + # Add info found: + extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link) + + # Take all data and merge it into a "flat"/simple array of values: + field_values_list = _merge_data(input_data_record, extra_cols) + + merged.append(field_values_list) + + # return the merged/enriched records: + return merged + + +def 
_get_extra_info_and_link_cols(data_found, data_type_found, query_link): + ''' + This method will go over the data found and will return a + list with the following items: + - Experiment details where hits have been found : + 'organism', 'tissue','experiment_name','user_name','column_type' + - Link that executes same query + + ''' + # set() makes a unique list: + organism_set = [] + tissue_set = [] + experiment_name_set = [] + user_name_set = [] + column_type_set = [] + cas_nr_set = [] + + if 'organism' in data_found: + organism_set = set(data_found['organism']) + if 'tissue' in data_found: + tissue_set = set(data_found['tissue']) + if 'experiment_name' in data_found: + experiment_name_set = set(data_found['experiment_name']) + if 'user_name' in data_found: + user_name_set = set(data_found['user_name']) + if 'column_type' in data_found: + column_type_set = set(data_found['column_type']) + if 'CAS' in data_found: + cas_nr_set = set(data_found['CAS']) + + + result = [data_type_found, + _to_xsv(organism_set), + _to_xsv(tissue_set), + _to_xsv(experiment_name_set), + _to_xsv(user_name_set), + _to_xsv(column_type_set), + _to_xsv(cas_nr_set), + #To let Excel interpret as link, use e.g. 
=HYPERLINK("http://stackoverflow.com", "friendly name"): + "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"] + return result + + +def _to_xsv(data_set): + result = "" + for item in data_set: + result = result + str(item) + "|" + return result + + +def _fire_query_and_return_dict(url): + ''' + This method will fire the query as a web-service call and + return the results as a list of dictionary objects + ''' + + try: + data = urllib2.urlopen(url).read() + + # transform to dictionary: + result = [] + data_rows = data.split("\n") + + # check if there is any data in the response: + if len(data_rows) <= 1 or data_rows[1].strip() == '': + # means there is only the header row...so no hits: + return None + + for data_row in data_rows: + if not data_row.strip() == '': + row_as_list = _str_to_list(data_row, delimiter='\t') + result.append(row_as_list) + + # return result processed into a dict: + return _process_data(result) + + except urllib2.HTTPError, e: + raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason) + except urllib2.URLError, e: + raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ") + +def _str_to_list(data_row, delimiter='\t'): + result = [] + for column in data_row.split(delimiter): + result.append(column) + return result + + +# alternative: ? +# s = requests.Session() +# s.verify = False +# #s.auth = (token01, token02) +# resp = s.get(url, params={'name': 'anonymous'}, stream=True) +# content = resp.content +# # transform to dictionary: + + + + +def _merge_data(input_data_record, extra_cols): + ''' + Adds the extra information to the existing data record and returns + the combined new record. 
+ ''' + record = [] + for column in input_data_record: + record.append(input_data_record[column]) + + + # add extra columns + for column in extra_cols: + record.append(column) + + return record + + +def _save_data(data_rows, headers, out_csv): + ''' + Writes tab-separated data to file + @param data_rows: dictionary containing merged/enriched dataset + @param out_csv: output csv file + ''' + + # Open output file for writing + outfile_single_handle = open(out_csv, 'wb') + output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") + + # Write headers + output_single_handle.writerow(headers) + + # Write one line for each row + for data_row in data_rows: + output_single_handle.writerow(data_row) + +def _get_metexp_URL(metexp_dblink_file): + ''' + Read out and return the URL stored in the given file. + ''' + file_input = fileinput.input(metexp_dblink_file) + try: + for line in file_input: + if line[0] != '#': + # just return the first line that is not a comment line: + return line + finally: + file_input.close() + + +def main(): + ''' + MetExp Query main function + + The input file can be any tabular file, as long as it contains a column for the molecular mass + and one for the formula of the respective identification. These two columns are then + used to query against MetExp Database. 
+ ''' + input_file = sys.argv[1] + casid_col = sys.argv[2] + formula_col = sys.argv[3] + molecular_mass_col = sys.argv[4] + metexp_dblink_file = sys.argv[5] + separation_method = sys.argv[6] + output_result = sys.argv[7] + + # Parse metexp_dblink_file to find the URL to the MetExp service: + metexp_dblink = _get_metexp_URL(metexp_dblink_file) + + # Parse tabular input file into dictionary/array: + input_data = _process_file(input_file) + + # Query data against MetExp DB : + enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method) + headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues', + 'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits'] + + _save_data(enriched_data, headers, output_result) + + +if __name__ == '__main__': + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_metexp.xml Wed Mar 05 17:20:11 2014 +0100 @@ -0,0 +1,67 @@ +<tool id="query_metexp" + name="METEXP - Query Database " + version="0.1.0"> + <description>Query a set of identifications against the METabolomics EXPlorer database</description> + <command interpreter="python"> + query_metexp.py + $input_file + $casid_col + $formula_col + $molecular_mass_col + "$metexp_dblink_file" + $separation_method + $output_result + </command> + <inputs> + + <param name="input_file" format="tabular" type="data" + label="Input file" + help="Select a tabular file containing the entries to be queried/verified in the MetExp DB"/> + + <param name="casid_col" type="text" size="50" + label="CAS ID column name" + value="CAS" + help="Name of the column containing the CAS code information (in the given input file)" /> + <param name="formula_col" type="text" size="50" + label="Formula ID column name" + value="FORMULA" + help="Name of the column containing the formula information (in the given input file)" /> + <param name="molecular_mass_col" type="text" size="50" + label="Molecular mass column name" + value="MM" + help="Name of the column containing the molecular mass information (in the given input file)" /> + + <param name="metexp_dblink_file" type="select" label="MetExp DB to query" + help="Select the MetExp Database/backend which should be queried" + dynamic_options='get_directory_files("tool-data/shared/PRIMS-metabolomics/MetExp_Databases")'/> + + <param name="separation_method" type="select" label="Data type to query"> + <option value="GC" selected="True">GC</option> + <option value="LC">LC</option> + </param> + + </inputs> + <outputs> + <data name="output_result" format="tabular" label="${tool.name} on ${on_string}" /> + </outputs> + <code file="match_library.py" /> <!-- file containing get_directory_files function used above--> + <help> +.. 
class:: infomark + +This tool will Query a set of identifications against the METabolomics EXPlorer database. + +It will take the input file and for each record it will query the +molecular mass in the selected MetExp DB. If one or more compounds are found in the +MetExp DB then extra information regarding these compounds is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected MetExp DB. + +**Notes** + +The input file can be any tabular file, as long as it contains a column for the molecular mass +and one for the formula of the respective identification. + + + </help> +</tool>
--- a/rankfilterGCMS_tabular.xml Tue Feb 11 12:29:50 2014 +0100 +++ b/rankfilterGCMS_tabular.xml Wed Mar 05 17:20:11 2014 +0100 @@ -3,7 +3,7 @@ <command interpreter="python">rankfilter_GCMS/rankfilter.py $input_file</command> <inputs> <param format="tabular" name="sample" type="data" label="Sample File" - help="Converted PDF file in tabular format" /> + help="Select a tab delimited NIST metabolite identifications file (converted from PDF)" /> <!-- question: is this calibration file not column specific as it includes RT info?? --> <!-- this one should be input file for now:<param name="calibration" type="select" label="Calibration File" help="Calibration file with reference masses (e.g. alkanes) with their RT and RI values"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/static_resources/elements_and_masses.tab Wed Mar 05 17:20:11 2014 +0100 @@ -0,0 +1,104 @@ +Name Atomic number Chemical symbol Relative atomic mass +Hydrogen 1 H 1.01 +Helium 2 He 4 +Lithium 3 Li 6.94 +Beryllium 4 Be 9.01 +Boron 5 B 10.81 +Carbon 6 C 12.01 +Nitrogen 7 N 14.01 +Oxygen 8 O 16 +Fluorine 9 F 19 +Neon 10 Ne 20.18 +Sodium 11 Na 22.99 +Magnesium 12 Mg 24.31 +Aluminum 13 Al 26.98 +Silicon 14 Si 28.09 +Phosphorus 15 P 30.98 +Sulfur 16 S 32.06 +Chlorine 17 Cl 35.45 +Argon 18 Ar 39.95 +Potassium 19 K 39.1 +Calcium 20 Ca 40.08 +Scandium 21 Sc 44.96 +Titanium 22 Ti 47.9 +Vanadium 23 V 50.94 +Chromium 24 Cr 52 +Manganese 25 Mn 54.94 +Iron 26 Fe 55.85 +Cobalt 27 Co 58.93 +Nickel 28 Ni 58.71 +Copper 29 Cu 63.54 +Zinc 30 Zn 65.37 +Gallium 31 Ga 69.72 +Germanium 32 Ge 72.59 +Arsenic 33 As 74.99 +Selenium 34 Se 78.96 +Bromine 35 Br 79.91 +Krypton 36 Kr 83.8 +Rubidium 37 Rb 85.47 +Strontium 38 Sr 87.62 +Yttrium 39 Y 88.91 +Zirconium 40 Zr 91.22 +Niobium 41 Nb 92.91 +Molybdenum 42 Mo 95.94 +Technetium 43 Tc 96.91 +Ruthenium 44 Ru 101.07 +Rhodium 45 Rh 102.9 +Palladium 46 Pd 106.4 +Silver 47 Ag 107.87 +Cadmium 48 Cd 112.4 +Indium 49 In 114.82 +Tin 50 Sn 118.69 +Antimony 51 Sb 121.75 +Tellurium 52 Te 127.6 +Iodine 53 I 126.9 +Xenon 54 Xe 131.3 +Cesium 55 Cs 132.9 +Barium 56 Ba 137.34 +Lanthanum 57 La 138.91 +Cerium 58 Ce 140.12 +Praseodymium 59 Pr 140.91 +Neodymium 60 Nd 144.24 +Promethium 61 Pm 144.91 +Samarium 62 Sm 150.35 +Europium 63 Eu 151.96 +Gadolinium 64 Gd 157.25 +Terbium 65 Tb 158.92 +Dysprosium 66 Dy 162.5 +Holmium 67 Ho 164.93 +Erbium 68 Er 167.26 +Thulium 69 Tm 168.93 +Ytterbium 70 Yb 173.04 +Lutetium 71 Lu 174.97 +Hafnium 72 Hf 178.49 +Tantalum 73 Ta 180.95 +Wolfram 74 W 183.85 +Rhenium 75 Re 186.2 +Osmium 76 Os 190.2 +Iridium 77 Ir 192.22 +Platinum 78 Pt 195.09 +Gold 79 Au 196.97 +Mercury 80 Hg 200.59 +Thallium 81 Tl 204.37 +Lead 82 Pb 207.19 +Bismuth 83 Bi 208.98 +Polonium 84 Po 208.98 +Astatine 85 At 
209.99 +Radon 86 Rn 222.02 +Francium 87 Fr 223.02 +Radium 88 Ra 226 +Actinium 89 Ac 227.03 +Thorium 90 Th 232.04 +Protactinium 91 Pa 231.04 +Uranium 92 U 238.03 +Neptunium 93 Np 237 +Plutonium 94 Pu 242 +Americium 95 Am 243.06 +Curium 96 Cm 247.07 +Berkelium 97 Bk 247.07 +Californium 98 Cf 251.08 +Einsteinium 99 Es 254.09 +Fermium 100 Fm 257.1 +Mendelevium 101 Md 257.1 +Nobelium 102 No 255.09 +Lawrencium 103 Lr 256.1
--- a/test/test_export_to_metexp_tabular.py Tue Feb 11 12:29:50 2014 +0100 +++ b/test/test_export_to_metexp_tabular.py Wed Mar 05 17:20:11 2014 +0100 @@ -10,6 +10,27 @@ class IntegrationTest(unittest.TestCase): + def test_MM_calculations(self): + ''' + test the implemented method for MM calculations for + given chemical formulas + ''' + export_to_metexp_tabular.init_elements_and_masses_map() + + formula = "C8H18O3" + # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26 + result = export_to_metexp_tabular.get_molecular_mass(formula) + self.assertEqual(162.26, result) + + formula = "CH2O3Fe2Ni" + # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44 + result = export_to_metexp_tabular.get_molecular_mass(formula) + self.assertAlmostEqual(232.44, result, 2) + + + + + def test_combine_output_simple(self): ''' comment me @@ -28,7 +49,13 @@ sys.argv = ['test', rankfilter_and_caslookup_combined_file, msclust_quantification_and_spectra_file, - output_csv] + output_csv, + 'tomato', + 'leafs', + 'test experiment', + 'pieter', + 'DB5 column'] + # Execute main function with arguments provided through sys.argv export_to_metexp_tabular.main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_query_metexp.py Wed Mar 05 17:20:11 2014 +0100 @@ -0,0 +1,82 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from GCMS import query_metexp +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + +# def test_MM_calculations(self): +# ''' +# test the implemented method for MM calculations for +# given chemical formulas +# ''' +# export_to_metexp_tabular.init_elements_and_masses_map() +# +# formula = "C8H18O3" +# # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertEqual(162.26, result) +# +# formula = "CH2O3Fe2Ni" +# # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertAlmostEqual(232.44, result, 2) +# +# +# + + + def test_simple(self): + ''' + Simple initial test + ''' + # Create out folder + outdir = "output/metexp_query/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + # input_file = sys.argv[1] + # molecular_mass_col = sys.argv[2] + # formula_col = sys.argv[3] + # metexp_dblink_file = sys.argv[4] + # output_result = sys.argv[5] + + input_file = resource_filename(__name__, "data/metexp_query_tabular.txt") + casid_col = "CAS" + formula_col = "FORMULA" + molecular_mass_col = "MM" + metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt") + output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt") + + sys.argv = ['test', + input_file, + casid_col, + formula_col, + molecular_mass_col, + metexp_dblink_file, + 'GC', + output_result] + + # Execute main function with arguments provided through sys.argv + query_metexp.main() + + # TODO - asserts (base them on DB being filled with test data form metexp unit test for upload method) + 
# PA + + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_query_metexp_LARGE.py Wed Mar 05 17:20:11 2014 +0100 @@ -0,0 +1,79 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from GCMS import query_metexp +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + +# def test_MM_calculations(self): +# ''' +# test the implemented method for MM calculations for +# given chemical formulas +# ''' +# export_to_metexp_tabular.init_elements_and_masses_map() +# +# formula = "C8H18O3" +# # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertEqual(162.26, result) +# +# formula = "CH2O3Fe2Ni" +# # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertAlmostEqual(232.44, result, 2) +# +# +# + + + def test_large(self): + ''' + Simple test, but on larger set, last test executed in 28s + ''' + # Create out folder + outdir = "output/metexp_query/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + # input_file = sys.argv[1] + # molecular_mass_col = sys.argv[2] + # formula_col = sys.argv[3] + # metexp_dblink_file = sys.argv[4] + # output_result = sys.argv[5] + + input_file = resource_filename(__name__, "data/metexp_query_tabular_large.txt") + casid_col = "CAS" + formula_col = "FORMULA" + molecular_mass_col = "MM" + metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt") + output_result = resource_filename(__name__, outdir + "metexp_query_results_added_LARGE.txt") + + sys.argv = ['test', + input_file, + casid_col, + formula_col, + molecular_mass_col, + metexp_dblink_file, + 'GC', + output_result] + + # Execute main function with arguments provided through sys.argv + query_metexp.main() + + + + +def _read_file(filename): + ''' + Helper 
method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read()