Mercurial > repos > eslerm > vkmz
comparison vkmz.py @ 6:35b984684450 draft
planemo upload for repository https://github.com/HegemanLab/VKMZ commit 5ef8d2b36eb35ff5aad5d5e9b78c38405fc95c1a
| author | eslerm |
|---|---|
| date | Tue, 10 Jul 2018 17:58:35 -0400 |
| parents | 04079c34452a |
| children |
comparison
equal
deleted
inserted
replaced
| 5:04079c34452a | 6:35b984684450 |
|---|---|
| 1 ''' | 1 import argparse |
| 2 based on the BMRB compound database which can be found at: | 2 import csv |
| 3 http://www.bmrb.wisc.edu/ftp/pub/bmrb/relational_tables/metabolomics/Chem_comp.csv | 3 import math |
| 4 ''' | |
| 5 | |
| 6 import re | 4 import re |
| 7 import argparse | |
| 8 import multiprocessing | |
| 9 from multiprocessing import Pool | |
| 10 import csv | |
| 11 | |
| 12 import numpy as np | |
| 13 import math | |
| 14 import pandas as pd | |
| 15 from plotly import __version__ | |
| 16 import plotly.offline as py | |
| 17 import plotly.graph_objs as go | |
| 18 | 5 |
| 19 parser = argparse.ArgumentParser() | 6 parser = argparse.ArgumentParser() |
| 20 inputSubparser = parser.add_subparsers(help='Select input type:', dest='input-type') | 7 inputSubparser = parser.add_subparsers(help='Select input type:', dest='input-type') |
| 21 parse_tsv = inputSubparser.add_parser('tsv', help='Use tabular data as input.') | 8 parse_tsv = inputSubparser.add_parser('tsv', help='Use tabular data as input.') |
| 22 parse_tsv.add_argument('--input', '-i', required=True, help='Path to tabular file. Must include columns: sample ID, mz, polarity, intensity, & retention time.') | 9 parse_tsv.add_argument('--input', '-i', required=True, help='Path to tabular file. Must include columns: sample ID, mz, polarity, intensity, & retention time.') |
| 23 parse_tsv.add_argument('--no-plot', '-np', action='store_true', help='Disable plot generation.') | |
| 24 parse_xcms = inputSubparser.add_parser('xcms', help='Use XCMS data as input.') | 10 parse_xcms = inputSubparser.add_parser('xcms', help='Use XCMS data as input.') |
| 25 parse_xcms.add_argument('--data-matrix', '-xd', required=True, nargs='?', type=str, help='Path to XCMS dataMatrix file.') | 11 parse_xcms.add_argument('--data-matrix', '-xd', required=True, nargs='?', type=str, help='Path to XCMS data matrix file.') |
| 26 parse_xcms.add_argument('--sample-metadata', '-xs', required=True, nargs='?', type=str, help='Path to XCMS sampleMetadata file.') | 12 parse_xcms.add_argument('--sample-metadata', '-xs', required=True, nargs='?', type=str, help='Path to XCMS sample metadata file.') |
| 27 parse_xcms.add_argument('--variable-metadata', '-xv', required=True, nargs='?', type=str, help='Path to XCMS variableMetadata file.') | 13 parse_xcms.add_argument('--variable-metadata', '-xv', required=True, nargs='?', type=str, help='Path to XCMS variable metadata file.') |
| 28 parse_xcms.add_argument('--no-plot', '-n', action='store_true', help='Disable plot generation.') | |
| 29 parse_plot = inputSubparser.add_parser('plot', help='Only plot data.') | |
| 30 parse_plot.add_argument('--input', '-i', required=True, nargs='?', type=str, help='Path to VKMZ generated tabular file.') | |
| 31 for inputSubparser in [parse_tsv, parse_xcms]: | 14 for inputSubparser in [parse_tsv, parse_xcms]: |
| 32 inputSubparser.add_argument('--output', '-o', nargs='?', type=str, required=True, help='Specify output file path.') | 15 inputSubparser.add_argument('--output', '-o', nargs='?', type=str, required=True, help='Specify output file path.') |
| 33 inputSubparser.add_argument('--error', '-e', nargs='?', type=float, required=True, help='Mass error of mass spectrometer in PPM') | 16 inputSubparser.add_argument('--error', '-e', nargs='?', type=float, required=True, help='Mass error of mass spectrometer in parts-per-million.') |
| 34 inputSubparser.add_argument('--database', '-d', nargs='?', default='databases/bmrb-light.tsv', help='Select database.') | 17 inputSubparser.add_argument('--database', '-db', nargs='?', default='databases/bmrb-light.tsv', help='Select database of known formula masses.') |
| 35 inputSubparser.add_argument('--directory', nargs='?', default='', type=str, help='Define directory of tool.') | 18 inputSubparser.add_argument('--directory','-dir', nargs='?', default='', type=str, help='Define path of tool directory. Assumes relative path if unset.') |
| 36 inputSubparser.add_argument('--polarity', '-p', choices=['positive','negative'], help='Force polarity mode. Ignore variables in input file.') | 19 inputSubparser.add_argument('--polarity', '-p', choices=['positive','negative'], help='Force polarity mode to positive or negative. Overrides variables in input file.') |
| 37 inputSubparser.add_argument('--no-adjustment', '-na', action='store_true', help='Use flag to turn off polarity based mass adjustment. This flag should always be used if reprocessing data generated by VKMZ.') | 20 inputSubparser.add_argument('--neutral', '-n', action='store_true', help='Set neutral flag if masses in input data are neutral. No mass adjustment will be made.') |
| 38 inputSubparser.add_argument('--unique', '-u', action='store_true', help='Set flag to only output features which have a single match.') | 21 inputSubparser.add_argument('--unique', '-u', action='store_true', help='Set flag to remove features with multiple predictions.') |
| 39 inputSubparser.add_argument('--multiprocessing', '-m', action='store_true', help='Set flag to turn on multiprocessing.') | |
| 40 inputSubparser.add_argument('--plottype', '-t', nargs='?', default='scatter-2d', choices=['scatter-2d', 'scatter-3d'], help='Select plot type.') | |
| 41 inputSubparser.add_argument('--size', '-s', nargs='?', default=5, type=int, help='Set maximum size of plot symbols.') | |
| 42 inputSubparser.add_argument('--size-algorithm', '-a', nargs='?', default=0, type=int, choices=[0,1], help="Symbol size algorithm selector. Algorithm 0 sets all symbols to the maximum size. Algorithm 1 determines a feature's symbol size by its log intensity.") | |
| 43 args = parser.parse_args() | 22 args = parser.parse_args() |
| 44 | 23 |
| 45 vkInputType = getattr(args, "input-type") | 24 # store input constants |
| 46 | 25 INPUT_TYPE = getattr(args, "input-type") |
| 47 # read inputs, arguments and define globals | 26 POLARITY = getattr(args, "polarity") |
| 48 vkError = getattr(args, "error") | 27 |
| 49 | |
| 50 vkPolarity = getattr(args, "polarity") | |
| 51 | |
| 52 vkUnique = getattr(args, "unique") | |
| 53 | |
| 54 vkMultiprocessing = getattr(args, "multiprocessing") | |
| 55 | |
| 56 vkNoAdjustment = getattr(args, "no_adjustment") | |
| 57 | |
| 58 vkDatabaseFile = getattr(args, "database") | |
| 59 vkDirectory = getattr(args, "directory") | |
| 60 | |
| 61 vkMass = [] | |
| 62 vkFormula = [] | |
| 63 try: | |
| 64 with open(vkDirectory+vkDatabaseFile, 'r') as tsv: | |
| 65 next(tsv) # skip first row | |
| 66 for row in tsv: | |
| 67 mass, formula = row.split() | |
| 68 vkMass.append(mass) | |
| 69 vkFormula.append(formula) | |
| 70 except ValueError: | |
| 71 print('The %s database could not be loaded.' % vkDatabaseFile) | |
| 72 vkMaxIndex = len(vkMass)-1 | |
| 73 | |
| 74 vkOutput = getattr(args, "output") | |
| 75 | |
| 76 vkPlotType = getattr(args, 'plottype') | |
| 77 | |
| 78 vkSize = getattr(args, 'size') | |
| 79 | |
| 80 vkSizeAlgo = getattr(args, 'size_algorithm') | |
| 81 | |
| 82 # control predictions | |
| 83 def forecaster(vkInput): | |
| 84 if vkMultiprocessing: | |
| 85 try: | |
| 86 pool = Pool() | |
| 87 vkOutputList = pool.map(featurePrediction, vkInput) | |
| 88 except Exception as e: | |
| 89 print("Error during multiprocessing: "+str(e)) | |
| 90 finally: | |
| 91 pool.close() | |
| 92 pool.join() | |
| 93 else: | |
| 94 vkOutputList = map(featurePrediction, vkInput) | |
| 95 vkOutputList = [x for x in vkOutputList if x is not None] | |
| 96 return(vkOutputList) | |
| 97 | |
| 98 # predict feature formulas and creates output list | |
| 99 def featurePrediction(feature): | |
| 100 if vkNoAdjustment: | |
| 101 mass = feature[2] | |
| 102 else: | |
| 103 mass = adjust(feature[2], feature[1]) # mz & polarity | |
| 104 uncertainty = mass * vkError / 1e6 | |
| 105 prediction = predict(mass, uncertainty, 0, vkMaxIndex) | |
| 106 if prediction != -1: | |
| 107 feature[2] = mass | |
| 108 predictions = predictNeighbors(mass, uncertainty, prediction) | |
| 109 if vkUnique and len(predictions) > 1: | |
| 110 return | |
| 111 feature.append(predictions) # feature[5] | |
| 112 predictionClosest = predictions[0] | |
| 113 formula = predictionClosest[1] | |
| 114 formulaList = re.findall('[A-Z][a-z]?|[0-9]+', formula) | |
| 115 formulaDictionary = {'C':0,'H':0,'O':0,'N':0} | |
| 116 i = 0; | |
| 117 while i < len(formulaList): | |
| 118 if formulaList[i] in formulaDictionary: | |
| 119 # if there is only one of this element | |
| 120 if i+1 == len(formulaList) or formulaList[i+1].isalpha(): | |
| 121 formulaDictionary[formulaList[i]] = 1 | |
| 122 else: | |
| 123 formulaDictionary[formulaList[i]] = formulaList[i+1] | |
| 124 i+=1 | |
| 125 i+=1 | |
| 126 predictionClosest.append(formulaDictionary) | |
| 127 hc = float(formulaDictionary['H'])/float(formulaDictionary['C']) | |
| 128 oc = float(formulaDictionary['O'])/float(formulaDictionary['C']) | |
| 129 nc = float(formulaDictionary['N'])/float(formulaDictionary['C']) | |
| 130 predictionClosestDelta = feature[5][0][2] | |
| 131 feature += [predictionClosestDelta, hc, oc, nc] | |
| 132 return(feature) | |
| 133 | |
| 134 # adjust charged mass to a neutral mass | |
| 135 def adjust(mass, polarity): | |
| 136 # value to adjust by | |
| 137 proton = 1.007276 | |
| 138 if polarity == 'positive': | |
| 139 mass += proton | |
| 140 elif polarity == 'negative': | |
| 141 mass -= proton | |
| 142 return mass | |
| 143 | |
| 144 # Binary search to match observed mass to known mass within error | |
| 145 # https://en.wikipedia.org/wiki/Binary_search_tree | |
| 146 def predict(mass, uncertainty, left, right): | |
| 147 mid = ((right - left) / 2) + left | |
| 148 if left <= mid <= right and mid <= vkMaxIndex: | |
| 149 delta = float(vkMass[mid]) - mass | |
| 150 if uncertainty >= abs(delta): | |
| 151 return mid | |
| 152 elif uncertainty < delta: | |
| 153 return predict(mass, uncertainty, left, mid-1) | |
| 154 else: | |
| 155 return predict(mass, uncertainty, mid+1, right) | |
| 156 return -1 | |
| 157 | |
| 158 def plotData(vkData): | |
| 159 if vkSizeAlgo == 0: | |
| 160 for row in vkData: | |
| 161 row.append(vkSize) | |
| 162 else: | |
| 163 max_intensity = 0.0 | |
| 164 for row in vkData: | |
| 165 intensity = row[4] | |
| 166 if intensity > max_intensity: | |
| 167 max_intensity = intensity | |
| 168 alpha = vkSize/math.log(max_intensity+1) | |
| 169 for row in vkData: | |
| 170 intensity = row[4] | |
| 171 row.append(alpha*math.log(intensity+1)) | |
| 172 return vkData | |
| 173 | |
| 174 # find and sort known masses within error limit of observed mass | |
| 175 def predictNeighbors(mass, uncertainty, prediction): | |
| 176 i = 0 | |
| 177 neighbors = [[vkMass[prediction],vkFormula[prediction],(float(vkMass[prediction])-mass)],] | |
| 178 while prediction+i+1 <= vkMaxIndex: | |
| 179 neighbor = prediction+i+1 | |
| 180 delta = float(vkMass[neighbor])-mass | |
| 181 if uncertainty >= abs(delta): | |
| 182 neighbors.append([vkMass[neighbor],vkFormula[neighbor],delta]) | |
| 183 i += 1 | |
| 184 else: | |
| 185 break | |
| 186 i = 0 | |
| 187 while prediction+i-1 >= 0: | |
| 188 neighbor = prediction+i-1 | |
| 189 delta = float(vkMass[neighbor])-mass | |
| 190 if uncertainty >= abs(delta): | |
| 191 neighbors.append([vkMass[neighbor],vkFormula[neighbor],(float(vkMass[neighbor])-mass)]) | |
| 192 i -= 1 | |
| 193 else: | |
| 194 break | |
| 195 neighbors = sorted(neighbors, key = (lambda delta: abs(delta[2]))) | |
| 196 return neighbors | |
| 197 | |
| 198 # write output file | |
| 199 def saveForcast(vkOutputList): | |
| 200 try: | |
| 201 with open(vkOutput+'.tsv', 'w') as f: | |
| 202 f.writelines(str("sample_id\tpolarity\tmz\tretention_time\tintensity\tpredictions\tdelta\tH:C\tO:C\tN:C\tsymbol_size") + '\n') | |
| 203 for feature in vkOutputList: | |
| 204 f.writelines(feature[0]+'\t'+feature[1]+'\t'+str(feature[2])+'\t'+str(feature[3])+'\t'+str(feature[4])+'\t'+str(feature[5])+'\t'+str(feature[6])+'\t'+str(feature[7])+'\t'+str(feature[8])+'\t'+str(feature[9])+'\t'+str(feature[10])+'\t'+'\n') | |
| 205 except ValueError: | |
| 206 print('"%s" could not be saved.' % filename) | |
| 207 | |
| 208 def plotRatios(vkData): | |
| 209 max_rt = 0 | |
| 210 max_hc = 0 | |
| 211 max_oc = 0 | |
| 212 max_nc = 0 | |
| 213 for row in vkData: | |
| 214 if row[3] > max_rt: | |
| 215 max_rt = row[3] | |
| 216 if row[7] > max_hc: | |
| 217 max_hc = row[7] | |
| 218 if row[8] > max_oc: | |
| 219 max_oc = row[8] | |
| 220 if row[9] > max_nc: | |
| 221 max_nc = row[9] | |
| 222 labels = ['sampleID', 'polarity', 'mz', 'rt', 'intensity', 'predictions', 'delta', 'hc', 'oc', 'nc', 'symbol_size'] | |
| 223 df = pd.DataFrame.from_records(vkData, columns=labels) | |
| 224 sampleIDs = df.sampleID.unique() | |
| 225 data = [] | |
| 226 menus = [] | |
| 227 i = 0 | |
| 228 for sampleID in sampleIDs: | |
| 229 dfSample = df.loc[df['sampleID'] == sampleID] | |
| 230 size = dfSample.symbol_size | |
| 231 trace = go.Scatter( | |
| 232 x = dfSample.oc, | |
| 233 y = dfSample.hc, | |
| 234 text = dfSample.predictions.apply(lambda x: "Prediction: "+str(x[0][1])+"<br>mz: " +str(x[0][0])+"<br>Delta: "+str(x[0][2])), | |
| 235 line = dict(width = 0.5), | |
| 236 mode = 'markers', | |
| 237 marker = dict( | |
| 238 size = size, | |
| 239 sizemode = "area", | |
| 240 color = dfSample.rt, | |
| 241 colorscale = 'Viridis', | |
| 242 cmin = 0, | |
| 243 cmax = max_rt, | |
| 244 colorbar=dict(title='Retention Time (s)'), | |
| 245 line = dict(width = 0.5), | |
| 246 showscale = True | |
| 247 ), | |
| 248 opacity = 0.8 | |
| 249 ) | |
| 250 data.append(trace) | |
| 251 vision = [] | |
| 252 j = 0 | |
| 253 while j < len(sampleIDs): | |
| 254 if j != i: | |
| 255 vision.append(False) | |
| 256 else: | |
| 257 vision.append(True) | |
| 258 j += 1 | |
| 259 menu = dict( | |
| 260 method = 'update', | |
| 261 label = sampleID, | |
| 262 args = [{'visible': vision}, {'title': sampleID}] | |
| 263 ) | |
| 264 menus.append(menu) | |
| 265 i += 1 | |
| 266 updatemenus = list([ | |
| 267 dict( | |
| 268 active = -1, | |
| 269 buttons = menus | |
| 270 ) | |
| 271 ]) | |
| 272 layout = go.Layout( | |
| 273 title = "Van Krevelen Diagram", | |
| 274 showlegend = False, | |
| 275 xaxis = dict( | |
| 276 title = 'Oxygen to Carbon Ratio', | |
| 277 zeroline = False, | |
| 278 gridcolor = 'rgb(183,183,183)', | |
| 279 showline = True, | |
| 280 range = [0, max_oc] | |
| 281 ), | |
| 282 yaxis = dict( | |
| 283 title = 'Hydrogen to Carbon Ratio', | |
| 284 zeroline = False, | |
| 285 gridcolor = 'rgb(183,183,183)', | |
| 286 showline = True, | |
| 287 range = [0, max_hc] | |
| 288 ), | |
| 289 margin = dict(r=0, b=100, l=100, t=100), | |
| 290 updatemenus = updatemenus | |
| 291 ) | |
| 292 fig = go.Figure(data=data, layout=layout) | |
| 293 py.plot(fig, auto_open=False, show_link=False, filename=vkOutput+'.html') | |
| 294 | |
| 295 def polaritySanitizer(sample_polarity): | 28 def polaritySanitizer(sample_polarity): |
| 296 if sample_polarity.lower() in {'positive','pos','+'}: | 29 if sample_polarity.lower() in {'positive','pos','+'}: |
| 297 sample_polarity = 'positive' | 30 sample_polarity = 'positive' |
| 298 elif sample_polarity.lower() in {'negative', 'neg', '-'}: | 31 elif sample_polarity.lower() in {'negative', 'neg', '-'}: |
| 299 sample_polarity = 'negative' | 32 sample_polarity = 'negative' |
| 300 else: | 33 else: |
| 301 print('A sample has an unknown polarity type: %s. Polarity in the XCMS sample metadata should be set to "negative" or "positive".' % sample_polarity) | 34 print('A sample has an unknown polarity type: %s. Polarity in the XCMS sample metadata should be set to "negative" or "positive".' % sample_polarity) |
| 302 raise ValueError | 35 raise ValueError |
| 303 return sample_polarity | 36 return sample_polarity |
| 304 | 37 |
| 305 # main | 38 # read input |
| 306 if vkInputType == "tsv": | 39 vkInput = [] # each element is a feature from input |
| 307 vkInput = [] | 40 if INPUT_TYPE == "tsv": |
| 308 tsvFile = getattr(args, "input") | 41 tsvFile = getattr(args, "input") |
| 309 try: | 42 try: |
| 310 with open(tsvFile, 'r') as f: | 43 with open(tsvFile, 'r') as f: |
| 311 next(f) # skip header line | 44 next(f) # skip header line |
| 312 tsvData = csv.reader(f, delimiter='\t') | 45 tsvData = csv.reader(f, delimiter='\t') |
| 313 for row in tsvData: | 46 for row in tsvData: |
| 314 vkInput.append([row[0],polaritySanitizer(row[1]),float(row[2]),float(row[3]),float(row[4])]) | 47 vkInput.append([row[0],polaritySanitizer(row[1]),float(row[2]),float(row[3]),float(row[4])]) |
| 315 except ValueError: | 48 except ValueError: |
| 316 print('The %s data file could not be read.' % tsvFile) | 49 print('The %s data file could not be read.' % tsvFile) |
| 317 vkData = forecaster(vkInput) | 50 else: # INPUT_TYPE == "xcms" |
| 318 vkData = plotData(vkData) | |
| 319 saveForcast(vkData) | |
| 320 plotRatios(vkData) | |
| 321 elif vkInputType == "xcms": | |
| 322 vkInput = [] | |
| 323 xcmsSampleMetadataFile = getattr(args, "sample_metadata") | 51 xcmsSampleMetadataFile = getattr(args, "sample_metadata") |
| 324 try: | 52 try: |
| 325 polarity = {} | 53 polarity = {} |
| 326 with open(xcmsSampleMetadataFile, 'r') as f: | 54 with open(xcmsSampleMetadataFile, 'r') as f: |
| 327 xcmsSampleMetadata = csv.reader(f, delimiter='\t') | 55 xcmsSampleMetadata = csv.reader(f, delimiter='\t') |
| 328 next(xcmsSampleMetadata, None) # skip header | 56 next(xcmsSampleMetadata, None) # skip header |
| 329 for row in xcmsSampleMetadata: | 57 for row in xcmsSampleMetadata: |
| 330 sample = row[0] | 58 sample = row[0] |
| 331 if vkPolarity: | 59 if POLARITY: |
| 332 polarity[sample] = vkPolarity | 60 polarity[sample] = POLARITY |
| 333 else: | 61 else: |
| 334 sample_polarity = polaritySanitizer(row[2]) | 62 sample_polarity = polaritySanitizer(row[2]) |
| 335 polarity[sample] = sample_polarity | 63 polarity[sample] = sample_polarity |
| 336 except ValueError: | 64 except ValueError: |
| 337 print('The %s data file could not be read. Check that polarity is set to "negative" or "positive"' % xcmsSampleMetadataFile) | 65 print('The %s data file could not be read. Check that polarity is set to "negative" or "positive"' % xcmsSampleMetadataFile) |
| 380 if sample != "": | 108 if sample != "": |
| 381 vkInput.append([sample, polarity[sample], mz[variable], rt[variable], float(intensity)]) | 109 vkInput.append([sample, polarity[sample], mz[variable], rt[variable], float(intensity)]) |
| 382 i+=1 | 110 i+=1 |
| 383 except ValueError: | 111 except ValueError: |
| 384 print('The %s data file could not be read.' % xcmsDataMatrixFile) | 112 print('The %s data file could not be read.' % xcmsDataMatrixFile) |
| 385 vkData = forecaster(vkInput) | 113 |
| 386 vkData = plotData(vkData) | 114 # store||generate remaining constants |
| 387 saveForcast(vkData) | 115 OUTPUT = getattr(args, "output") |
| 388 plotRatios(vkData) | 116 MASS_ERROR = getattr(args, "error") |
| 389 else: | 117 UNIQUE = getattr(args, "unique") |
| 390 vkData = [] | 118 NEUTRAL = getattr(args, "neutral") |
| 391 tsvPlotvFile = getattr(args, "input") | 119 DATABASE = getattr(args, "database") |
| 392 try: | 120 DIRECTORY = getattr(args, "directory") |
| 393 with open(tsvPlotFile, 'r') as f: | 121 MASS = [] |
| 394 next(f) # skip header line | 122 FORMULA = [] |
| 395 plotData = csv.reader(f, delimiter='\t') | 123 try: |
| 396 for row in plotData: | 124 with open(DIRECTORY+DATABASE, 'r') as tsv: |
| 397 vkData.append([row[0],row[1],float(row[2]),float(row[3]),float(row[4]),list(row[4]),float(row[5]),float(row[6]),float(row[7]),float(row[8])]) | 125 for row in tsv: |
| 398 except ValueError: | 126 mass, formula = row.split() |
| 399 print('The %s data file could not be read.' % tsvFile) | 127 MASS.append(mass) |
| 400 plotRatios(vkData) | 128 FORMULA.append(formula) |
| 401 | 129 except ValueError: |
| 130 print('The %s database could not be loaded.' % DATABASE) | |
| 131 MAX_MASS_INDEX = len(MASS)-1 | |
| 132 | |
| 133 # adjust charged mass to a neutral mass | |
| 134 def adjust(mass, polarity): | |
| 135 # value to adjust by | |
| 136 proton = 1.007276 | |
| 137 if polarity == 'positive': | |
| 138 mass -= proton | |
| 139 else: # sanitized to negative | |
| 140 mass += proton | |
| 141 return mass | |
| 142 | |
| 143 # binary search to match a neutral mass to known mass within error | |
| 144 def predict(mass, uncertainty, left, right): | |
| 145 mid = ((right - left) / 2) + left | |
| 146 if left <= mid <= right and mid <= MAX_MASS_INDEX: | |
| 147 delta = float(MASS[mid]) - mass | |
| 148 if uncertainty >= abs(delta): | |
| 149 return mid | |
| 150 elif uncertainty < delta: | |
| 151 return predict(mass, uncertainty, left, mid-1) | |
| 152 else: | |
| 153 return predict(mass, uncertainty, mid+1, right) | |
| 154 return -1 | |
| 155 | |
| 156 # find and rank predictions which are adjacent to the index of an initial prediction | |
| 157 def predictNeighbors(mass, uncertainty, prediction): | |
| 158 i = 0 | |
| 159 neighbors = [[MASS[prediction],FORMULA[prediction],(float(MASS[prediction])-mass)],] | |
| 160 while prediction+i+1 <= MAX_MASS_INDEX: | |
| 161 neighbor = prediction+i+1 | |
| 162 delta = float(MASS[neighbor])-mass | |
| 163 if uncertainty >= abs(delta): | |
| 164 neighbors.append([MASS[neighbor],FORMULA[neighbor],delta]) | |
| 165 i += 1 | |
| 166 else: | |
| 167 break | |
| 168 i = 0 | |
| 169 while prediction+i-1 >= 0: | |
| 170 neighbor = prediction+i-1 | |
| 171 delta = float(MASS[neighbor])-mass | |
| 172 if uncertainty >= abs(delta): | |
| 173 neighbors.append([MASS[neighbor],FORMULA[neighbor],(float(MASS[neighbor])-mass)]) | |
| 174 i -= 1 | |
| 175 else: | |
| 176 break | |
| 177 neighbors = sorted(neighbors, key = (lambda delta: abs(delta[2]))) | |
| 178 return neighbors | |
| 179 | |
| 180 # predict formulas by the mass of a feature | |
| 181 def featurePrediction(feature): | |
| 182 if NEUTRAL: | |
| 183 mass = feature[2] | |
| 184 else: | |
| 185 mass = adjust(feature[2], feature[1]) # mz & polarity | |
| 186 uncertainty = mass * MASS_ERROR / 1e6 | |
| 187 prediction = predict(mass, uncertainty, 0, MAX_MASS_INDEX) | |
| 188 if prediction != -1: # else feature is forgotten | |
| 189 predictions = predictNeighbors(mass, uncertainty, prediction) | |
| 190 if UNIQUE and len(predictions) > 1: | |
| 191 return | |
| 192 feature.append(predictions) # feature[5] | |
| 193 formula = predictions[0][1] # formula of prediction with lowest abs(delta) | |
| 194 formulaList = re.findall('[A-Z][a-z]?|[0-9]+', formula) | |
| 195 formulaDictionary = {'C':0,'H':0,'O':0,'N':0} # other elements are easy to add | |
| 196 i = 0; | |
| 197 while i < len(formulaList): | |
| 198 if formulaList[i] in formulaDictionary: | |
| 199 # if there is only one of this element | |
| 200 if i+1 == len(formulaList) or formulaList[i+1].isalpha(): | |
| 201 formulaDictionary[formulaList[i]] = 1 | |
| 202 else: | |
| 203 formulaDictionary[formulaList[i]] = formulaList[i+1] | |
| 204 i+=1 | |
| 205 i+=1 | |
| 206 hc = float(formulaDictionary['H'])/float(formulaDictionary['C']) | |
| 207 oc = float(formulaDictionary['O'])/float(formulaDictionary['C']) | |
| 208 nc = float(formulaDictionary['N'])/float(formulaDictionary['C']) | |
| 209 feature += [hc, oc, nc] # feature[6], [7], [8] | |
| 210 return(feature) | |
| 211 | |
| 212 # write output file | |
| 213 def write(vkData): | |
| 214 json = '' | |
| 215 try: | |
| 216 # write tabular file and generate json for html output | |
| 217 with open(OUTPUT+'.tsv', 'w') as f: | |
| 218 f.writelines(str("sample_id\tpolarity\tmz\trt\tintensity\tpredictions\thc\toc\tnc") + '\n') | |
| 219 for feature in vkData: | |
| 220 f.writelines(feature[0]+'\t'+feature[1]+'\t'+str(feature[2])+'\t'+str(feature[3])+'\t'+str(feature[4])+'\t'+str(feature[5])+'\t'+str(feature[6])+'\t'+str(feature[7])+'\t'+str(feature[8])+'\n') | |
| 221 json += '{sample_id:\''+str(feature[0])+'\', polarity:\''+str(feature[1])+'\', mz:'+str(feature[2])+', rt:'+str(feature[3])+', intensity:'+str(feature[4])+', predictions:'+str(feature[5])+', hc:'+str(feature[6])+', oc:'+str(feature[7])+', nc:'+str(feature[8])+'},' | |
| 222 json = json[:-1] # remove final comma | |
| 223 # write html | |
| 224 try: | |
| 225 with open(DIRECTORY+'d3.html', 'r') as template, open(OUTPUT+'.html', 'w') as f: | |
| 226 for line in template: | |
| 227 line = re.sub('^var data.*$', 'var data = ['+json+']', line, flags=re.M) | |
| 228 f.write(line) | |
| 229 except ValueError: | |
| 230 print('"%s" could not be read or "%s" could not be written' % template, f) | |
| 231 except ValueError: | |
| 232 print('"%s" could not be saved.' % filename) | |
| 233 | |
| 234 # main | |
| 235 vkData = map(featurePrediction, vkInput) | |
| 236 vkData = [x for x in vkData if x is not None] | |
| 237 # sort by intensity so D3 draws largest symbols first | |
| 238 vkData.sort(key=lambda x: x[4], reverse=True) | |
| 239 write(vkData) |
