comparison anndata_operations.xml @ 26:825dfd66e3fb draft

planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/tertiary-analysis/scanpy commit 6c9d530aa653101e9e21804393ec11f38cddf027-dirty
author ebi-gxa
date Thu, 16 Feb 2023 13:28:31 +0000
parents 31e5e6d606ef
children 7ebc22f77d86
comparison
equal deleted inserted replaced
25:a36d7a315be7 26:825dfd66e3fb
1 <?xml version="1.0" encoding="utf-8"?> 1 <?xml version="1.0" encoding="utf-8"?>
2 <tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy0" profile="@PROFILE@"> 2 <tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy9" profile="@PROFILE@">
3 <description>modifies metadata and flags genes</description> 3 <description>modifies metadata and flags genes</description>
4 <macros> 4 <macros>
5 <import>scanpy_macros2.xml</import> 5 <import>scanpy_macros2.xml</import>
6 </macros> 6 </macros>
7 <expand macro="requirements"/> 7 <expand macro="requirements"/>
43 #end if 43 #end if
44 python $operations 44 python $operations
45 ]]></command> 45 ]]></command>
46 <configfiles> 46 <configfiles>
47 <configfile name="operations"> 47 <configfile name="operations">
48 import gc
48 import scanpy as sc 49 import scanpy as sc
49 import anndata 50 import anndata
50 from numpy import all 51 from numpy import all
51 import logging 52 import logging
52 53
54 def make_column_values_unique(df, field, new_field=None, suffix = '-duplicate-'):
55 if new_field is None:
56 new_field = f"{field}_u"
57 appendents = (suffix + df.groupby(field).cumcount().astype(str).replace('0','')).replace(suffix, '')
58 df[new_field] = df[field].astype(str) + appendents.astype(str)
59 return df
60
53 adata = sc.read('input.h5') 61 adata = sc.read('input.h5')
54 62
55 #if $copy_adata_to_raw: 63 #if $copy_adata_to_raw:
56 adata.raw = adata 64 adata.raw = adata
57 #end if 65 #end if
58 66
59 gene_name = '${gene_symbols_field}' 67 gene_name = '${gene_symbols_field}'
60 qc_vars = list() 68 qc_vars = list()
61 69
62 #for $i, $s in enumerate($modifications) 70 #for $i, $s in enumerate($modifications)
71 #if $s.make_unique:
72 adata.obs = make_column_values_unique(adata.obs, field='${s.from_obs}', new_field='${s.to_obs}', suffix = "_d")
73 #else
63 adata.obs['${s.to_obs}'] = adata.obs['${s.from_obs}'] 74 adata.obs['${s.to_obs}'] = adata.obs['${s.from_obs}']
75 #end if
64 #if not $s.keep_original: 76 #if not $s.keep_original:
65 del adata.obs['${s.from_obs}'] 77 del adata.obs['${s.from_obs}']
66 #end if 78 #end if
67 #end for 79 #end for
68 80
69 #for $i, $s in enumerate($var_modifications) 81 #for $i, $s in enumerate($var_modifications)
82 #if $s.make_unique:
83 adata.var = make_column_values_unique(adata.var, field='${s.from_var}', new_field='${s.to_var}', suffix = "_d")
84 #else
70 adata.var['${s.to_var}'] = adata.var['${s.from_var}'] 85 adata.var['${s.to_var}'] = adata.var['${s.from_var}']
86 #end if
71 #if not $s.keep_original: 87 #if not $s.keep_original:
72 del adata.var['${s.from_var}'] 88 del adata.var['${s.from_var}']
73 #end if 89 #end if
74 #end for 90 #end for
75 91
82 qc_vars.append('${flag.flag}') 98 qc_vars.append('${flag.flag}')
83 else: 99 else:
84 logging.warning('No genes starting with {} found, skip calculating expression of {} genes'.format('${flag.startswith}', '${flag.flag}')) 100 logging.warning('No genes starting with {} found, skip calculating expression of {} genes'.format('${flag.startswith}', '${flag.flag}'))
85 #end for 101 #end for
86 102
103 #if $field_unique:
104 field_unique = '${field_unique}'
105 made_unique = 0
106 if field_unique in adata.var_keys():
107 adata.var = make_column_values_unique(adata.var, field_unique, suffix = "_d")
108 made_unique += 1
109 if field_unique in adata.obs_keys():
110 adata.obs = make_column_values_unique(adata.obs, field_unique, suffix = "_d")
111 made_unique += 1
112
113 if made_unique == 0:
114 logging.error("Specified field to be made unique is not in var or obs.")
115 sys.exit(1)
116 #end if
117
87 #if $copy_r.default and $copy_r.r_source: 118 #if $copy_r.default and $copy_r.r_source:
88 ad_s = sc.read('r_source.h5') 119 ad_s = sc.read('r_source.h5')
89 if not all(adata.obs.index.isin(ad_s.obs.index)): 120 if not all(adata.obs.index.isin(ad_s.obs.index)):
90 logging.error("Specified object for .raw must contain all .obs from main object.") 121 logging.error("Specified object for .raw must contain all .obs from main object.")
91 sys.exit(1) 122 sys.exit(1)
92 else: 123 else:
93 adata.raw = ad_s[adata.obs.index] 124 adata.raw = ad_s[adata.obs.index]
94 del ad_s 125 del ad_s
126 gc.collect()
95 #end if 127 #end if
96 128
97 #if $copy_x.default and len($copy_x.xlayers) > 0: 129 #if $copy_x.default and len($copy_x.xlayers) > 0:
98 #for $i, $x_s in enumerate($copy_x.xlayers): 130 #for $i, $x_s in enumerate($copy_x.xlayers):
99 ad_s = sc.read('x_source_${i}.h5') 131 ad_s = sc.read('x_source_${i}.h5')
105 adata.layers["${xs.dest}"] = ad_s.X 137 adata.layers["${xs.dest}"] = ad_s.X
106 else: 138 else:
107 logging.error("X source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") 139 logging.error("X source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
108 sys.exit(1) 140 sys.exit(1)
109 del ad_s 141 del ad_s
142 gc.collect()
110 #end for 143 #end for
111 #end if 144 #end if
112 145
113 #if $copy_l.default and len($copy_l.layers) > 0: 146 #if $copy_l.default and len($copy_l.layers) > 0:
114 #for $i, $layer_s in enumerate($copy_l.layer_sources): 147 #for $i, $layer_s in enumerate($copy_l.layer_sources):
118 layers_to_copy = (k for k in ad_s.layers.keys() if "${l_key.contains}" in k) 151 layers_to_copy = (k for k in ad_s.layers.keys() if "${l_key.contains}" in k)
119 for l_to_copy in layers_to_copy: 152 for l_to_copy in layers_to_copy:
120 suffix='' 153 suffix=''
121 if l_to_copy in adata.layers: 154 if l_to_copy in adata.layers:
122 suffix = "_${i}" 155 suffix = "_${i}"
123
124 adata.layers[l_to_copy+suffix] = ad_s.layers[l_to_copy] 156 adata.layers[l_to_copy+suffix] = ad_s.layers[l_to_copy]
125 #end for 157 #end for
126 else: 158 else:
127 logging.error("Layer source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") 159 logging.error("Layer source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
128 sys.exit(1) 160 sys.exit(1)
129 del ad_s 161 del ad_s
162 gc.collect()
130 #end for 163 #end for
131 #end if 164 #end if
132 165
133 #if $copy_o.default and len($copy_o.obs_keys) > 0: 166 #if $copy_o.default and len($copy_o.obs_keys) > 0:
134 #for $i, $obs_s in enumerate($copy_o.obs_sources): 167 #for $i, $obs_s in enumerate($copy_o.obs_sources):
147 #end for 180 #end for
148 else: 181 else:
149 logging.error("Observation source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") 182 logging.error("Observation source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
150 sys.exit(1) 183 sys.exit(1)
151 del ad_s 184 del ad_s
185 gc.collect()
152 #end for 186 #end for
153 #end if 187 #end if
154 188
155 189
156 #if $copy_e.default and len($copy_e.embedding_keys) > 0: 190 #if $copy_e.default and len($copy_e.embedding_keys) > 0:
167 #end for 201 #end for
168 else: 202 else:
169 logging.error("Embedding source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") 203 logging.error("Embedding source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
170 sys.exit(1) 204 sys.exit(1)
171 del ad_s 205 del ad_s
206 gc.collect()
172 #end for 207 #end for
173 #end if 208 #end if
174 209
175 #if $copy_u.default and len($copy_u.uns_keys) > 0: 210 #if $copy_u.default and len($copy_u.uns_keys) > 0:
176 #for $i, $uns_s in enumerate($copy_u.uns_sources): 211 #for $i, $uns_s in enumerate($copy_u.uns_sources):
186 #end for 221 #end for
187 else: 222 else:
188 logging.error("Uns source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") 223 logging.error("Uns source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
189 sys.exit(1) 224 sys.exit(1)
190 del ad_s 225 del ad_s
226 gc.collect()
191 #end for 227 #end for
192 #end if 228 #end if
193 229
194 #if $sanitize_varm: 230 #if $sanitize_varm:
195 if hasattr(adata, 'raw') and hasattr(adata.raw, 'X') and hasattr(adata.raw, 'var'): 231 if hasattr(adata, 'raw') and hasattr(adata.raw, 'X') and hasattr(adata.raw, 'var'):
222 <param name="from_obs" type="text" label="Original name" help="Name in observations that you want to change"> 258 <param name="from_obs" type="text" label="Original name" help="Name in observations that you want to change">
223 <sanitizer> 259 <sanitizer>
224 <valid initial="string.printable"/> 260 <valid initial="string.printable"/>
225 </sanitizer> 261 </sanitizer>
226 </param> 262 </param>
227 <param name="to_obs" type="text" label="New name" help="New name in observations that you want to change"/> 263 <param name="to_obs" type="text" label="New name" help="New name in observations that you want to change to"/>
228 <param name="keep_original" type="boolean" label="Keep original" help="If activated, it will also keep the original column" checked="false"/> 264 <param name="keep_original" type="boolean" label="Keep original" help="If activated, it will also keep the original column" checked="false"/>
265 <param name="make_unique" type="boolean" label="Make values in the field unique" help="If activated, it will make the values in the column unique by appending '_d' plus a counter (e.g. '_d1', '_d2') to each repeated value after its first occurrence." checked="false"/>
229 </repeat> 266 </repeat>
230 <repeat name="var_modifications" title="Change field names in AnnData var" min="0"> 267 <repeat name="var_modifications" title="Change field names in AnnData var" min="0">
231 <param name="from_var" type="text" label="Original name" help="Name in var that you want to change"> 268 <param name="from_var" type="text" label="Original name" help="Name in var that you want to change">
232 <sanitizer> 269 <sanitizer>
233 <valid initial="string.printable"/> 270 <valid initial="string.printable"/>
234 </sanitizer> 271 </sanitizer>
235 </param> 272 </param>
236 <param name="to_var" type="text" label="New name" help="New name in var that you want to change to"/> 273 <param name="to_var" type="text" label="New name" help="New name in var that you want to change to"/>
237 <param name="keep_original" type="boolean" label="Keep original" help="If activated, it will also keep the original column" checked="false"/> 274 <param name="keep_original" type="boolean" label="Keep original" help="If activated, it will also keep the original column" checked="false"/>
275 <param name="make_unique" type="boolean" label="Make values in the field unique" help="If activated, it will make the values in the column unique by appending '_d' plus a counter (e.g. '_d1', '_d2') to each repeated value after its first occurrence." checked="false"/>
238 </repeat> 276 </repeat>
239 <param name="gene_symbols_field" value='index' type="text" label="Gene symbols field in AnnData" help="Field inside var.params where the gene symbols are, normally 'index' or 'gene_symbols'"/> 277 <param name="gene_symbols_field" value='index' type="text" label="Gene symbols field in AnnData" help="Field inside var.params where the gene symbols are, normally 'index' or 'gene_symbols'"/>
240 <repeat name="gene_flags" title="Flag genes that start with these names"> 278 <repeat name="gene_flags" title="Flag genes that start with these names">
241 <param name="startswith" type="text" label="Starts with" help="Text that you expect the genes to be flagged to start with, such as 'MT-' for mito genes"/> 279 <param name="startswith" type="text" label="Starts with" help="Text that you expect the genes to be flagged to start with, such as 'MT-' for mito genes"/>
242 <param name="flag" type="text" label="Var name" help="Name of the column in var.names where this boolean flag is stored, for example 'mito' for mitochondrial genes."/> 280 <param name="flag" type="text" label="Var name" help="Name of the column in var.names where this boolean flag is stored, for example 'mito' for mitochondrial genes."/>
243 </repeat> 281 </repeat>
244 <param name="top_genes" label="Number of top genes" value='50' help="to calculate percentage of the flagged genes in that number of top genes. Used by sc.pp.calculate_qc_metrics (integer)." type="integer"/> 282 <param name="top_genes" label="Number of top genes" value='50' help="to calculate percentage of the flagged genes in that number of top genes. Used by sc.pp.calculate_qc_metrics (integer)." type="integer"/>
283 <param name="field_unique" type="text" optional="true" label="Field in var or obs to make unique" help="Field inside var or obs to be made unique by appending a suffix (useful for gene symbols in var). A new field will be added with the '_u' suffix. It happens after all the above operations."/>
245 <conditional name="copy_r"> 284 <conditional name="copy_r">
246 <param name="default" type="boolean" checked="false" label="Copy adata.X to adata.raw"/> 285 <param name="default" type="boolean" checked="false" label="Copy adata.X to adata.raw"/>
247 <when value="true"> 286 <when value="true">
248 <param name="r_source" type="data" label="AnnData object with .X to copy to .raw" help="Copies adata (subset to matching obs) from this AnnData object into the main input as .raw. Make sure to use an AnnData object containing all .obs in the main input." format="h5,h5ad" /> 287 <param name="r_source" type="data" label="AnnData object with .X to copy to .raw" help="Copies adata (subset to matching obs) from this AnnData object into the main input as .raw. Make sure to use an AnnData object containing all .obs in the main input." format="h5,h5ad" />
249 </when> 288 </when>
308 </outputs> 347 </outputs>
309 348
310 <tests> 349 <tests>
311 <test> 350 <test>
312 <param name="input_obj_file" value="find_cluster.h5"/> 351 <param name="input_obj_file" value="find_cluster.h5"/>
313 <param name="input_format" value="anndata"/>
314 <param name="color_by" value="louvain"/>
315 <output name="output_h5ad" file="anndata_ops.h5" ftype="h5ad" compare="sim_size"/> 352 <output name="output_h5ad" file="anndata_ops.h5" ftype="h5ad" compare="sim_size"/>
316 </test> 353 </test>
317 <test> 354 <test>
318 <param name="input_obj_file" value="anndata_ops.h5"/> 355 <param name="input_obj_file" value="anndata_ops.h5"/>
319 <param name="from_var" value = "gene_symbols" /> 356 <param name="from_var" value = "gene_symbols" />
323 <has_h5_keys keys="var/hello_all" /> 360 <has_h5_keys keys="var/hello_all" />
324 </assert_contents> 361 </assert_contents>
325 </output> 362 </output>
326 </test> 363 </test>
327 <test> 364 <test>
365 <param name="input_obj_file" value="anndata_ops.h5"/>
366 <repeat name="var_modifications" >
367 <param name="from_var" value = "gene_symbols" />
368 <param name="to_var" value = "gene_symbols_unique" />
369 <param name="make_unique" value = "True" />
370 </repeat>
371 <output name="output_h5ad" ftype="h5ad">
372 <assert_contents>
373 <has_h5_keys keys="var/gene_symbols_unique" />
374 </assert_contents>
375 </output>
376 </test>
377 <test>
378 <param name="input_obj_file" value="anndata_ops.h5"/>
379 <param name="field_unique" value = "gene_symbols" />
380 <output name="output_h5ad" ftype="h5ad">
381 <assert_contents>
382 <has_h5_keys keys="var/gene_symbols_u" />
383 </assert_contents>
384 </output>
385 </test>
386 <test>
328 <param name="input_obj_file" value="find_cluster.h5"/> 387 <param name="input_obj_file" value="find_cluster.h5"/>
329 <param name="input_format" value="anndata"/>
330 <conditional name="copy_r"> 388 <conditional name="copy_r">
331 <param name="default" value="true"/> 389 <param name="default" value="true"/>
332 <param name="r_source" value="read_10x.h5"/> 390 <param name="r_source" value="read_10x.h5"/>
333 </conditional> 391 </conditional>
334 <output name="output_h5ad" file="anndata_ops_raw.h5" ftype="h5ad" compare="sim_size"> 392 <output name="output_h5ad" file="anndata_ops_raw.h5" ftype="h5ad" compare="sim_size">
335 <assert_contents> 393 <assert_contents>
336 <has_h5_keys keys="raw/X" /> 394 <has_h5_keys keys="raw/X" />
337 </assert_contents> 395 </assert_contents>
338 </output> 396 </output>
339 </test> 397 </test>
340 <test> 398 <test>
341 <param name="input_obj_file" value="normalise_data.h5"/> 399 <param name="input_obj_file" value="normalise_data.h5"/>
342 <param name="input_format" value="anndata"/>
343 <conditional name="copy_x"> 400 <conditional name="copy_x">
344 <param name="default" value="true"/> 401 <param name="default" value="true"/>
345 <repeat name="xlayers"> 402 <repeat name="xlayers">
346 <param name="x_source" value='filter_genes.h5'/> 403 <param name="x_source" value='filter_genes.h5'/>
347 <param name="dest" value='filtered'/> 404 <param name="dest" value='filtered'/>
348 </repeat> 405 </repeat>
349 </conditional> 406 </conditional>
350 <output name="output_h5ad" file="anndata_ops_xlayer.h5" ftype="h5ad" compare="sim_size"> 407 <output name="output_h5ad" file="anndata_ops_xlayer.h5" ftype="h5ad" compare="sim_size">
351 <assert_contents> 408 <assert_contents>
352 <has_h5_keys keys="layers/filtered" /> 409 <has_h5_keys keys="layers/filtered" />
353 </assert_contents> 410 </assert_contents>
354 </output> 411 </output>
355 </test> 412 </test>
356 <test> 413 <test>
357 <param name="input_obj_file" value="find_cluster.h5"/> 414 <param name="input_obj_file" value="find_cluster.h5"/>
358 <param name="input_format" value="anndata"/>
359 <conditional name="copy_l"> 415 <conditional name="copy_l">
360 <param name="default" value="true"/> 416 <param name="default" value="true"/>
361 <repeat name="layers"> 417 <repeat name="layers">
362 <param name="contains" value='filtered'/> 418 <param name="contains" value='filtered'/>
363 </repeat> 419 </repeat>
364 <param name="layer_sources" value='anndata_ops_xlayer.h5'/> 420 <param name="layer_sources" value='anndata_ops_xlayer.h5'/>
365 </conditional> 421 </conditional>
366 <output name="output_h5ad" file="anndata_ops_layer.h5" ftype="h5ad" compare="sim_size"> 422 <output name="output_h5ad" file="anndata_ops_layer.h5" ftype="h5ad" compare="sim_size">
367 <assert_contents> 423 <assert_contents>
368 <has_h5_keys keys="layers/filtered" /> 424 <has_h5_keys keys="layers/filtered" />
369 </assert_contents> 425 </assert_contents>
370 </output> 426 </output>
371 </test> 427 </test>
372 </tests> 428 </tests>
373 429
374 <help><![CDATA[ 430 <help><![CDATA[
376 Operations on AnnData objects 432 Operations on AnnData objects
377 ============================= 433 =============================
378 434
379 Performs the following operations: 435 Performs the following operations:
380 436
381 * Change observation/var fields, mostly for the convenience of downstream processes. Multiple fields can be changed as one. 437 * Change observation/var fields, mostly for the convenience of downstream processes. Multiple fields can be changed at once.
382 * Flag genes that start with a certain text: useful for flagging mitochondrial, spikes or other groups of genes. 438 * Flag genes that start with a certain text: useful for flagging mitochondrial, spikes or other groups of genes.
383 * For the flags created, calculates qc metrics (pct_<flag>_counts). 439 * For the flags created, calculates qc metrics (pct_<flag>_counts).
384 * Calculates `n_genes`, `n_counts` for cells and `n_cells`, `n_counts` for genes. 440 * Calculates `n_genes`, `n_counts` for cells and `n_cells`, `n_counts` for genes.
385 * For top <N> genes specified, calculate qc metrics (pct_counts_in_top_<N>_genes). 441 * For top <N> genes specified, calculate qc metrics (pct_counts_in_top_<N>_genes).
442 * Make a specified column of var or obs unique (normally useful for gene symbols).
386 * Copy from a set of compatible AnnData objects (same cells and genes): 443 * Copy from a set of compatible AnnData objects (same cells and genes):
387 * Observations, such as clustering results. 444 * Observations, such as clustering results.
388 * Embeddings, such as tSNE or UMAPs. 445 * Embeddings, such as tSNE or UMAPs.
389 * Unstructured annotations, like gene markers. 446 * Unstructured annotations, like gene markers.
390 447
391 This functionality will probably be added in the future to a larger package. 448 This functionality will probably be added in the future to a larger package.
392 449
393 History 450 History
394 ------- 451 -------
452 1.8.1+galaxy10: Adds field to be made unique in obs or var.
395 453
396 1.6.0+galaxy0: Moves to Scanpy Scripts 0.3.0 (Scanpy 1.6.0), versioning switched to track Scanpy as other tools. 454 1.6.0+galaxy0: Moves to Scanpy Scripts 0.3.0 (Scanpy 1.6.0), versioning switched to track Scanpy as other tools.
397 455
398 0.0.3+galaxy0: Adds ability to merge AnnData objects (Scanpy 1.4.3). 456 0.0.3+galaxy0: Adds ability to merge AnnData objects (Scanpy 1.4.3).
399 ]]></help> 457 ]]></help>