Mercurial > repos > ebi-gxa > anndata_ops
comparison anndata_operations.xml @ 26:825dfd66e3fb draft
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/tertiary-analysis/scanpy commit 6c9d530aa653101e9e21804393ec11f38cddf027-dirty
author | ebi-gxa |
---|---|
date | Thu, 16 Feb 2023 13:28:31 +0000 |
parents | 31e5e6d606ef |
children | 7ebc22f77d86 |
comparison
equal
deleted
inserted
replaced
25:a36d7a315be7 | 26:825dfd66e3fb |
---|---|
1 <?xml version="1.0" encoding="utf-8"?> | 1 <?xml version="1.0" encoding="utf-8"?> |
2 <tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy0" profile="@PROFILE@"> | 2 <tool id="anndata_ops" name="AnnData Operations" version="@TOOL_VERSION@+galaxy9" profile="@PROFILE@"> |
3 <description>modifies metadata and flags genes</description> | 3 <description>modifies metadata and flags genes</description> |
4 <macros> | 4 <macros> |
5 <import>scanpy_macros2.xml</import> | 5 <import>scanpy_macros2.xml</import> |
6 </macros> | 6 </macros> |
7 <expand macro="requirements"/> | 7 <expand macro="requirements"/> |
43 #end if | 43 #end if |
44 python $operations | 44 python $operations |
45 ]]></command> | 45 ]]></command> |
46 <configfiles> | 46 <configfiles> |
47 <configfile name="operations"> | 47 <configfile name="operations"> |
48 import gc | |
48 import scanpy as sc | 49 import scanpy as sc |
49 import anndata | 50 import anndata |
50 from numpy import all | 51 from numpy import all |
51 import logging | 52 import logging |
52 | 53 |
def make_column_values_unique(df, field, new_field=None, suffix='-duplicate-'):
    """Store a de-duplicated copy of column ``field`` in ``new_field``.

    The first occurrence of every value is kept unchanged. Each later
    occurrence gets ``suffix`` followed by a running number appended
    (``x``, ``x`` + suffix + ``1``, ``x`` + suffix + ``2``, ...). If such a
    generated name is already present in the column, the next free number
    is used instead, so the resulting column never contains duplicates.

    Values are compared after conversion to ``str``, the same form in which
    they are written to ``new_field``.

    Args:
        df: pandas DataFrame; it is modified in place.
        field: name of the column whose values should be made unique.
        new_field: name of the column that receives the result. Defaults
            to ``field`` with ``_u`` appended.
        suffix: text placed between a repeated value and its number.

    Returns:
        The same ``df`` object, with ``new_field`` added or overwritten.
    """
    if new_field is None:
        new_field = f"{field}_u"

    values = df[field].astype(str).tolist()

    # Names that exist in the column or were already handed out; a newly
    # generated name must not clash with any of them.
    taken = set(values)
    seen = set()
    # Per value, the next number to try, so long runs of repeats do not
    # rescan numbers that are known to be used.
    next_number = {}

    unique_values = []
    for value in values:
        if value not in seen:
            # First occurrence keeps its original spelling.
            seen.add(value)
            unique_values.append(value)
            continue

        number = next_number.get(value, 1)
        candidate = f"{value}{suffix}{number}"
        while candidate in taken:
            number += 1
            candidate = f"{value}{suffix}{number}"

        taken.add(candidate)
        next_number[value] = number + 1
        unique_values.append(candidate)

    # A plain list is assigned positionally, so the frame index is untouched.
    df[new_field] = unique_values
    return df
60 | |
53 adata = sc.read('input.h5') | 61 adata = sc.read('input.h5') |
54 | 62 |
55 #if $copy_adata_to_raw: | 63 #if $copy_adata_to_raw: |
56 adata.raw = adata | 64 adata.raw = adata |
57 #end if | 65 #end if |
58 | 66 |
59 gene_name = '${gene_symbols_field}' | 67 gene_name = '${gene_symbols_field}' |
60 qc_vars = list() | 68 qc_vars = list() |
61 | 69 |
62 #for $i, $s in enumerate($modifications) | 70 #for $i, $s in enumerate($modifications) |
71 #if $s.make_unique: | |
72 adata.obs = make_column_values_unique(adata.obs, field='${s.from_obs}', new_field='${s.to_obs}', suffix = "_d") | |
73 #else | |
63 adata.obs['${s.to_obs}'] = adata.obs['${s.from_obs}'] | 74 adata.obs['${s.to_obs}'] = adata.obs['${s.from_obs}'] |
75 #end if | |
64 #if not $s.keep_original: | 76 #if not $s.keep_original: |
65 del adata.obs['${s.from_obs}'] | 77 del adata.obs['${s.from_obs}'] |
66 #end if | 78 #end if |
67 #end for | 79 #end for |
68 | 80 |
69 #for $i, $s in enumerate($var_modifications) | 81 #for $i, $s in enumerate($var_modifications) |
82 #if $s.make_unique: | |
83 adata.var = make_column_values_unique(adata.var, field='${s.from_var}', new_field='${s.to_var}', suffix = "_d") | |
84 #else | |
70 adata.var['${s.to_var}'] = adata.var['${s.from_var}'] | 85 adata.var['${s.to_var}'] = adata.var['${s.from_var}'] |
86 #end if | |
71 #if not $s.keep_original: | 87 #if not $s.keep_original: |
72 del adata.var['${s.from_var}'] | 88 del adata.var['${s.from_var}'] |
73 #end if | 89 #end if |
74 #end for | 90 #end for |
75 | 91 |
82 qc_vars.append('${flag.flag}') | 98 qc_vars.append('${flag.flag}') |
83 else: | 99 else: |
84 logging.warning('No genes starting with {} found, skip calculating expression of {} genes'.format('${flag.startswith}', '${flag.flag}')) | 100 logging.warning('No genes starting with {} found, skip calculating expression of {} genes'.format('${flag.startswith}', '${flag.flag}')) |
85 #end for | 101 #end for |
86 | 102 |
#if $field_unique:
# Make the requested column unique in var and/or obs, whichever contains it.
# The result is written to a new column named after the field plus '_u'.
# sys is imported here because this section calls sys.exit and the import
# list at the top of the script does not provide it.
import sys

field_unique = '${field_unique}'
made_unique = 0
if field_unique in adata.var_keys():
    adata.var = make_column_values_unique(adata.var, field_unique, suffix = "_d")
    made_unique += 1
if field_unique in adata.obs_keys():
    adata.obs = make_column_values_unique(adata.obs, field_unique, suffix = "_d")
    made_unique += 1

# Fail when the field was found in neither table.
if made_unique == 0:
    logging.error("Specified field to be made unique is not in var or obs.")
    sys.exit(1)
#end if
117 | |
87 #if $copy_r.default and $copy_r.r_source: | 118 #if $copy_r.default and $copy_r.r_source: |
88 ad_s = sc.read('r_source.h5') | 119 ad_s = sc.read('r_source.h5') |
89 if not all(adata.obs.index.isin(ad_s.obs.index)): | 120 if not all(adata.obs.index.isin(ad_s.obs.index)): |
90 logging.error("Specified object for .raw must contain all .obs from main object.") | 121 logging.error("Specified object for .raw must contain all .obs from main object.") |
91 sys.exit(1) | 122 sys.exit(1) |
92 else: | 123 else: |
93 adata.raw = ad_s[adata.obs.index] | 124 adata.raw = ad_s[adata.obs.index] |
94 del ad_s | 125 del ad_s |
126 gc.collect() | |
95 #end if | 127 #end if |
96 | 128 |
97 #if $copy_x.default and len($copy_x.xlayers) > 0: | 129 #if $copy_x.default and len($copy_x.xlayers) > 0: |
98 #for $i, $x_s in enumerate($copy_x.xlayers): | 130 #for $i, $x_s in enumerate($copy_x.xlayers): |
99 ad_s = sc.read('x_source_${i}.h5') | 131 ad_s = sc.read('x_source_${i}.h5') |
105 adata.layers["${xs.dest}"] = ad_s.X | 137 adata.layers["${xs.dest}"] = ad_s.X |
106 else: | 138 else: |
107 logging.error("X source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") | 139 logging.error("X source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") |
108 sys.exit(1) | 140 sys.exit(1) |
109 del ad_s | 141 del ad_s |
142 gc.collect() | |
110 #end for | 143 #end for |
111 #end if | 144 #end if |
112 | 145 |
113 #if $copy_l.default and len($copy_l.layers) > 0: | 146 #if $copy_l.default and len($copy_l.layers) > 0: |
114 #for $i, $layer_s in enumerate($copy_l.layer_sources): | 147 #for $i, $layer_s in enumerate($copy_l.layer_sources): |
118 layers_to_copy = (k for k in ad_s.layers.keys() if "${l_key.contains}" in k) | 151 layers_to_copy = (k for k in ad_s.layers.keys() if "${l_key.contains}" in k) |
119 for l_to_copy in layers_to_copy: | 152 for l_to_copy in layers_to_copy: |
120 suffix='' | 153 suffix='' |
121 if l_to_copy in adata.layers: | 154 if l_to_copy in adata.layers: |
122 suffix = "_${i}" | 155 suffix = "_${i}" |
123 | |
124 adata.layers[l_to_copy+suffix] = ad_s.layers[l_to_copy] | 156 adata.layers[l_to_copy+suffix] = ad_s.layers[l_to_copy] |
125 #end for | 157 #end for |
126 else: | 158 else: |
127 logging.error("Layer source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") | 159 logging.error("Layer source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") |
128 sys.exit(1) | 160 sys.exit(1) |
129 del ad_s | 161 del ad_s |
162 gc.collect() | |
130 #end for | 163 #end for |
131 #end if | 164 #end if |
132 | 165 |
133 #if $copy_o.default and len($copy_o.obs_keys) > 0: | 166 #if $copy_o.default and len($copy_o.obs_keys) > 0: |
134 #for $i, $obs_s in enumerate($copy_o.obs_sources): | 167 #for $i, $obs_s in enumerate($copy_o.obs_sources): |
147 #end for | 180 #end for |
148 else: | 181 else: |
149 logging.error("Observation source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") | 182 logging.error("Observation source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") |
150 sys.exit(1) | 183 sys.exit(1) |
151 del ad_s | 184 del ad_s |
185 gc.collect() | |
152 #end for | 186 #end for |
153 #end if | 187 #end if |
154 | 188 |
155 | 189 |
156 #if $copy_e.default and len($copy_e.embedding_keys) > 0: | 190 #if $copy_e.default and len($copy_e.embedding_keys) > 0: |
167 #end for | 201 #end for |
168 else: | 202 else: |
169 logging.error("Embedding source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") | 203 logging.error("Embedding source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") |
170 sys.exit(1) | 204 sys.exit(1) |
171 del ad_s | 205 del ad_s |
206 gc.collect() | |
172 #end for | 207 #end for |
173 #end if | 208 #end if |
174 | 209 |
175 #if $copy_u.default and len($copy_u.uns_keys) > 0: | 210 #if $copy_u.default and len($copy_u.uns_keys) > 0: |
176 #for $i, $uns_s in enumerate($copy_u.uns_sources): | 211 #for $i, $uns_s in enumerate($copy_u.uns_sources): |
186 #end for | 221 #end for |
187 else: | 222 else: |
188 logging.error("Uns source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") | 223 logging.error("Uns source ${i} AnnData file is not compatible to be merged to main AnnData file, different cell names.") |
189 sys.exit(1) | 224 sys.exit(1) |
190 del ad_s | 225 del ad_s |
226 gc.collect() | |
191 #end for | 227 #end for |
192 #end if | 228 #end if |
193 | 229 |
194 #if $sanitize_varm: | 230 #if $sanitize_varm: |
195 if hasattr(adata, 'raw') and hasattr(adata.raw, 'X') and hasattr(adata.raw, 'var'): | 231 if hasattr(adata, 'raw') and hasattr(adata.raw, 'X') and hasattr(adata.raw, 'var'): |
222 <param name="from_obs" type="text" label="Original name" help="Name in observations that you want to change"> | 258 <param name="from_obs" type="text" label="Original name" help="Name in observations that you want to change"> |
223 <sanitizer> | 259 <sanitizer> |
224 <valid initial="string.printable"/> | 260 <valid initial="string.printable"/> |
225 </sanitizer> | 261 </sanitizer> |
226 </param> | 262 </param> |
227 <param name="to_obs" type="text" label="New name" help="New name in observations that you want to change"/> | 263 <param name="to_obs" type="text" label="New name" help="New name in observations that you want to change to"/> |
228 <param name="keep_original" type="boolean" label="Keep original" help="If activated, it will also keep the original column" checked="false"/> | 264 <param name="keep_original" type="boolean" label="Keep original" help="If activated, it will also keep the original column" checked="false"/> |
265 <param name="make_unique" type="boolean" label="Make values in the field unique" help="If activated, it will make the values in the column unique by appending '_dnum' on each repeated value." checked="false"/> | |
229 </repeat> | 266 </repeat> |
230 <repeat name="var_modifications" title="Change field names in AnnData var" min="0"> | 267 <repeat name="var_modifications" title="Change field names in AnnData var" min="0"> |
231 <param name="from_var" type="text" label="Original name" help="Name in var that you want to change"> | 268 <param name="from_var" type="text" label="Original name" help="Name in var that you want to change"> |
232 <sanitizer> | 269 <sanitizer> |
233 <valid initial="string.printable"/> | 270 <valid initial="string.printable"/> |
234 </sanitizer> | 271 </sanitizer> |
235 </param> | 272 </param> |
236 <param name="to_var" type="text" label="New name" help="New name in var that you want to change to"/> | 273 <param name="to_var" type="text" label="New name" help="New name in var that you want to change to"/> |
237 <param name="keep_original" type="boolean" label="Keep original" help="If activated, it will also keep the original column" checked="false"/> | 274 <param name="keep_original" type="boolean" label="Keep original" help="If activated, it will also keep the original column" checked="false"/> |
275 <param name="make_unique" type="boolean" label="Make values in the field unique" help="If activated, it will make the values in the column unique by appending '_dnum' on each repeated value." checked="false"/> | |
238 </repeat> | 276 </repeat> |
239 <param name="gene_symbols_field" value='index' type="text" label="Gene symbols field in AnnData" help="Field inside var.params where the gene symbols are, normally 'index' or 'gene_symbols'"/> | 277 <param name="gene_symbols_field" value='index' type="text" label="Gene symbols field in AnnData" help="Field inside var.params where the gene symbols are, normally 'index' or 'gene_symbols'"/> |
240 <repeat name="gene_flags" title="Flag genes that start with these names"> | 278 <repeat name="gene_flags" title="Flag genes that start with these names"> |
241 <param name="startswith" type="text" label="Starts with" help="Text that you expect the genes to be flagged to start with, such as 'MT-' for mito genes"/> | 279 <param name="startswith" type="text" label="Starts with" help="Text that you expect the genes to be flagged to start with, such as 'MT-' for mito genes"/> |
242 <param name="flag" type="text" label="Var name" help="Name of the column in var.names where this boolean flag is stored, for example 'mito' for mitochondrial genes."/> | 280 <param name="flag" type="text" label="Var name" help="Name of the column in var.names where this boolean flag is stored, for example 'mito' for mitochondrial genes."/> |
243 </repeat> | 281 </repeat> |
244 <param name="top_genes" label="Number of top genes" value='50' help="to calculate percentage of the flagged genes in that number of top genes. Used by sc.pp.calculate_qc_metrics (integer)." type="integer"/> | 282 <param name="top_genes" label="Number of top genes" value='50' help="to calculate percentage of the flagged genes in that number of top genes. Used by sc.pp.calculate_qc_metrics (integer)." type="integer"/> |
283 <param name="field_unique" type="text" optional="true" label="Field in var or obs to make unique" help="Field inside var or obs to be made unique by appending a suffix (useful for gene symbols in var). A new field will be added with the '_u' suffix. It happens after all the above operations."/> | |
245 <conditional name="copy_r"> | 284 <conditional name="copy_r"> |
246 <param name="default" type="boolean" checked="false" label="Copy adata.X to adata.raw"/> | 285 <param name="default" type="boolean" checked="false" label="Copy adata.X to adata.raw"/> |
247 <when value="true"> | 286 <when value="true"> |
248 <param name="r_source" type="data" label="AnnData object with .X to copy to .raw" help="Copies adata (subset to matching obs) from this AnnData object into the main input as .raw. Make sure to use an AnnData object containing all .obs in the main input." format="h5,h5ad" /> | 287 <param name="r_source" type="data" label="AnnData object with .X to copy to .raw" help="Copies adata (subset to matching obs) from this AnnData object into the main input as .raw. Make sure to use an AnnData object containing all .obs in the main input." format="h5,h5ad" /> |
249 </when> | 288 </when> |
308 </outputs> | 347 </outputs> |
309 | 348 |
310 <tests> | 349 <tests> |
311 <test> | 350 <test> |
312 <param name="input_obj_file" value="find_cluster.h5"/> | 351 <param name="input_obj_file" value="find_cluster.h5"/> |
313 <param name="input_format" value="anndata"/> | |
314 <param name="color_by" value="louvain"/> | |
315 <output name="output_h5ad" file="anndata_ops.h5" ftype="h5ad" compare="sim_size"/> | 352 <output name="output_h5ad" file="anndata_ops.h5" ftype="h5ad" compare="sim_size"/> |
316 </test> | 353 </test> |
317 <test> | 354 <test> |
318 <param name="input_obj_file" value="anndata_ops.h5"/> | 355 <param name="input_obj_file" value="anndata_ops.h5"/> |
319 <param name="from_var" value = "gene_symbols" /> | 356 <param name="from_var" value = "gene_symbols" /> |
323 <has_h5_keys keys="var/hello_all" /> | 360 <has_h5_keys keys="var/hello_all" /> |
324 </assert_contents> | 361 </assert_contents> |
325 </output> | 362 </output> |
326 </test> | 363 </test> |
327 <test> | 364 <test> |
365 <param name="input_obj_file" value="anndata_ops.h5"/> | |
366 <repeat name="var_modifications" > | |
367 <param name="from_var" value = "gene_symbols" /> | |
368 <param name="to_var" value = "gene_symbols_unique" /> | |
369 <param name="make_unique" value = "True" /> | |
370 </repeat> | |
371 <output name="output_h5ad" ftype="h5ad"> | |
372 <assert_contents> | |
373 <has_h5_keys keys="var/gene_symbols_unique" /> | |
374 </assert_contents> | |
375 </output> | |
376 </test> | |
377 <test> | |
378 <param name="input_obj_file" value="anndata_ops.h5"/> | |
379 <param name="field_unique" value = "gene_symbols" /> | |
380 <output name="output_h5ad" ftype="h5ad"> | |
381 <assert_contents> | |
382 <has_h5_keys keys="var/gene_symbols_u" /> | |
383 </assert_contents> | |
384 </output> | |
385 </test> | |
386 <test> | |
328 <param name="input_obj_file" value="find_cluster.h5"/> | 387 <param name="input_obj_file" value="find_cluster.h5"/> |
329 <param name="input_format" value="anndata"/> | |
330 <conditional name="copy_r"> | 388 <conditional name="copy_r"> |
331 <param name="default" value="true"/> | 389 <param name="default" value="true"/> |
332 <param name="r_source" value="read_10x.h5"/> | 390 <param name="r_source" value="read_10x.h5"/> |
333 </conditional> | 391 </conditional> |
334 <output name="output_h5ad" file="anndata_ops_raw.h5" ftype="h5ad" compare="sim_size"> | 392 <output name="output_h5ad" file="anndata_ops_raw.h5" ftype="h5ad" compare="sim_size"> |
335 <assert_contents> | 393 <assert_contents> |
336 <has_h5_keys keys="raw/X" /> | 394 <has_h5_keys keys="raw/X" /> |
337 </assert_contents> | 395 </assert_contents> |
338 </output> | 396 </output> |
339 </test> | 397 </test> |
340 <test> | 398 <test> |
341 <param name="input_obj_file" value="normalise_data.h5"/> | 399 <param name="input_obj_file" value="normalise_data.h5"/> |
342 <param name="input_format" value="anndata"/> | |
343 <conditional name="copy_x"> | 400 <conditional name="copy_x"> |
344 <param name="default" value="true"/> | 401 <param name="default" value="true"/> |
345 <repeat name="xlayers"> | 402 <repeat name="xlayers"> |
346 <param name="x_source" value='filter_genes.h5'/> | 403 <param name="x_source" value='filter_genes.h5'/> |
347 <param name="dest" value='filtered'/> | 404 <param name="dest" value='filtered'/> |
348 </repeat> | 405 </repeat> |
349 </conditional> | 406 </conditional> |
350 <output name="output_h5ad" file="anndata_ops_xlayer.h5" ftype="h5ad" compare="sim_size"> | 407 <output name="output_h5ad" file="anndata_ops_xlayer.h5" ftype="h5ad" compare="sim_size"> |
351 <assert_contents> | 408 <assert_contents> |
352 <has_h5_keys keys="layers/filtered" /> | 409 <has_h5_keys keys="layers/filtered" /> |
353 </assert_contents> | 410 </assert_contents> |
354 </output> | 411 </output> |
355 </test> | 412 </test> |
356 <test> | 413 <test> |
357 <param name="input_obj_file" value="find_cluster.h5"/> | 414 <param name="input_obj_file" value="find_cluster.h5"/> |
358 <param name="input_format" value="anndata"/> | |
359 <conditional name="copy_l"> | 415 <conditional name="copy_l"> |
360 <param name="default" value="true"/> | 416 <param name="default" value="true"/> |
361 <repeat name="layers"> | 417 <repeat name="layers"> |
362 <param name="contains" value='filtered'/> | 418 <param name="contains" value='filtered'/> |
363 </repeat> | 419 </repeat> |
364 <param name="layer_sources" value='anndata_ops_xlayer.h5'/> | 420 <param name="layer_sources" value='anndata_ops_xlayer.h5'/> |
365 </conditional> | 421 </conditional> |
366 <output name="output_h5ad" file="anndata_ops_layer.h5" ftype="h5ad" compare="sim_size"> | 422 <output name="output_h5ad" file="anndata_ops_layer.h5" ftype="h5ad" compare="sim_size"> |
367 <assert_contents> | 423 <assert_contents> |
368 <has_h5_keys keys="layers/filtered" /> | 424 <has_h5_keys keys="layers/filtered" /> |
369 </assert_contents> | 425 </assert_contents> |
370 </output> | 426 </output> |
371 </test> | 427 </test> |
372 </tests> | 428 </tests> |
373 | 429 |
374 <help><![CDATA[ | 430 <help><![CDATA[ |
376 Operations on AnnData objects | 432 Operations on AnnData objects |
377 ============================= | 433 ============================= |
378 | 434 |
379 Performs the following operations: | 435 Performs the following operations: |
380 | 436 |
381 * Change observation/var fields, mostly for the convenience of downstream processes. Multiple fields can be changed as one. | 437 * Change observation/var fields, mostly for the convenience of downstream processes. Multiple fields can be changed at once. |
382 * Flag genes that start with a certain text: useful for flagging mitochondrial, spikes or other groups of genes. | 438 * Flag genes that start with a certain text: useful for flagging mitochondrial, spikes or other groups of genes. |
383 * For the flags created, calculates qc metrics (pct_<flag>_counts). | 439 * For the flags created, calculates qc metrics (pct_<flag>_counts). |
384 * Calculates `n_genes`, `n_counts` for cells and `n_cells`, `n_counts` for genes. | 440 * Calculates `n_genes`, `n_counts` for cells and `n_cells`, `n_counts` for genes. |
385 * For top <N> genes specified, calculate qc metrics (pct_counts_in_top_<N>_genes). | 441 * For top <N> genes specified, calculate qc metrics (pct_counts_in_top_<N>_genes). |
442 * Make a specified column of var or obs unique (normally useful for gene symbols). | |
386 * Copy from a set of compatible AnnData objects (same cells and genes): | 443 * Copy from a set of compatible AnnData objects (same cells and genes): |
387 * Observations, such as clustering results. | 444 * Observations, such as clustering results. |
388 * Embeddings, such as tSNE or UMAPs. | 445 * Embeddings, such as tSNE or UMAPs. |
389 * Unstructured annotations, like gene markers. | 446 * Unstructured annotations, like gene markers. |
390 | 447 |
391 This functionality will probably be added in the future to a larger package. | 448 This functionality will probably be added in the future to a larger package. |
392 | 449 |
393 History | 450 History |
394 ------- | 451 ------- |
452 1.8.1+galaxy10: Adds field to be made unique in obs or var. | |
395 | 453 |
396 1.6.0+galaxy0: Moves to Scanpy Scripts 0.3.0 (Scanpy 1.6.0), versioning switched to track Scanpy as other tools. | 454 1.6.0+galaxy0: Moves to Scanpy Scripts 0.3.0 (Scanpy 1.6.0), versioning switched to track Scanpy as other tools. |
397 | 455 |
398 0.0.3+galaxy0: Adds ability to merge AnnData objects (Scanpy 1.4.3). | 456 0.0.3+galaxy0: Adds ability to merge AnnData objects (Scanpy 1.4.3). |
399 ]]></help> | 457 ]]></help> |