Mercurial > repos > ebi-gxa > salmon_kallisto_mtx_to_10x
annotate salmonKallistoMtxTo10x.py @ 2:40f7a3d18cf4 draft
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 3fbbd818012005da4513271beda50df98d3c1cde-dirty
author | ebi-gxa |
---|---|
date | Fri, 08 Nov 2019 08:32:04 -0500 |
parents | fe0fd27aba50 |
children | 60fa6080f86f |
rev | line source |
---|---|
0
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
1 #!/usr/bin/env python |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
2 |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
3 # Alevin and Kallisto currently output MTX files and gene labels in a manner |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
4 # inconsistent with the old-style 10X conventions. In both cases the matrix |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
5 # must be transposed, and gene identifier columns duplicated |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
6 |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
7 from __future__ import print_function |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
8 from collections import defaultdict |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
9 from struct import Struct |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
10 import pandas as pd |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
11 import gzip |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
12 import sys |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
13 import os |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
14 from scipy.io import mmread,mmwrite |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
15 from scipy.sparse import * |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
16 from shutil import copyfile |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
17 import pathlib |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
18 import numpy as np |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
19 import argparse |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
20 |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
21 parser = argparse.ArgumentParser(description='Convert Alevin or Kallisto MTX outputs to 10X .mtx.') |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
22 parser.add_argument('mtx', help = 'MTX-format matrix file') |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
23 parser.add_argument('genes', help = 'Gene names text file') |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
24 parser.add_argument('barcodes', help = 'Barcodes file') |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
25 parser.add_argument('mtx_out', help = 'Output directory for converted results') |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
26 parser.add_argument('--cell_prefix', dest='cell_prefix', default='', help = 'Prefix to apply to cell barcodes') |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
27 args = parser.parse_args() |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
28 |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
29 quant_file=args.mtx |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
30 cb_file=args.barcodes |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
31 gene_file=args.genes |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
32 mtx_out=args.mtx_out |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
33 cell_prefix=args.cell_prefix |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
34 |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
35 if not os.path.exists(quant_file): |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
36 print("quant file {} doesn't exist".format( quant_file )) |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
37 sys.exit(1) |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
38 |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
39 if not os.path.exists(cb_file): |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
40 print("cell barcodes file: {} doesn't exist".format( cb_file )) |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
41 sys.exit(1) |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
42 |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
43 if not os.path.exists(gene_file): |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
44 print("genes file: {} doesn't exist".format( gene_file)) |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
45 sys.exit(1) |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
46 |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
47 # Read gene and cell labels, apply cell prefix |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
48 |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
49 cb_names = [cell_prefix + s for s in pd.read_csv(cb_file, header=None)[0].values] |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
50 gene_names = pd.read_csv(gene_file, header=None)[0].values |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
51 umi_counts = mmread( quant_file ) |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
52 |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
53 # Write outputs to a .mtx file readable by tools expecting 10X outputs. |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
54 # Barcodes are written one per line (with any cell prefix applied); genes need to be two-column, duplicating the |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
55 # identifiers. Matrix itself needs to have genes by row, so we transpose. |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
56 |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
57 pathlib.Path(mtx_out).mkdir(parents=True, exist_ok=True) |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
58 mmwrite('%s/matrix.mtx' % mtx_out, umi_counts.transpose()) |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
59 |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
60 genes_frame = pd.DataFrame([ gene_names, gene_names]).transpose() |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
61 genes_frame.to_csv(path_or_buf='%s/genes.tsv' % mtx_out, index=False, sep="\t", header = False) |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
62 |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
63 with open('%s/barcodes.tsv' % mtx_out, 'w') as f: |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
64 f.write("\n".join(cb_names)) |
fe0fd27aba50
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/salmon-kallisto-mtx-to-10x/.shed.yml commit 023431ca119829efbde33c94d54e051fac24a1d5
ebi-gxa
parents:
diff
changeset
|
65 f.write("\n") |