Mercurial > repos > iuc > concoct
annotate extract_fasta_bins.py @ 0:06c0eb033025 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
author | iuc |
---|---|
date | Fri, 18 Feb 2022 14:18:11 +0000 |
parents | |
children |
rev | line source |
---|---|
0
06c0eb033025
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff
changeset
|
1 #!/usr/bin/env python |
06c0eb033025
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff
changeset
|
2 |
06c0eb033025
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff
changeset
|
3 import argparse |
06c0eb033025
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff
changeset
|
4 import gzip |
06c0eb033025
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff
changeset
|
5 import os |
06c0eb033025
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff
changeset
|
6 import sys |
06c0eb033025
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff
changeset
|
7 from collections import defaultdict |
06c0eb033025
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff
changeset
|
8 from functools import partial |
06c0eb033025
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff
changeset
|
9 |
06c0eb033025
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff
changeset
|
10 import pandas as pd |
06c0eb033025
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff
changeset
|
11 from Bio import SeqIO |
06c0eb033025
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff
changeset
|
12 |
06c0eb033025
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 49b42f61ff37c3c33dd15c195e5705e1db066c37"
iuc
parents:
diff
changeset
|
# Message emitted when the cluster table does not start with the two
# expected columns. Kept verbatim so existing log scrapers keep matching.
HEADER_ERROR = "ERROR! Header line was not 'contig_id, cluster_id', please adjust your input file. Exiting!\n"


class InputError(Exception):
    """Raised when an input file is malformed or inconsistent with the other input."""


def build_parser():
    """Return the command-line parser for this script.

    The three path options are declared required: the script cannot do
    anything useful without them, and omitting one previously surfaced as a
    ``TypeError`` traceback instead of a usage message.
    """
    parser = argparse.ArgumentParser(
        # The module has no docstring, so fall back to an explicit description.
        description=__doc__ or "Write one FASTA file per cluster listed in a Concoct cluster file.",
    )
    parser.add_argument('--gzipped', action='store_true', dest='gzipped', help='Input files are gzipped')
    parser.add_argument("--input_fasta", action="store", dest="input_fasta", required=True, help="Input Fasta file")
    parser.add_argument("--input_cluster", action="store", dest="input_cluster", required=True, help="Concoct output cluster file")
    parser.add_argument("--output_path", required=True, help="Output directory")
    return parser


def read_sequences(fasta_path, gzipped=False):
    """Parse *fasta_path* and return a dict mapping record id to record.

    When *gzipped* is true the file is opened through ``gzip`` in text mode,
    otherwise with the builtin ``open``. If an id occurs more than once the
    last record wins.
    """
    opener = partial(gzip.open, mode='rt') if gzipped else open
    with opener(fasta_path) as fh:
        return {record.id: record for record in SeqIO.parse(fh, "fasta")}


def read_cluster_table(cluster_path):
    """Read the tab-separated cluster table and group contig ids by cluster id.

    Returns a ``defaultdict(list)`` mapping cluster id to the list of contig
    ids, both as strings, in file order.

    Raises:
        InputError: if the first two columns are not ``contig_id`` and
            ``cluster_id`` (this also covers files with fewer than two
            columns, e.g. a comma-separated file read as tabular).
    """
    # Make sure we're reading the file as tabular!
    # dtype=str stops pandas from turning purely numeric contig ids into
    # ints, which would never match the string ids of the FASTA records.
    df = pd.read_csv(cluster_path, sep='\t', dtype=str)
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if list(df.columns[:2]) != ['contig_id', 'cluster_id']:
        raise InputError(HEADER_ERROR)

    cluster_to_contigs = defaultdict(list)
    for contig_id, cluster_id in zip(df['contig_id'], df['cluster_id']):
        cluster_to_contigs[cluster_id].append(contig_id)
    return cluster_to_contigs


def write_bins(cluster_to_contigs, all_seqs, output_path):
    """Write ``<cluster_id>.fa`` into *output_path* for every cluster.

    Every contig id is validated against *all_seqs* before anything is
    written, so a bad cluster table does not leave a partial set of bins.
    *output_path* is created if it does not exist yet.

    Raises:
        InputError: if the cluster table references contigs that are not
            present in *all_seqs*.
    """
    missing = [
        contig_id
        for contig_ids in cluster_to_contigs.values()
        for contig_id in contig_ids
        if contig_id not in all_seqs
    ]
    if missing:
        preview = ", ".join(str(contig_id) for contig_id in missing[:5])
        raise InputError(
            "ERROR! {0} contig(s) in the cluster file are missing from the input Fasta file "
            "(first: {1}). Exiting!\n".format(len(missing), preview)
        )

    os.makedirs(output_path, exist_ok=True)
    for cluster_id, contig_ids in cluster_to_contigs.items():
        output_file = os.path.join(output_path, "{0}.fa".format(cluster_id))
        seqs = [all_seqs[contig_id] for contig_id in contig_ids]
        with open(output_file, 'w') as ofh:
            SeqIO.write(seqs, ofh, 'fasta')


def main(argv=None):
    """Script entry point: parse arguments, load both inputs, write the bins.

    On an ``InputError`` the message is written to stderr and the process
    exits with status -1, as the original script did for a bad header.
    """
    args = build_parser().parse_args(argv)
    try:
        all_seqs = read_sequences(args.input_fasta, args.gzipped)
        cluster_to_contigs = read_cluster_table(args.input_cluster)
        write_bins(cluster_to_contigs, all_seqs, args.output_path)
    except InputError as err:
        sys.stderr.write(str(err))
        sys.exit(-1)


if __name__ == "__main__":
    main()