comparison: extract_fasta_bins.py @ 0:7e01297a3b4a (draft)
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
| author | iuc |
|---|---|
| date | Sun, 13 Mar 2022 08:45:32 +0000 |
| parents | |
| children | |
comparison
| -1:000000000000 (parent) | 0:7e01297a3b4a (this revision) |
|---|---|
```python
#!/usr/bin/env python

import argparse
import gzip
import os
import sys
from collections import defaultdict
from functools import partial

import pandas as pd
from Bio import SeqIO

parser = argparse.ArgumentParser(description=__doc__)

parser.add_argument('--gzipped', action='store_true', dest='gzipped', help='Input files are gzipped')
parser.add_argument("--input_fasta", action="store", dest="input_fasta", help="Input Fasta file")
parser.add_argument("--input_cluster", action="store", dest="input_cluster", help="Concoct output cluster file")
parser.add_argument("--output_path", help="Output directory")

args = parser.parse_args()

all_seqs = {}
if args.gzipped:
    _open = partial(gzip.open, mode='rt')
else:
    _open = open

with _open(args.input_fasta) as fh:
    for seq in SeqIO.parse(fh, "fasta"):
        all_seqs[seq.id] = seq

# Make sure we're reading the file as tabular!
df = pd.read_csv(args.input_cluster, sep='\t')
try:
    assert df.columns[0] == 'contig_id'
    assert df.columns[1] == 'cluster_id'
except AssertionError:
    sys.stderr.write("ERROR! Header line was not 'contig_id, cluster_id', please adjust your input file. Exiting!\n")
    sys.exit(-1)

cluster_to_contigs = defaultdict(list)
for i, row in df.iterrows():
    cluster_to_contigs[row['cluster_id']].append(row['contig_id'])

for cluster_id, contig_ids in cluster_to_contigs.items():
    output_file = os.path.join(args.output_path, "{0}.fa".format(cluster_id))
    seqs = [all_seqs[contig_id] for contig_id in contig_ids]
    with open(output_file, 'w') as ofh:
        SeqIO.write(seqs, ofh, 'fasta')
```
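
As a usage note: the script reads an (optionally gzipped) assembly FASTA and a tab-separated CONCOCT clustering table with the header `contig_id` / `cluster_id`, then writes one `<cluster_id>.fa` file per bin into `--output_path`. Below is a minimal, hypothetical smoke test of that behaviour; the file names (`contigs.fasta`, `clustering.tsv`, `bins/`) and the script location (`extract_fasta_bins.py` in the working directory) are assumptions for illustration only, and pandas plus Biopython must be installed for the script itself to run.

```python
# Hypothetical smoke test for the script shown above. All paths here are
# illustrative assumptions; the script's own dependencies (pandas, Biopython)
# must be available in the environment.
import os
import subprocess
import sys

os.makedirs("bins", exist_ok=True)

# Toy assembly with two contigs.
with open("contigs.fasta", "w") as fh:
    fh.write(">contig_1\nACGTACGTACGT\n>contig_2\nTTGGCCAATTGG\n")

# Clustering table: tab-separated, with the exact header
# ('contig_id', 'cluster_id') that the script asserts.
with open("clustering.tsv", "w") as fh:
    fh.write("contig_id\tcluster_id\n")
    fh.write("contig_1\t0\n")
    fh.write("contig_2\t1\n")

subprocess.run(
    [
        sys.executable, "extract_fasta_bins.py",
        "--input_fasta", "contigs.fasta",
        "--input_cluster", "clustering.tsv",
        "--output_path", "bins",
    ],
    check=True,
)
# Expected outcome: bins/0.fa holds contig_1 and bins/1.fa holds contig_2.
```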
