Mercurial > repos > petr-novak > re_utils
diff cluster_table2krona_format.py @ 19:2f1b5d5c5dd5 draft
Uploaded
author | petr-novak |
---|---|
date | Tue, 18 May 2021 11:03:57 +0000 |
parents | d14b68e9fd1d |
children |
line wrap: on
line diff
--- a/cluster_table2krona_format.py Fri May 14 11:08:46 2021 +0000
+++ b/cluster_table2krona_format.py Tue May 18 11:03:57 2021 +0000
@@ -3,6 +3,7 @@
 import re
 from collections import defaultdict
 import argparse
+import csv
 parser = argparse.ArgumentParser()
 parser.add_argument("-i" ,"--input", type=argparse.FileType('r'), help="path to file CLUSTER_table.csv")
@@ -12,26 +13,35 @@
 args = parser.parse_args()
 column = 6 if args.use_manual else 4
-
+if args.use_manual:
+    annotation="Final_annotation"
+else:
+    annotation="Automatic_annotation"
 header = False
 clust_info = {}
 counts = defaultdict(lambda: 0)
 top_clusters = 0
 with open(args.input.name, 'r') as f:
-    for l in f:
-        parts = l.split()
-        if re.match('.*Cluster.+Supercluster.+Size.+Size_adjusted.+Automatic_annotation.+TAREAN_annotation.+Final_annotation', l):
-            print("header detected")
+    csv_reader = csv.reader(f, delimiter = "\t")
+    for parts in csv_reader:
+        if len(parts) == 0:
+            continue
+        if parts[0] == "Cluster" and parts[1]== "Supercluster":
             header = True
+            header_columns = parts
+            column = header_columns.index(annotation)
             continue
         if header:
             classification = "Top_clusters\t" + "\t".join(parts[column].split("/")[1:]).replace('"','')
             counts[classification] += int(parts[3])
             top_clusters += int(parts[3])
+        elif len(parts) >= 2:
+            try:
+                clust_info[parts[0].replace('"', '')] = int(parts[1])
+            except ValueError:
+                pass
-        elif len(parts) >= 2:
-            clust_info[parts[0].replace('"', '')] = int(parts[1])
 counts['Singlets'] = clust_info['Number_of_singlets']
 counts['Small_cluster'] = int(clust_info['Number_of_reads_in_clusters']) - top_clusters