Mercurial > repos > petr-novak > re_utils
comparison cluster_table2krona_format.py @ 19:2f1b5d5c5dd5 draft
Uploaded
author | petr-novak |
---|---|
date | Tue, 18 May 2021 11:03:57 +0000 |
parents | d14b68e9fd1d |
children |
comparison
equal
deleted
inserted
replaced
18:d7f3eff34c27 | 19:2f1b5d5c5dd5 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 import sys | 2 import sys |
3 import re | 3 import re |
4 from collections import defaultdict | 4 from collections import defaultdict |
5 import argparse | 5 import argparse |
| | 6 import csv |
6 | 7 |
7 parser = argparse.ArgumentParser() | 8 parser = argparse.ArgumentParser() |
8 parser.add_argument("-i" ,"--input", type=argparse.FileType('r'), help="path to file CLUSTER_table.csv") | 9 parser.add_argument("-i" ,"--input", type=argparse.FileType('r'), help="path to file CLUSTER_table.csv") |
9 parser.add_argument("-o" ,"--output", type=argparse.FileType('w'), help="output file name") | 10 parser.add_argument("-o" ,"--output", type=argparse.FileType('w'), help="output file name") |
10 parser.add_argument("-m", "--use_manual", action='store_true', default=False) | 11 parser.add_argument("-m", "--use_manual", action='store_true', default=False) |
11 | 12 |
12 args = parser.parse_args() | 13 args = parser.parse_args() |
13 | 14 |
14 column = 6 if args.use_manual else 4 | 15 column = 6 if args.use_manual else 4 |
15 | 16 if args.use_manual: |
| | 17 annotation="Final_annotation" |
| | 18 else: |
| | 19 annotation="Automatic_annotation" |
16 | 20 |
17 header = False | 21 header = False |
18 clust_info = {} | 22 clust_info = {} |
19 counts = defaultdict(lambda: 0) | 23 counts = defaultdict(lambda: 0) |
20 top_clusters = 0 | 24 top_clusters = 0 |
21 with open(args.input.name, 'r') as f: | 25 with open(args.input.name, 'r') as f: |
22 for l in f: | 26 csv_reader = csv.reader(f, delimiter = "\t") |
23 parts = l.split() | 27 for parts in csv_reader: |
24 if re.match('.*Cluster.+Supercluster.+Size.+Size_adjusted.+Automatic_annotation.+TAREAN_annotation.+Final_annotation', l): | 28 if len(parts) == 0: |
25 print("header detected") | 29 continue |
| | 30 if parts[0] == "Cluster" and parts[1]== "Supercluster": |
26 header = True | 31 header = True |
| | 32 header_columns = parts |
| | 33 column = header_columns.index(annotation) |
27 continue | 34 continue |
28 if header: | 35 if header: |
29 classification = "Top_clusters\t" + "\t".join(parts[column].split("/")[1:]).replace('"','') | 36 classification = "Top_clusters\t" + "\t".join(parts[column].split("/")[1:]).replace('"','') |
30 counts[classification] += int(parts[3]) | 37 counts[classification] += int(parts[3]) |
31 top_clusters += int(parts[3]) | 38 top_clusters += int(parts[3]) |
| | 39 elif len(parts) >= 2: |
| | 40 try: |
| | 41 clust_info[parts[0].replace('"', '')] = int(parts[1]) |
| | 42 except ValueError: |
| | 43 pass |
32 | 44 |
33 elif len(parts) >= 2: | |
34 clust_info[parts[0].replace('"', '')] = int(parts[1]) | |
35 | 45 |
36 counts['Singlets'] = clust_info['Number_of_singlets'] | 46 counts['Singlets'] = clust_info['Number_of_singlets'] |
37 counts['Small_cluster'] = int(clust_info['Number_of_reads_in_clusters']) - top_clusters | 47 counts['Small_cluster'] = int(clust_info['Number_of_reads_in_clusters']) - top_clusters |
38 | 48 |
39 with open(args.output.name, 'w') as fout: | 49 with open(args.output.name, 'w') as fout: |