comparison fasta-stats.py @ 4:0dbb995c7d35 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit 50f5cce5a8c11001e2c59600a2b99a4243b6d06f"
author iuc
date Thu, 18 Nov 2021 20:56:57 +0000
parents
children
comparison
equal deleted inserted replaced
3:56022eb50bbd 4:0dbb995c7d35
1 #!/usr/bin/env python
2
3
4 # python version of fasta-stats with some extra features
5 # written by anmol.kiran@gmail.com
6 # git: @codemeleon
7 # date: 10/11/2021
8
9 import argparse
10 import re
11 from os import path
12
13 import numpy as np
14 from Bio import SeqIO
15
16
17 def calculate_NG50(estimated_genome, total_length, sequence_lengths):
18 temp = 0
19 teoretical_NG50 = estimated_genome / 2.0
20 NG50 = 0
21 for seq in sequence_lengths:
22 temp += seq
23 if teoretical_NG50 < temp:
24 NG50 = seq
25 break
26 return NG50
27
28
29 def run(fasta, stats_output, gaps_output, genome_size):
30 """Generates scaffold statistics."""
31 if not fasta:
32 exit("Input file not given.")
33 if not path.isfile(fasta):
34 exit(f"{fasta} path does not exist.")
35
36 seq_len = {}
37 bases_global = {"A": 0, "N": 0, "T": 0, "C": 0, "G": 0}
38 bases_seq = {}
39 seq_id_Ngaprange = {}
40 nstart = 0
41 contigs_len = []
42 gap_count = 0
43 for seq_record in SeqIO.parse(fasta, "fasta"):
44 seq = str(seq_record.seq).upper()
45 # print(len(seq))
46 seq_len[seq_record.id] = len(seq)
47
48 # NOTE: Nucleotide count
49 bases_seq[seq_record.id] = {
50 "A": seq.count("A"),
51 "N": seq.count("N"),
52 "T": seq.count("T"),
53 "C": seq.count("C"),
54 "G": seq.count("G"),
55 }
56 bases_global["A"] += bases_seq[seq_record.id]["A"]
57 bases_global["N"] += bases_seq[seq_record.id]["N"]
58 bases_global["T"] += bases_seq[seq_record.id]["T"]
59 bases_global["C"] += bases_seq[seq_record.id]["C"]
60 bases_global["G"] += bases_seq[seq_record.id]["G"]
61
62 # NOTE: Gap count and their range
63 range_gen = re.finditer("N+", seq)
64 n_range = [match.span() for match in range_gen]
65 for n_rng in n_range:
66 if n_rng[0] == 0 or n_rng[1] == seq_len[seq_record.id]:
67 continue
68 else:
69 gap_count += 1
70
71 # NOTE: Contigs, their lenths from scaffold and their N gap range
72 seq_id_Ngaprange[seq_record.id] = n_range
73 n_range_len = len(n_range)
74 if n_range_len > 0:
75 n_range = (
76 [(0, 0)] + n_range + [(seq_len[seq_record.id], seq_len[seq_record.id])]
77 )
78 for idx in range(n_range_len + 1):
79 nstart = n_range[idx][1]
80 nend = n_range[idx + 1][0]
81 con_len = nend - nstart
82 if con_len:
83 contigs_len.append(con_len)
84 else:
85 contigs_len.append(len(seq))
86
87 # NOTE: Scaffold statistics
88 SEQ_LEN_LIST = sorted(seq_len.values(), reverse=True)
89 scaffold_lens = np.array(SEQ_LEN_LIST)
90 scaffold_lens_sum = np.cumsum(scaffold_lens)
91 N50_len = scaffold_lens_sum[-1] * 0.5
92 N50_idx = np.where(scaffold_lens_sum > N50_len)[0][0]
93 N90_len = scaffold_lens_sum[-1] * 0.9
94 N90_idx = np.where(scaffold_lens_sum > N90_len)[0][0]
95 NG50 = calculate_NG50(genome_size, scaffold_lens_sum[-1], scaffold_lens)
96
97 # NOTE: Contig statistics
98 seq_len_list = sorted(contigs_len, reverse=True)
99 contigs_len = np.array(seq_len_list)
100 contigs_len_sum = np.cumsum(contigs_len)
101 n50_len = contigs_len_sum[-1] * 0.5
102 n50_idx = np.where(contigs_len_sum > n50_len)[0][0]
103 n90_len = contigs_len_sum[-1] * 0.9
104 n90_idx = np.where(contigs_len_sum > n90_len)[0][0]
105 ng50 = calculate_NG50(genome_size, contigs_len_sum[-1], contigs_len)
106
107 with open(stats_output, "w") as soutput:
108 soutput.write("{}\t{}\n".format("Scaffold L50", N50_idx + 1))
109 soutput.write("{}\t{}\n".format("Scaffold N50", SEQ_LEN_LIST[N50_idx]))
110 soutput.write("{}\t{}\n".format("Scaffold L90", N90_idx + 1))
111 soutput.write("{}\t{}\n".format("Scaffold N90", SEQ_LEN_LIST[N90_idx]))
112 if genome_size != 0:
113 soutput.write("{}\t{}\n".format("Scaffold NG50", NG50))
114 soutput.write("{}\t{}\n".format("Scaffold len_max", SEQ_LEN_LIST[0]))
115 soutput.write("{}\t{}\n".format("Scaffold len_min", SEQ_LEN_LIST[-1]))
116 soutput.write(
117 "{}\t{}\n".format("Scaffold len_mean", int(np.mean(SEQ_LEN_LIST)))
118 )
119 soutput.write(
120 "{}\t{}\n".format("Scaffold len_median", int(np.median(SEQ_LEN_LIST)))
121 )
122 soutput.write("{}\t{}\n".format("Scaffold len_std", int(np.std(SEQ_LEN_LIST))))
123 soutput.write("{}\t{}\n".format("Scaffold num_A", bases_global["A"]))
124 soutput.write("{}\t{}\n".format("Scaffold num_T", bases_global["T"]))
125 soutput.write("{}\t{}\n".format("Scaffold num_C", bases_global["C"]))
126 soutput.write("{}\t{}\n".format("Scaffold num_G", bases_global["G"]))
127 soutput.write("{}\t{}\n".format("Scaffold num_N", bases_global["N"]))
128 soutput.write("{}\t{}\n".format("Scaffold num_bp", scaffold_lens_sum[-1]))
129 soutput.write(
130 "{}\t{}\n".format(
131 "Scaffold num_bp_not_N", scaffold_lens_sum[-1] - bases_global["N"]
132 )
133 )
134 soutput.write("{}\t{}\n".format("Scaffold num_seq", len(SEQ_LEN_LIST)))
135 soutput.write(
136 "{}\t{:.2f}\n".format(
137 "Scaffold GC content overall",
138 (
139 (bases_global["G"] + bases_global["C"])
140 * 100.0
141 / scaffold_lens_sum[-1]
142 ),
143 )
144 )
145
146 soutput.write("{}\t{}\n".format("Contig L50", n50_idx + 1))
147 soutput.write("{}\t{}\n".format("Contig N50", seq_len_list[n50_idx]))
148 soutput.write("{}\t{}\n".format("Contig L90", n90_idx + 1))
149 soutput.write("{}\t{}\n".format("Contig N90", seq_len_list[n90_idx]))
150 if genome_size != 0:
151 soutput.write("{}\t{}\n".format("Contig NG50", ng50))
152 soutput.write("{}\t{}\n".format("Contig len_max", seq_len_list[0]))
153 soutput.write("{}\t{}\n".format("Contig len_min", seq_len_list[-1]))
154 soutput.write("{}\t{}\n".format("Contig len_mean", int(np.mean(seq_len_list))))
155 soutput.write(
156 "{}\t{}\n".format("Contig len_median", int(np.median(seq_len_list)))
157 )
158 soutput.write("{}\t{}\n".format("Contig len_std", int(np.std(seq_len_list))))
159 soutput.write("{}\t{}\n".format("Contig num_bp", contigs_len_sum[-1]))
160 soutput.write("{}\t{}\n".format("Contig num_seq", len(contigs_len_sum)))
161 soutput.write("{}\t{}\n".format("Number of gaps", gap_count))
162 if gaps_output is not None:
163 # NOTE: generate gaps statistics file
164 with open(gaps_output, "w") as goutput:
165 for key in seq_id_Ngaprange:
166 for rng in seq_id_Ngaprange[key]:
167 goutput.write("{}\t{}\t{}\n".format(key, rng[0], rng[1]))
168
169
170 if __name__ == "__main__":
171 parser = argparse.ArgumentParser()
172 parser.add_argument("-f", "--fasta", required=True, help="FASTA file")
173 parser.add_argument(
174 "-z",
175 "--genome_size",
176 required=False,
177 type=int,
178 help="If provided, the NG50 statistic will be computed",
179 default=0,
180 )
181 parser.add_argument(
182 "-s",
183 "--stats_output",
184 required=True,
185 help="File to store the general statistics",
186 )
187 parser.add_argument(
188 "-r",
189 "--gaps_output",
190 required=False,
191 help="File to store the gaps statistics",
192 default=None,
193 )
194 args = parser.parse_args()
195
196 run(
197 args.fasta,
198 args.stats_output,
199 args.gaps_output,
200 args.genome_size,
201 )