Mercurial > repos > dcouvin > mirureader
comparison MIRUReader/MIRUReader.py @ 0:f0e3646a4e45 draft
Uploaded
author | dcouvin |
---|---|
date | Tue, 17 Aug 2021 19:15:15 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f0e3646a4e45 |
---|---|
1 #!/usr/bin/python3 | |
2 | |
3 #Copyright 2019 NUS pathogen genomics | |
4 #Written by Cheng Yee Tang (chengyee.tang@nus.edu.sg) | |
5 #former python tag !/usr/bin/env python | |
6 | |
7 import os | |
8 import sys | |
9 import gzip | |
10 import argparse | |
11 import pandas as pd | |
12 import statistics | |
13 import subprocess | |
14 from statistics import mode | |
15 from collections import Counter | |
16 | |
17 | |
18 #function to determine repeat number based on total number of mismatches in primer sequence | |
19 def chooseMode(name, table, CounterList): | |
20 maxcount = max(CounterList.values()) | |
21 repeatToCheck = [] | |
22 for k, v in CounterList.items(): | |
23 if v == maxcount: | |
24 repeatToCheck.append(k) | |
25 x = 0 | |
26 for i, j in table.items(): | |
27 if name in i: | |
28 x += 1 | |
29 mismatchDict = {} | |
30 for rp in repeatToCheck: | |
31 mismatchDict[rp] = 0 | |
32 for i in range(x): | |
33 string = name + '_' + str(i+1) | |
34 if table[string][1] in repeatToCheck: | |
35 mismatchDict[table[string][1]] += table[string][0] | |
36 checklist2 = [] | |
37 for m, n in mismatchDict.items(): | |
38 checklist2.append(n) | |
39 duplicates = '' | |
40 for item in checklist2: | |
41 if checklist2.count(item) > 1: | |
42 duplicates = 'yes' | |
43 finalMode = '' | |
44 if duplicates == 'yes': | |
45 finalMode = '/'.join(str(r) for min_value in (min(mismatchDict.values()),) for r in mismatchDict if mismatchDict[r]==min_value) | |
46 else: | |
47 finalMode = min(mismatchDict.keys(), key=(lambda k: mismatchDict[k])) | |
48 return finalMode | |
49 | |
50 ''' | |
51 Main function | |
52 ''' | |
53 | |
54 script_dir = os.path.dirname(os.path.abspath(sys.argv[0])) | |
55 MIRU_table = script_dir + "/MIRU_table" | |
56 MIRU_table_0580 = script_dir + "/MIRU_table_0580" | |
57 MIRU_primers = script_dir + "/MIRU_primers" | |
58 | |
59 parser = argparse.ArgumentParser() | |
60 main_group = parser.add_argument_group('Main options') | |
61 main_group.add_argument('-r', '--reads', required=True, help='input reads file in fastq/fasta format (required)') | |
62 main_group.add_argument('-p', '--prefix', required=True, help='sample ID (required)') | |
63 main_group.add_argument('--table', type=str, default=MIRU_table, help='allele calling table') | |
64 main_group.add_argument('--primers', type=str, default=MIRU_primers, help='primers sequences') | |
65 optional_group = parser.add_argument_group('Optional options') | |
66 optional_group.add_argument('--amplicons', help='provide output from primersearch and summarize MIRU profile directly', action='store_true') | |
67 optional_group.add_argument('--details', help='for inspection', action='store_true') | |
68 optional_group.add_argument('--nofasta', help='delete the fasta reads file generated if your reads are in fastq format', action='store_true') | |
69 args = parser.parse_args() | |
70 | |
71 if not os.path.exists(args.reads): | |
72 sys.exit('Error: ' + args.reads + ' is not found!') | |
73 | |
74 sample_prefix = args.prefix | |
75 sample_dir = os.path.dirname(os.path.abspath(args.reads)) | |
76 mismatch_allowed = 18 | |
77 psearchOut = sample_dir + '/' + sample_prefix + '.' + str(mismatch_allowed) + '.primersearch.out' | |
78 | |
79 df = pd.read_csv(MIRU_table, sep='\t') | |
80 df_0580 = pd.read_csv(MIRU_table_0580, sep='\t') | |
81 miru = [] | |
82 with open(args.primers) as primerFile: | |
83 for line in primerFile: | |
84 miru.append(line.split()[0]) | |
85 | |
86 #auto detect .fastq, .fastq.gz, .fasta, .fasta.gz | |
87 #convert fastq to fasta | |
88 | |
89 fastaReads = sample_dir + '/' + sample_prefix + '.fasta' | |
90 if not args.amplicons: | |
91 if '.fastq' in args.reads: | |
92 if '.gz' in args.reads: | |
93 tmpH = open(fastaReads, 'w') | |
94 p1 = subprocess.Popen(['zcat', args.reads], stdout=subprocess.PIPE) | |
95 subprocess_args1 = ['sed', '-n', '1~4s/^@/>/p;2~4p'] | |
96 subprocess.call(subprocess_args1, stdin=p1.stdout, stdout=tmpH) | |
97 tmpH.close() | |
98 else: | |
99 tmpH = open(fastaReads, 'w') | |
100 subprocess_args1 = ['sed', '-n', '1~4s/^@/>/p;2~4p', args.reads] | |
101 subprocess.call(subprocess_args1, stdout=tmpH) | |
102 tmpH.close() | |
103 elif '.fasta' in args.reads: | |
104 if '.gz' in args.reads: | |
105 with open(fastaReads, 'w') as f: | |
106 for line in gzip.open(args.reads, 'rb').readlines(): | |
107 f.write(line) | |
108 else: | |
109 fastaReads = args.reads | |
110 | |
111 if not args.amplicons: | |
112 try: | |
113 subprocess_args = ['primersearch', '-seqall', fastaReads, '-infile', args.primers, '-mismatchpercent', str(mismatch_allowed), '-outfile', psearchOut] | |
114 subprocess.call(subprocess_args) | |
115 except OSError: | |
116 print('OSError: primersearch command is not found.') | |
117 sys.exit() | |
118 | |
119 if not os.path.exists(psearchOut): | |
120 sys.exit('Error: ' + psearchOut + ' is not found!') | |
121 | |
122 lookup = {} | |
123 repeats = {} | |
124 with open(psearchOut, 'r') as infile: | |
125 for line in infile.read().splitlines(): | |
126 if line.startswith('Primer'): | |
127 col = line.split(' ') | |
128 loci = str(col[2]) | |
129 repeats.setdefault(loci, []) | |
130 elif (line.startswith('Amplimer') and len(line) < 12): | |
131 col = line.split(' ') | |
132 primerID = loci + '_' + str(col[1]) | |
133 lookup.setdefault(primerID, []) | |
134 mm = 0 | |
135 elif 'mismatches' in line: | |
136 mm += int(line.partition('with ')[2].rstrip(' mismatches')) | |
137 elif 'Amplimer length' in line: | |
138 field = line.split(':') | |
139 amplicon = int(field[1].strip(' ').rstrip(' bp')) | |
140 lookup.setdefault(primerID).append(mm) | |
141 if amplicon > 1828: | |
142 lookup.setdefault(primerID).append('NA') | |
143 elif loci == '0580': | |
144 if amplicon > df_0580[loci][25]: | |
145 lookup.setdefault(primerID).append('NA') | |
146 else: | |
147 for i in range(26): | |
148 if amplicon < df_0580[loci][i]: | |
149 if i != 0: | |
150 first = df_0580[loci][i-1] | |
151 second = df_0580[loci][i] | |
152 if abs(amplicon - first) > abs(amplicon - second): | |
153 repeats.setdefault(loci).append(df_0580['No.'][i]) | |
154 lookup.setdefault(primerID).append(df_0580['No.'][i]) | |
155 break | |
156 else: | |
157 repeats.setdefault(loci).append(df_0580['No.'][i-1]) | |
158 lookup.setdefault(primerID).append(df_0580['No.'][i-1]) | |
159 break | |
160 else: | |
161 repeats.setdefault(loci).append(0) | |
162 lookup.setdefault(primerID).append(0) | |
163 | |
164 else: | |
165 if amplicon > df[loci][15]: | |
166 lookup.setdefault(primerID).append('NA') | |
167 else: | |
168 for i in range(16): | |
169 if amplicon < df[loci][i]: | |
170 if i != 0: | |
171 first = df[loci][i-1] | |
172 second = df[loci][i] | |
173 if abs(amplicon - first) > abs(amplicon - second): | |
174 repeats.setdefault(loci).append(i) | |
175 lookup.setdefault(primerID).append(i) | |
176 break | |
177 else: | |
178 repeats.setdefault(loci).append(i-1) | |
179 lookup.setdefault(primerID).append(i-1) | |
180 break | |
181 else: | |
182 repeats.setdefault(loci).append(0) | |
183 lookup.setdefault(primerID).append(0) | |
184 | |
185 if args.details: | |
186 myLookUp = pd.DataFrame(columns=["loci", "hit_index", "repeat_no", "error_no"]) | |
187 for key, value in lookup.items(): | |
188 #example: lookup = {'0154_1':[2,4]} total no. of mismatches, repeat number | |
189 myLookUp = myLookUp.append({"loci":key.split("_")[0], "hit_index":int(key.split("_")[1]), "repeat_no":lookup[key][1], "error_no":lookup[key][0]}, ignore_index=True) | |
190 sortedLookUp = myLookUp.sort_values(by=["loci", "hit_index"]) | |
191 print(sortedLookUp.to_csv(sep='\t', index=False)) | |
192 for item in miru: | |
193 #array that used to determine repeat number | |
194 print(Counter(repeats[item])) | |
195 | |
196 miru_repeats = pd.DataFrame(columns = ['sample_prefix'] + miru, index = range(1)) | |
197 miru_repeats['sample_prefix'] = sample_prefix | |
198 for item in miru: | |
199 if repeats[item] != []: | |
200 try: | |
201 repeat = mode(repeats[item]) | |
202 miru_repeats[item][0] = repeat | |
203 except statistics.StatisticsError: | |
204 repeat = chooseMode(item, lookup, Counter(repeats[item])) | |
205 miru_repeats[item][0] = repeat | |
206 else: | |
207 miru_repeats[item][0] = "ND" | |
208 | |
209 if args.nofasta: | |
210 if ('.fastq' in args.reads) or ('.gz' in args.reads): | |
211 os.remove(fastaReads) | |
212 | |
213 print(miru_repeats.to_csv(sep='\t', index=False, header=True)) |