0
|
1 #!/usr/bin/python3
|
|
2
|
|
3 #Copyright 2019 NUS pathogen genomics
|
|
4 #Written by Cheng Yee Tang (chengyee.tang@nus.edu.sg)
|
|
5 #former python tag !/usr/bin/env python
|
|
6
|
|
7 import os
|
|
8 import sys
|
|
9 import gzip
|
|
10 import argparse
|
|
11 import pandas as pd
|
|
12 import statistics
|
|
13 import subprocess
|
|
14 from statistics import mode
|
|
15 from collections import Counter
|
|
16
|
|
17
|
|
18 #function to determine repeat number based on total number of mismatches in primer sequence
|
|
19 def chooseMode(name, table, CounterList):
|
|
20 maxcount = max(CounterList.values())
|
|
21 repeatToCheck = []
|
|
22 for k, v in CounterList.items():
|
|
23 if v == maxcount:
|
|
24 repeatToCheck.append(k)
|
|
25 x = 0
|
|
26 for i, j in table.items():
|
|
27 if name in i:
|
|
28 x += 1
|
|
29 mismatchDict = {}
|
|
30 for rp in repeatToCheck:
|
|
31 mismatchDict[rp] = 0
|
|
32 for i in range(x):
|
|
33 string = name + '_' + str(i+1)
|
|
34 if table[string][1] in repeatToCheck:
|
|
35 mismatchDict[table[string][1]] += table[string][0]
|
|
36 checklist2 = []
|
|
37 for m, n in mismatchDict.items():
|
|
38 checklist2.append(n)
|
|
39 duplicates = ''
|
|
40 for item in checklist2:
|
|
41 if checklist2.count(item) > 1:
|
|
42 duplicates = 'yes'
|
|
43 finalMode = ''
|
|
44 if duplicates == 'yes':
|
|
45 finalMode = '/'.join(str(r) for min_value in (min(mismatchDict.values()),) for r in mismatchDict if mismatchDict[r]==min_value)
|
|
46 else:
|
|
47 finalMode = min(mismatchDict.keys(), key=(lambda k: mismatchDict[k]))
|
|
48 return finalMode
|
|
49
|
|
50 '''
|
|
51 Main function
|
|
52 '''
|
|
53
|
|
54 script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
|
|
55 MIRU_table = script_dir + "/MIRU_table"
|
|
56 MIRU_table_0580 = script_dir + "/MIRU_table_0580"
|
|
57 MIRU_primers = script_dir + "/MIRU_primers"
|
|
58
|
|
59 parser = argparse.ArgumentParser()
|
|
60 main_group = parser.add_argument_group('Main options')
|
|
61 main_group.add_argument('-r', '--reads', required=True, help='input reads file in fastq/fasta format (required)')
|
|
62 main_group.add_argument('-p', '--prefix', required=True, help='sample ID (required)')
|
|
63 main_group.add_argument('--table', type=str, default=MIRU_table, help='allele calling table')
|
|
64 main_group.add_argument('--primers', type=str, default=MIRU_primers, help='primers sequences')
|
|
65 optional_group = parser.add_argument_group('Optional options')
|
|
66 optional_group.add_argument('--amplicons', help='provide output from primersearch and summarize MIRU profile directly', action='store_true')
|
|
67 optional_group.add_argument('--details', help='for inspection', action='store_true')
|
|
68 optional_group.add_argument('--nofasta', help='delete the fasta reads file generated if your reads are in fastq format', action='store_true')
|
|
69 args = parser.parse_args()
|
|
70
|
|
71 if not os.path.exists(args.reads):
|
|
72 sys.exit('Error: ' + args.reads + ' is not found!')
|
|
73
|
|
74 sample_prefix = args.prefix
|
|
75 sample_dir = os.path.dirname(os.path.abspath(args.reads))
|
|
76 mismatch_allowed = 18
|
|
77 psearchOut = sample_dir + '/' + sample_prefix + '.' + str(mismatch_allowed) + '.primersearch.out'
|
|
78
|
|
79 df = pd.read_csv(MIRU_table, sep='\t')
|
|
80 df_0580 = pd.read_csv(MIRU_table_0580, sep='\t')
|
|
81 miru = []
|
|
82 with open(args.primers) as primerFile:
|
|
83 for line in primerFile:
|
|
84 miru.append(line.split()[0])
|
|
85
|
|
86 #auto detect .fastq, .fastq.gz, .fasta, .fasta.gz
|
|
87 #convert fastq to fasta
|
|
88
|
|
89 fastaReads = sample_dir + '/' + sample_prefix + '.fasta'
|
|
90 if not args.amplicons:
|
|
91 if '.fastq' in args.reads:
|
|
92 if '.gz' in args.reads:
|
|
93 tmpH = open(fastaReads, 'w')
|
|
94 p1 = subprocess.Popen(['zcat', args.reads], stdout=subprocess.PIPE)
|
|
95 subprocess_args1 = ['sed', '-n', '1~4s/^@/>/p;2~4p']
|
|
96 subprocess.call(subprocess_args1, stdin=p1.stdout, stdout=tmpH)
|
|
97 tmpH.close()
|
|
98 else:
|
|
99 tmpH = open(fastaReads, 'w')
|
|
100 subprocess_args1 = ['sed', '-n', '1~4s/^@/>/p;2~4p', args.reads]
|
|
101 subprocess.call(subprocess_args1, stdout=tmpH)
|
|
102 tmpH.close()
|
|
103 elif '.fasta' in args.reads:
|
|
104 if '.gz' in args.reads:
|
|
105 with open(fastaReads, 'w') as f:
|
|
106 for line in gzip.open(args.reads, 'rb').readlines():
|
|
107 f.write(line)
|
|
108 else:
|
|
109 fastaReads = args.reads
|
|
110
|
|
111 if not args.amplicons:
|
|
112 try:
|
|
113 subprocess_args = ['primersearch', '-seqall', fastaReads, '-infile', args.primers, '-mismatchpercent', str(mismatch_allowed), '-outfile', psearchOut]
|
|
114 subprocess.call(subprocess_args)
|
|
115 except OSError:
|
|
116 print('OSError: primersearch command is not found.')
|
|
117 sys.exit()
|
|
118
|
|
119 if not os.path.exists(psearchOut):
|
|
120 sys.exit('Error: ' + psearchOut + ' is not found!')
|
|
121
|
|
122 lookup = {}
|
|
123 repeats = {}
|
|
124 with open(psearchOut, 'r') as infile:
|
|
125 for line in infile.read().splitlines():
|
|
126 if line.startswith('Primer'):
|
|
127 col = line.split(' ')
|
|
128 loci = str(col[2])
|
|
129 repeats.setdefault(loci, [])
|
|
130 elif (line.startswith('Amplimer') and len(line) < 12):
|
|
131 col = line.split(' ')
|
|
132 primerID = loci + '_' + str(col[1])
|
|
133 lookup.setdefault(primerID, [])
|
|
134 mm = 0
|
|
135 elif 'mismatches' in line:
|
|
136 mm += int(line.partition('with ')[2].rstrip(' mismatches'))
|
|
137 elif 'Amplimer length' in line:
|
|
138 field = line.split(':')
|
|
139 amplicon = int(field[1].strip(' ').rstrip(' bp'))
|
|
140 lookup.setdefault(primerID).append(mm)
|
|
141 if amplicon > 1828:
|
|
142 lookup.setdefault(primerID).append('NA')
|
|
143 elif loci == '0580':
|
|
144 if amplicon > df_0580[loci][25]:
|
|
145 lookup.setdefault(primerID).append('NA')
|
|
146 else:
|
|
147 for i in range(26):
|
|
148 if amplicon < df_0580[loci][i]:
|
|
149 if i != 0:
|
|
150 first = df_0580[loci][i-1]
|
|
151 second = df_0580[loci][i]
|
|
152 if abs(amplicon - first) > abs(amplicon - second):
|
|
153 repeats.setdefault(loci).append(df_0580['No.'][i])
|
|
154 lookup.setdefault(primerID).append(df_0580['No.'][i])
|
|
155 break
|
|
156 else:
|
|
157 repeats.setdefault(loci).append(df_0580['No.'][i-1])
|
|
158 lookup.setdefault(primerID).append(df_0580['No.'][i-1])
|
|
159 break
|
|
160 else:
|
|
161 repeats.setdefault(loci).append(0)
|
|
162 lookup.setdefault(primerID).append(0)
|
|
163
|
|
164 else:
|
|
165 if amplicon > df[loci][15]:
|
|
166 lookup.setdefault(primerID).append('NA')
|
|
167 else:
|
|
168 for i in range(16):
|
|
169 if amplicon < df[loci][i]:
|
|
170 if i != 0:
|
|
171 first = df[loci][i-1]
|
|
172 second = df[loci][i]
|
|
173 if abs(amplicon - first) > abs(amplicon - second):
|
|
174 repeats.setdefault(loci).append(i)
|
|
175 lookup.setdefault(primerID).append(i)
|
|
176 break
|
|
177 else:
|
|
178 repeats.setdefault(loci).append(i-1)
|
|
179 lookup.setdefault(primerID).append(i-1)
|
|
180 break
|
|
181 else:
|
|
182 repeats.setdefault(loci).append(0)
|
|
183 lookup.setdefault(primerID).append(0)
|
|
184
|
|
185 if args.details:
|
|
186 myLookUp = pd.DataFrame(columns=["loci", "hit_index", "repeat_no", "error_no"])
|
|
187 for key, value in lookup.items():
|
|
188 #example: lookup = {'0154_1':[2,4]} total no. of mismatches, repeat number
|
|
189 myLookUp = myLookUp.append({"loci":key.split("_")[0], "hit_index":int(key.split("_")[1]), "repeat_no":lookup[key][1], "error_no":lookup[key][0]}, ignore_index=True)
|
|
190 sortedLookUp = myLookUp.sort_values(by=["loci", "hit_index"])
|
|
191 print(sortedLookUp.to_csv(sep='\t', index=False))
|
|
192 for item in miru:
|
|
193 #array that used to determine repeat number
|
|
194 print(Counter(repeats[item]))
|
|
195
|
|
196 miru_repeats = pd.DataFrame(columns = ['sample_prefix'] + miru, index = range(1))
|
|
197 miru_repeats['sample_prefix'] = sample_prefix
|
|
198 for item in miru:
|
|
199 if repeats[item] != []:
|
|
200 try:
|
|
201 repeat = mode(repeats[item])
|
|
202 miru_repeats[item][0] = repeat
|
|
203 except statistics.StatisticsError:
|
|
204 repeat = chooseMode(item, lookup, Counter(repeats[item]))
|
|
205 miru_repeats[item][0] = repeat
|
|
206 else:
|
|
207 miru_repeats[item][0] = "ND"
|
|
208
|
|
209 if args.nofasta:
|
|
210 if ('.fastq' in args.reads) or ('.gz' in args.reads):
|
|
211 os.remove(fastaReads)
|
|
212
|
|
213 print(miru_repeats.to_csv(sep='\t', index=False, header=True))
|