Mercurial > repos > dcouvin > mirureader
diff MIRUReader/MIRUReader.py @ 0:f0e3646a4e45 draft
Uploaded
author | dcouvin |
---|---|
date | Tue, 17 Aug 2021 19:15:15 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MIRUReader/MIRUReader.py Tue Aug 17 19:15:15 2021 +0000 @@ -0,0 +1,213 @@ +#!/usr/bin/python3 + +#Copyright 2019 NUS pathogen genomics +#Written by Cheng Yee Tang (chengyee.tang@nus.edu.sg) +#former python tag !/usr/bin/env python + +import os +import sys +import gzip +import argparse +import pandas as pd +import statistics +import subprocess +from statistics import mode +from collections import Counter + + +#function to determine repeat number based on total number of mismatches in primer sequence +def chooseMode(name, table, CounterList): + maxcount = max(CounterList.values()) + repeatToCheck = [] + for k, v in CounterList.items(): + if v == maxcount: + repeatToCheck.append(k) + x = 0 + for i, j in table.items(): + if name in i: + x += 1 + mismatchDict = {} + for rp in repeatToCheck: + mismatchDict[rp] = 0 + for i in range(x): + string = name + '_' + str(i+1) + if table[string][1] in repeatToCheck: + mismatchDict[table[string][1]] += table[string][0] + checklist2 = [] + for m, n in mismatchDict.items(): + checklist2.append(n) + duplicates = '' + for item in checklist2: + if checklist2.count(item) > 1: + duplicates = 'yes' + finalMode = '' + if duplicates == 'yes': + finalMode = '/'.join(str(r) for min_value in (min(mismatchDict.values()),) for r in mismatchDict if mismatchDict[r]==min_value) + else: + finalMode = min(mismatchDict.keys(), key=(lambda k: mismatchDict[k])) + return finalMode + +''' +Main function +''' + +script_dir = os.path.dirname(os.path.abspath(sys.argv[0])) +MIRU_table = script_dir + "/MIRU_table" +MIRU_table_0580 = script_dir + "/MIRU_table_0580" +MIRU_primers = script_dir + "/MIRU_primers" + +parser = argparse.ArgumentParser() +main_group = parser.add_argument_group('Main options') +main_group.add_argument('-r', '--reads', required=True, help='input reads file in fastq/fasta format (required)') +main_group.add_argument('-p', '--prefix', required=True, help='sample ID (required)') +main_group.add_argument('--table', type=str, default=MIRU_table, help='allele calling table') +main_group.add_argument('--primers', type=str, default=MIRU_primers, help='primers sequences') +optional_group = parser.add_argument_group('Optional options') +optional_group.add_argument('--amplicons', help='provide output from primersearch and summarize MIRU profile directly', action='store_true') +optional_group.add_argument('--details', help='for inspection', action='store_true') +optional_group.add_argument('--nofasta', help='delete the fasta reads file generated if your reads are in fastq format', action='store_true') +args = parser.parse_args() + +if not os.path.exists(args.reads): + sys.exit('Error: ' + args.reads + ' is not found!') + +sample_prefix = args.prefix +sample_dir = os.path.dirname(os.path.abspath(args.reads)) +mismatch_allowed = 18 +psearchOut = sample_dir + '/' + sample_prefix + '.' + str(mismatch_allowed) + '.primersearch.out' + +df = pd.read_csv(MIRU_table, sep='\t') +df_0580 = pd.read_csv(MIRU_table_0580, sep='\t') +miru = [] +with open(args.primers) as primerFile: + for line in primerFile: + miru.append(line.split()[0]) + +#auto detect .fastq, .fastq.gz, .fasta, .fasta.gz +#convert fastq to fasta + +fastaReads = sample_dir + '/' + sample_prefix + '.fasta' +if not args.amplicons: + if '.fastq' in args.reads: + if '.gz' in args.reads: + tmpH = open(fastaReads, 'w') + p1 = subprocess.Popen(['zcat', args.reads], stdout=subprocess.PIPE) + subprocess_args1 = ['sed', '-n', '1~4s/^@/>/p;2~4p'] + subprocess.call(subprocess_args1, stdin=p1.stdout, stdout=tmpH) + tmpH.close() + else: + tmpH = open(fastaReads, 'w') + subprocess_args1 = ['sed', '-n', '1~4s/^@/>/p;2~4p', args.reads] + subprocess.call(subprocess_args1, stdout=tmpH) + tmpH.close() + elif '.fasta' in args.reads: + if '.gz' in args.reads: + with open(fastaReads, 'w') as f: + for line in gzip.open(args.reads, 'rb').readlines(): + f.write(line) + else: + fastaReads = args.reads + +if not args.amplicons: + try: + subprocess_args = ['primersearch', '-seqall', fastaReads, '-infile', args.primers, '-mismatchpercent', str(mismatch_allowed), '-outfile', psearchOut] + subprocess.call(subprocess_args) + except OSError: + print('OSError: primersearch command is not found.') + sys.exit() + +if not os.path.exists(psearchOut): + sys.exit('Error: ' + psearchOut + ' is not found!') + +lookup = {} +repeats = {} +with open(psearchOut, 'r') as infile: + for line in infile.read().splitlines(): + if line.startswith('Primer'): + col = line.split(' ') + loci = str(col[2]) + repeats.setdefault(loci, []) + elif (line.startswith('Amplimer') and len(line) < 12): + col = line.split(' ') + primerID = loci + '_' + str(col[1]) + lookup.setdefault(primerID, []) + mm = 0 + elif 'mismatches' in line: + mm += int(line.partition('with ')[2].rstrip(' mismatches')) + elif 'Amplimer length' in line: + field = line.split(':') + amplicon = int(field[1].strip(' ').rstrip(' bp')) + lookup.setdefault(primerID).append(mm) + if amplicon > 1828: + lookup.setdefault(primerID).append('NA') + elif loci == '0580': + if amplicon > df_0580[loci][25]: + lookup.setdefault(primerID).append('NA') + else: + for i in range(26): + if amplicon < df_0580[loci][i]: + if i != 0: + first = df_0580[loci][i-1] + second = df_0580[loci][i] + if abs(amplicon - first) > abs(amplicon - second): + repeats.setdefault(loci).append(df_0580['No.'][i]) + lookup.setdefault(primerID).append(df_0580['No.'][i]) + break + else: + repeats.setdefault(loci).append(df_0580['No.'][i-1]) + lookup.setdefault(primerID).append(df_0580['No.'][i-1]) + break + else: + repeats.setdefault(loci).append(0) + lookup.setdefault(primerID).append(0) + + else: + if amplicon > df[loci][15]: + lookup.setdefault(primerID).append('NA') + else: + for i in range(16): + if amplicon < df[loci][i]: + if i != 0: + first = df[loci][i-1] + second = df[loci][i] + if abs(amplicon - first) > abs(amplicon - second): + repeats.setdefault(loci).append(i) + lookup.setdefault(primerID).append(i) + break + else: + repeats.setdefault(loci).append(i-1) + lookup.setdefault(primerID).append(i-1) + break + else: + repeats.setdefault(loci).append(0) + lookup.setdefault(primerID).append(0) + +if args.details: + myLookUp = pd.DataFrame(columns=["loci", "hit_index", "repeat_no", "error_no"]) + for key, value in lookup.items(): + #example: lookup = {'0154_1':[2,4]} total no. of mismatches, repeat number + myLookUp = myLookUp.append({"loci":key.split("_")[0], "hit_index":int(key.split("_")[1]), "repeat_no":lookup[key][1], "error_no":lookup[key][0]}, ignore_index=True) + sortedLookUp = myLookUp.sort_values(by=["loci", "hit_index"]) + print(sortedLookUp.to_csv(sep='\t', index=False)) + for item in miru: + #array that used to determine repeat number + print(Counter(repeats[item])) + +miru_repeats = pd.DataFrame(columns = ['sample_prefix'] + miru, index = range(1)) +miru_repeats['sample_prefix'] = sample_prefix +for item in miru: + if repeats[item] != []: + try: + repeat = mode(repeats[item]) + miru_repeats[item][0] = repeat + except statistics.StatisticsError: + repeat = chooseMode(item, lookup, Counter(repeats[item])) + miru_repeats[item][0] = repeat + else: + miru_repeats[item][0] = "ND" + +if args.nofasta: + if ('.fastq' in args.reads) or ('.gz' in args.reads): + os.remove(fastaReads) + +print(miru_repeats.to_csv(sep='\t', index=False, header=True))