1
|
1 #1: Modules
|
|
2 import re
|
|
3 import argparse
|
|
4 import numpy as np
|
|
5 np.set_printoptions(threshold=np.inf)
|
|
6
|
|
7 parser = argparse.ArgumentParser()
|
|
8 parser.add_argument("-m", "--minimum",type=float, help="cutoff value")
|
|
9 parser.add_argument("-c", "--codon",type=str, help="codons of interest")
|
|
10 parser.add_argument("-i", "--input_sequences", type=str, help="")
|
|
11 parser.add_argument("-r", "--result", type=str)
|
|
12
|
|
13 args = parser.parse_args()
|
|
14
|
|
15 sequences = args.input_sequences
|
|
16 cutoff = args.minimum
|
|
17 codonlist=args.codon
|
|
18 resultfile=args.result
|
|
19
|
|
20 codonlist = codonlist.split(",")
|
|
21
|
|
22 i=0
|
|
23 codon_dictionary = {}
|
|
24 while i < len(codonlist):
|
|
25 codon_dictionary[codonlist[i]] = 0
|
|
26 i+=1
|
|
27 #2:input
|
|
28 dict1 = {}
|
|
29 dict2 = {}
|
|
30 seq_id =""
|
|
31
|
|
32 i=0
|
|
33 input_sequences = open(sequences,mode = 'r')
|
|
34 for line in input_sequences:
|
|
35 if line [0] == ">":
|
|
36 match = re.search(">(\S+)\s(.*)$",line)
|
|
37 seq_id = match.group(1)
|
|
38 dict1[seq_id] = ""
|
|
39 elif seq_id != "" and line.strip()!= "":
|
|
40 dict1[seq_id] += line.strip()
|
|
41
|
|
42 i=0
|
|
43 for name in dict1.keys():
|
|
44 dict2[i] = name
|
|
45 i+=1
|
|
46
|
|
47 testsequences=int(len(dict2))
|
|
48
|
|
49 #3: Codon-Scoring of the sequences
|
|
50 i=0
|
|
51 m=1
|
|
52 testseq=""
|
|
53
|
|
54 scoremat=np.zeros((len(dict2)+1,len(codon_dictionary)+2)).astype(object)
|
|
55
|
|
56 for i in range(len(dict2)):
|
|
57
|
|
58 l=0
|
|
59 scorecomp=0
|
|
60 codonscores=[]
|
|
61 testseq=dict1[dict2[i]]
|
|
62 codon_dictionary=dict.fromkeys(codon_dictionary,0)
|
|
63
|
|
64 for l in range(len(testseq)):
|
|
65 try:
|
|
66 codon_dictionary.get(testseq[l*3:l*3+3])
|
|
67 codon_dictionary[testseq[l*3:l*3+3]]+=1
|
|
68 except KeyError:
|
|
69 continue
|
|
70
|
|
71 scorecomp=sum(codon_dictionary.values())/(float(len(testseq)/3)*len(codon_dictionary))
|
|
72 for x in codon_dictionary.values():
|
|
73 codonscores.append(x)
|
|
74
|
|
75 if scorecomp > cutoff :
|
|
76 scoremat[m,0]=dict2[i]
|
|
77 scoremat[m,1]=round(scorecomp,7)
|
|
78 scoremat[m,2:]=codonscores
|
|
79 m+=1
|
|
80
|
|
81 scoremat_sort=scoremat[np.lexsort((scoremat[:, 1], ))]
|
|
82 scoremat_sort[0,0]="Gene_ID"
|
|
83 scoremat_sort[0,1]="Total Score"
|
|
84 scoremat_sort[0,2:]=codonlist
|
|
85 scoremat_cleaned = scoremat_sort[[i for i, x in enumerate(scoremat_sort) if x.any()]]
|
|
86
|
|
87 #4: output
|
|
88 with open(resultfile, 'w') as file_object:
|
|
89 file_object.write("DNA sequences with a higher total score than ")
|
|
90 file_object.write(str(cutoff))
|
|
91 file_object.write("\n")
|
|
92 file_object.write("\n")
|
|
93 file_object.write(str(scoremat_cleaned))
|
|
94 file_object.close() |