0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import csv
|
|
4 import argparse
|
|
5
|
|
6 # import sys
|
|
7
|
|
8 # This function reads through the tagseq file and outputs a list of sequence names and the lengths of each sequence.
|
|
def garnier_sequences(tagseq_file=None):
    """Collect the name and length of every sequence listed in a tagseq report.

    Only lines that begin with ``# Sequence:`` are inspected. The word that
    follows ``Sequence:`` is recorded as the sequence name, and the integer
    that follows ``to:`` on the same line is recorded as its length.

    Args:
        tagseq_file: An open, seekable text file object. It is rewound to the
            beginning before reading, so it may already have been consumed
            by an earlier pass.

    Returns:
        A ``(names, lengths)`` tuple of two lists in file order. A header
        line that has no ``to:`` field contributes a name but no length.

    Raises:
        ValueError: If the word following ``to:`` is not an integer.
    """
    tagseq_file.seek(0)
    names = []
    lengths = []

    for line in tagseq_file:
        if not line.startswith("# Sequence:"):
            continue
        words = line.split()
        names.append(words[words.index("Sequence:") + 1])
        # list.index raises ValueError when the item is absent, so it cannot
        # double as a presence check; test membership explicitly instead.
        if "to:" in words:
            lengths.append(int(words[words.index("to:") + 1]))

    return names, lengths
|
|
30
|
|
31
|
|
32 # This function extracts the helix, sheet, turn, and coil predictions from the file. The predictions for each type of
|
|
33 # secondary structure are joined together in one string.
|
|
def garnier_secondary_struct(tagseq_file=None):
    """Concatenate the helix, sheet, turns and coil tracks of a tagseq report.

    A line belongs to a track when its first whitespace-separated word is
    exactly ``helix``, ``sheet``, ``turns`` or ``coil``. For such a line the
    first six characters (the label column) are dropped, the trailing newline
    is removed, and the remainder is appended to that track. All other lines
    are ignored.

    Args:
        tagseq_file: An iterable of text lines, typically an open file. It is
            read from its current position and is not closed.

    Returns:
        A tuple of four strings in the order ``(helix, turns, coil, sheet)``.
    """
    tracks = {"helix": [], "sheet": [], "turns": [], "coil": []}

    for line in tagseq_file:
        words = line.split()
        if not words:
            continue
        # Exact label match: a substring test such as `words[0] in "helix"`
        # would also accept stray first words like "he", "e" or "l".
        pieces = tracks.get(words[0])
        if pieces is not None:
            pieces.append(line[6:].rstrip("\n"))

    return (
        "".join(tracks["helix"]),
        "".join(tracks["turns"]),
        "".join(tracks["coil"]),
        "".join(tracks["sheet"]),
    )
|
|
58
|
|
59
|
|
60 # This function cuts the strings based on the lengths of the original sequences. Lengths are given in a list.
|
|
def vector_cutter(vector, lengths_to_cut):
    """Split ``vector`` into consecutive pieces of the given lengths.

    The first piece starts at offset 0 and each following piece starts where
    the previous one ended.

    Args:
        vector: The sliceable value (normally a string) to cut up.
        lengths_to_cut: The length of each piece, in order. May be empty.

    Returns:
        A list with one entry per length, each the ``str()`` of the matching
        slice of ``vector``. An empty ``lengths_to_cut`` yields ``[]``.
    """
    output = []
    start = 0
    # Walk a running offset rather than pre-reading lengths_to_cut[0], so an
    # empty list of lengths produces an empty result instead of IndexError.
    for length in lengths_to_cut:
        end = start + length
        output.append(str(vector[start:end]))
        start = end
    return output
|
|
77
|
|
78
|
|
79 # this function takes the helix, turn, sheet, and coil predictions for each sequence and creates a single structural
|
|
80 # prediction string.
|
|
def single_prediction(helix, sheet, turns, coil):
    """Merge four per-position prediction tracks into one list of symbols.

    At each position the first non-blank character is taken, checking helix,
    then sheet, then coil. If all three are a single space, the turns
    character is used whatever it is.

    Args:
        helix: Helix track for one sequence.
        sheet: Sheet track for one sequence.
        turns: Turns track for one sequence.
        coil: Coil track for one sequence.

    Returns:
        A list with one string per position, or an empty list when the four
        tracks do not all have the same length.
    """
    same_length = len(helix) == len(sheet) == len(coil) == len(turns)
    if not same_length:
        return []

    merged = []
    for h_char, s_char, c_char, t_char in zip(helix, sheet, coil, turns):
        # Priority order is helix > sheet > coil, with turns as the fallback.
        chosen = next(
            (char for char in (h_char, s_char, c_char) if char != " "),
            t_char,
        )
        merged.append(str(chosen))
    return merged
|
|
99
|
|
100
|
|
if __name__ == "__main__":
    # Command-line interface: one positional argument, the tagseq file to read.
    cli = argparse.ArgumentParser(
        description="Read Garnier Secondary Structure Prediction"
    )
    cli.add_argument(
        "tagseq_file", type=argparse.FileType("r"), help="Tagseq file input"
    )
    options = vars(cli.parse_args())

    # First pass gathers the four concatenated prediction tracks; the second
    # pass gathers the name and length of each sequence in the report.
    all_helix, all_turns, all_coil, all_sheet = garnier_secondary_struct(**options)
    names, seq_lengths = garnier_sequences(**options)

    # Cut every concatenated track back into one piece per sequence.
    helix_parts = vector_cutter(all_helix, seq_lengths)
    sheet_parts = vector_cutter(all_sheet, seq_lengths)
    turn_parts = vector_cutter(all_turns, seq_lengths)
    coil_parts = vector_cutter(all_coil, seq_lengths)

    # For each sequence, merge its four tracks into a single prediction and
    # print the sequence name followed by the tab-separated prediction.
    for index, helix_piece in enumerate(helix_parts):
        merged = single_prediction(
            helix_piece, sheet_parts[index], turn_parts[index], coil_parts[index]
        )
        print("Sequence Name: " + "\t" + names[index])
        print("\t".join(merged))
|