annotate cpt_read_garnier/reading_garnier_output.py @ 0:0d2226e1c5f6 draft

Uploaded
author cpt
date Fri, 17 Jun 2022 13:12:20 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env python
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
2
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
3 import csv
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
4 import argparse
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
5
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
6 # import sys
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
7
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
8 # This function reads through the tagseq file and outputs a list of sequence names and the lengths of each sequence.
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
9 def garnier_sequences(tagseq_file=None):
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
10 # open the file and create blank lists
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
11 f = tagseq_file # open(tagseq_file, 'r')
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
12 f.seek(0)
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
13 sequence = []
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
14 lengths = []
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
15
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
16 # for each line the in file, search for the words 'Sequence' and 'to' to find the sequence name and length,
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
17 # respectively. Then add sequence names and lengths to the proper lists
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
18 for line in f:
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
19 words = line.split()
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
20 if line.startswith("# Sequence:"):
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
21 # if 'Sequence:' in line:
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
22 # if words[1] == 'Sequence:':
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
23 sequence += [words[words.index("Sequence:") + 1]]
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
24 # if words[5] == 'to:':
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
25 # lengths += [int(words[6])]
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
26 if words.index("to:"):
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
27 lengths += [int(words[words.index("to:") + 1])]
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
28 # return the sequence names and lengths
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
29 return sequence, lengths
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
30
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
31
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
32 # This function extracts the helix, sheet, turn, and coil predictions from the file. The predictions for each type of
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
33 # secondary structure are joined together in one string.
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
34 def garnier_secondary_struct(tagseq_file=None):
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
35 # opens the file and sets variables for the structural predictions
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
36 f = tagseq_file # open(tagseq_file, 'r')
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
37 helix = ""
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
38 turns = ""
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
39 coil = ""
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
40 sheet = ""
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
41
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
42 # if the first work in the line indicates a structural prediction, it adds the rest of the line to the right
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
43 # prediction string.
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
44 for line in f:
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
45 words = line.split()
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
46 if len(words) > 0:
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
47 if words[0] in "helix":
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
48 helix += str(line[6:]).rstrip("\n")
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
49 elif words[0] in "sheet":
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
50 sheet += str(line[6:]).rstrip("\n")
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
51 elif words[0] in "turns":
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
52 turns += str(line[6:]).rstrip("\n")
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
53 elif words[0] in "coil":
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
54 coil += str(line[6:]).rstrip("\n")
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
55 # f.close()
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
56 # returns the four structural prediction strings
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
57 return helix, turns, coil, sheet
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
58
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
59
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
60 # This functions cuts the strings based on the lengths of the original sequences. Lengths are given in a list.
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
61 def vector_cutter(vector, lengths_to_cut):
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
62 # sets up iteration variables
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
63 start = 0
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
64 end = lengths_to_cut[0]
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
65 maximum = len(lengths_to_cut)
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
66 # creates output list
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
67 output = []
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
68 # loops through the number of sequences based on the number of lengths
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
69 for i in range(maximum):
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
70 # outputs list of sequence strings
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
71 output += [str(vector[start:end])]
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
72 start = end
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
73 if i + 1 != maximum:
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
74 end += lengths_to_cut[i + 1]
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
75 # returns list of strings. Each sequence has a string included in the list.
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
76 return output
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
77
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
78
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
79 # this function takes the helix, turn, sheet, and coil predictions for each sequence and creates a single structural
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
80 # prediction string.
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
81 def single_prediction(helix, sheet, turns, coil):
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
82 # sets output list
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
83 secondary_structure = []
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
84 # checks to make sure each of the strings is the same length
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
85 if len(helix) == len(sheet) == len(coil) == len(turns):
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
86 # loops through the length of each sequence, and when the value is not a blank it is added to the output
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
87 # prediction list.
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
88 for j in range(len(helix)):
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
89 if helix[j] != " ":
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
90 secondary_structure += [str(helix[j])]
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
91 elif sheet[j] != " ":
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
92 secondary_structure += [str(sheet[j])]
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
93 elif coil[j] != " ":
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
94 secondary_structure += [str(coil[j])]
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
95 else:
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
96 secondary_structure += [str(turns[j])]
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
97 # returns the output prediction list for the sequence
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
98 return secondary_structure
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
99
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
100
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
101 if __name__ == "__main__":
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
102 # Grab all of the filters from our plugin loader
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
103 parser = argparse.ArgumentParser(
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
104 description="Read Garnier Secondary Structure Prediction"
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
105 )
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
106 parser.add_argument(
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
107 "tagseq_file", type=argparse.FileType("r"), help="Tagseq file input"
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
108 )
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
109 args = parser.parse_args()
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
110
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
111 # opens the tagseq file and prepares for writing csv
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
112 # f = open(sys.stdout, 'w', newline='')
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
113 # writer = csv.writer(f)
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
114
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
115 # reads tagseq file for helix, turn, coil, and sheet sequences as well as for names and lengths of the sequences
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
116 # summarized in the tagseq file#!/usr/bin/env python\r
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
117 Hel, Tur, Coi, She = garnier_secondary_struct(**vars(args))
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
118 names, gives = garnier_sequences(**vars(args))
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
119
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
120 # cut each of the structural prediction strings so that they are individual sequences
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
121 Helix = vector_cutter(Hel, gives)
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
122 Sheet = vector_cutter(She, gives)
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
123 Turns = vector_cutter(Tur, gives)
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
124 Coil = vector_cutter(Coi, gives)
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
125
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
126 # for each sequence compile the four types of structural predictions into a single prediction, and output the final
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
127 # prediction in csv format and to the screen
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
128 for i in range(len(Helix)):
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
129 Final = single_prediction(Helix[i], Sheet[i], Turns[i], Coil[i])
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
130 # csv.writerow(['Sequence: '] + [names[i]])
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
131 # csv.writerow(Final)
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
132 print("Sequence Name: " + "\t" + names[i])
0d2226e1c5f6 Uploaded
cpt
parents:
diff changeset
133 print("\t".join(Final))