comparison cpt_read_garnier/reading_garnier_output.py @ 0:0d2226e1c5f6 draft

Uploaded
author cpt
date Fri, 17 Jun 2022 13:12:20 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0d2226e1c5f6
1 #!/usr/bin/env python
2
3 import csv
4 import argparse
5
6 # import sys
7
# This function reads through the tagseq file and outputs a list of sequence names and the lengths of each sequence.
def garnier_sequences(tagseq_file=None):
    """Return the sequence names and lengths listed in a tagseq report.

    The file object is rewound to its start before reading, so it may already
    have been consumed by another reader. Only lines beginning with
    ``# Sequence:`` are inspected: the word following ``Sequence:`` is taken
    as the name and the integer following ``to:`` as the length.
    NOTE(review): using the ``to:`` value as the length presumes the range
    starts at 1 -- confirm against the report format.

    :param tagseq_file: open, seekable text file object holding the report.
    :returns: ``(names, lengths)`` -- two parallel lists.
    :raises ValueError: if a ``# Sequence:`` line lacks a name, a ``to:``
        field, or an integer after ``to:``.
    """
    tagseq_file.seek(0)
    sequence = []
    lengths = []

    for line in tagseq_file:
        if not line.startswith("# Sequence:"):
            continue
        words = line.split()
        # Parse both fields before touching the output lists so that a bad header can never leave the
        # two lists with different lengths.
        try:
            name = words[words.index("Sequence:") + 1]
            length = int(words[words.index("to:") + 1])
        except (ValueError, IndexError):
            raise ValueError(
                "Malformed '# Sequence:' header line: %r" % line.rstrip("\n")
            )
        sequence.append(name)
        lengths.append(length)

    return sequence, lengths
30
31
# This function extracts the helix, sheet, turn, and coil predictions from the file. The predictions for each type of
# secondary structure are joined together in one string.
def garnier_secondary_struct(tagseq_file=None):
    """Collect the four prediction tracks from a tagseq report.

    The file object is read from its current position (it is not rewound
    here). A line contributes to a track when its first whitespace-separated
    word is exactly ``helix``, ``sheet``, ``turns`` or ``coil``; the text from
    the seventh character onwards, minus the trailing newline, is appended to
    that track.

    :param tagseq_file: open text file object holding the report.
    :returns: tuple of four strings in the order
        ``(helix, turns, coil, sheet)``.
    """
    tracks = {"helix": [], "sheet": [], "turns": [], "coil": []}

    for line in tagseq_file:
        words = line.split()
        # The label must match exactly. A substring test such as `words[0] in "helix"` would also accept
        # fragments like "el" or "t" and pollute the tracks.
        if words and words[0] in tracks:
            # The label is assumed to occupy the first six characters of the line.
            tracks[words[0]].append(line[6:].rstrip("\n"))

    return tuple(
        "".join(tracks[label]) for label in ("helix", "turns", "coil", "sheet")
    )
58
59
# This function cuts the strings based on the lengths of the original sequences. Lengths are given in a list.
def vector_cutter(vector, lengths_to_cut):
    """Split ``vector`` into consecutive pieces of the given lengths.

    Piece ``i`` starts where piece ``i - 1`` ended and spans
    ``lengths_to_cut[i]`` items. Slicing never raises, so pieces that run past
    the end of ``vector`` are simply shorter (possibly empty).

    :param vector: string (or other sliceable sequence) to cut.
    :param lengths_to_cut: list of piece lengths, one per sequence.
    :returns: list with one ``str`` per entry of ``lengths_to_cut``; an empty
        list when no lengths are given.
    """
    output = []
    start = 0
    # A running offset avoids indexing lengths_to_cut[0] up front, which would fail when no lengths are given.
    for length in lengths_to_cut:
        end = start + length
        output.append(str(vector[start:end]))
        start = end
    return output
77
78
# This function merges the helix, sheet, turn, and coil tracks of one sequence into a single structural prediction.
def single_prediction(helix, sheet, turns, coil):
    """Merge four equally long prediction tracks into one list of characters.

    At each position the first non-blank character is taken, checking the
    tracks in the order helix, sheet, coil; when all three are blank the
    character from ``turns`` is used as-is (even if it is blank too).

    :returns: list of one-character strings, or an empty list when the four
        tracks do not all have the same length.
    """
    # Tracks of unequal length cannot be aligned position by position.
    if not (len(helix) == len(sheet) == len(coil) == len(turns)):
        return []

    merged = []
    for h_char, s_char, c_char, t_char in zip(helix, sheet, coil, turns):
        if h_char != " ":
            chosen = h_char
        elif s_char != " ":
            chosen = s_char
        elif c_char != " ":
            chosen = c_char
        else:
            chosen = t_char
        merged.append(str(chosen))
    return merged
99
100
if __name__ == "__main__":
    # Command-line interface: one positional argument, the tagseq report, opened for reading by argparse.
    cli = argparse.ArgumentParser(
        description="Read Garnier Secondary Structure Prediction"
    )
    cli.add_argument(
        "tagseq_file", type=argparse.FileType("r"), help="Tagseq file input"
    )
    options = vars(cli.parse_args())

    # Read the four concatenated prediction tracks first, then the sequence names and lengths.
    # garnier_sequences rewinds the file itself, so the second pass sees the whole report again.
    all_helix, all_turns, all_coil, all_sheet = garnier_secondary_struct(**options)
    names, seq_lengths = garnier_sequences(**options)

    # Cut each concatenated track back into one piece per sequence.
    helix_parts = vector_cutter(all_helix, seq_lengths)
    sheet_parts = vector_cutter(all_sheet, seq_lengths)
    turns_parts = vector_cutter(all_turns, seq_lengths)
    coil_parts = vector_cutter(all_coil, seq_lengths)

    # For every sequence, merge its four tracks into a single prediction and print the name followed by
    # the tab-separated prediction.
    per_sequence = zip(names, helix_parts, sheet_parts, turns_parts, coil_parts)
    for name, helix_part, sheet_part, turns_part, coil_part in per_sequence:
        merged = single_prediction(helix_part, sheet_part, turns_part, coil_part)
        print("Sequence Name: " + "\t" + name)
        print("\t".join(merged))