Mercurial > repos > cpt > cpt_read_garnier
comparison reading_garnier_output.py @ 1:edd518e72c89 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:52:09 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:0d2226e1c5f6 | 1:edd518e72c89 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import csv | |
4 import argparse | |
5 | |
6 # import sys | |
7 | |
8 # This function reads through the tagseq file and outputs a list of sequence names and the lengths of each sequence. | |
def garnier_sequences(tagseq_file=None):
    """Collect the sequence names and lengths listed in a tagseq report.

    Every line that starts with ``# Sequence:`` contributes one entry: the
    token following ``Sequence:`` is the name and the integer following
    ``to:`` is used as the length.

    Args:
        tagseq_file: Open, seekable text file object. It is rewound to the
            start before reading.

    Returns:
        A ``(names, lengths)`` tuple of two parallel lists.

    Raises:
        ValueError: If a header line lacks a standalone ``Sequence:`` or
            ``to:`` token, or the token after ``to:`` is not an integer.
        IndexError: If nothing follows ``Sequence:`` or ``to:``.
    """
    # Rewind so the function works even if the file was already consumed.
    tagseq_file.seek(0)
    sequence = []
    lengths = []

    for line in tagseq_file:
        # Only the per-sequence header lines carry a name and a length.
        if not line.startswith("# Sequence:"):
            continue
        words = line.split()
        name = words[words.index("Sequence:") + 1]
        # list.index() raises instead of returning a falsy value, so test
        # membership explicitly and fail with a message naming the line.
        if "to:" not in words:
            raise ValueError(
                "No 'to:' field in sequence header: {!r}".format(line.rstrip())
            )
        length = int(words[words.index("to:") + 1])
        # Append both values together so the two lists stay aligned.
        sequence.append(name)
        lengths.append(length)

    return sequence, lengths
30 | |
31 | |
32 # This function extracts the helix, sheet, turn, and coil predictions from the file. The predictions for each type of | |
33 # secondary structure are joined together in one string. | |
def garnier_secondary_struct(tagseq_file=None):
    """Extract the helix, turns, coil and sheet prediction rows from a report.

    A line whose first whitespace-separated token is exactly ``helix``,
    ``sheet``, ``turns`` or ``coil`` is a prediction row. Everything after
    the first six characters of such a line (minus the trailing newline) is
    appended to the string for that structure type, so each returned string
    is the concatenation of all rows of its type in file order.

    Args:
        tagseq_file: Open text file object, read from its current position
            (this function does not rewind it).

    Returns:
        A ``(helix, turns, coil, sheet)`` tuple of strings -- note the order.
    """
    # One list of row fragments per label; joined once at the end.
    tracks = {"helix": [], "sheet": [], "turns": [], "coil": []}

    for line in tagseq_file:
        words = line.split()
        if not words:
            continue
        # Exact label match: a substring test such as `word in "helix"`
        # would also accept fragments like "e" or "lix" from other lines.
        fragments = tracks.get(words[0])
        if fragments is not None:
            # Drop the six-character label column
            # (presumably "helix " / " coil " -- confirm against the format).
            fragments.append(line[6:].rstrip("\n"))

    return (
        "".join(tracks["helix"]),
        "".join(tracks["turns"]),
        "".join(tracks["coil"]),
        "".join(tracks["sheet"]),
    )
58 | |
59 | |
60 # This function cuts the strings based on the lengths of the original sequences. Lengths are given in a list. | |
def vector_cutter(vector, lengths_to_cut):
    """Split ``vector`` into consecutive pieces of the given lengths.

    The first piece covers ``vector[0:lengths_to_cut[0]]``, the next piece
    starts where the previous one ended, and so on. Slicing past the end of
    ``vector`` yields shorter (possibly empty) pieces rather than an error.

    Args:
        vector: Sliceable sequence (a string in this script).
        lengths_to_cut: Iterable of piece lengths, one per output piece.

    Returns:
        A list with one ``str`` per entry in ``lengths_to_cut``; an empty
        list when no lengths are given.
    """
    output = []
    start = 0
    for length in lengths_to_cut:
        # Each piece begins where the previous one stopped.
        end = start + length
        output.append(str(vector[start:end]))
        start = end
    return output
77 | |
78 | |
79 # this function takes the helix, turn, sheet, and coil predictions for each sequence and creates a single structural | |
80 # prediction string. | |
def single_prediction(helix, sheet, turns, coil):
    """Merge four per-position prediction strings into one list of symbols.

    At each position the first non-blank character is chosen, checking
    helix, then sheet, then coil; if all three are blank the turns
    character is used (whatever it is, including a blank).

    Args:
        helix, sheet, turns, coil: Equal-length sequences of single
            characters, using ``" "`` for "no prediction".

    Returns:
        A list with one string per position, or an empty list when the four
        inputs do not all have the same length.
    """
    # Inputs of differing lengths cannot be merged position by position.
    if not (len(helix) == len(sheet) == len(coil) == len(turns)):
        return []

    merged = []
    for h_char, s_char, t_char, c_char in zip(helix, sheet, turns, coil):
        if h_char != " ":
            chosen = h_char
        elif s_char != " ":
            chosen = s_char
        elif c_char != " ":
            chosen = c_char
        else:
            chosen = t_char
        merged.append(str(chosen))
    return merged
99 | |
100 | |
if __name__ == "__main__":
    # Command-line interface: one positional argument, the tagseq report,
    # which argparse opens for reading.
    cli = argparse.ArgumentParser(
        description="Read Garnier Secondary Structure Prediction"
    )
    cli.add_argument(
        "tagseq_file", type=argparse.FileType("r"), help="Tagseq file input"
    )
    args = cli.parse_args()
    options = vars(args)

    # Read the prediction rows first, then the sequence names and lengths.
    # garnier_sequences rewinds the file itself, so this order is safe.
    helix_all, turns_all, coil_all, sheet_all = garnier_secondary_struct(**options)
    names, seq_lengths = garnier_sequences(**options)

    # Cut each concatenated prediction string into one piece per sequence.
    helix_parts, sheet_parts, turns_parts, coil_parts = (
        vector_cutter(track, seq_lengths)
        for track in (helix_all, sheet_all, turns_all, coil_all)
    )

    # For each sequence, merge the four predictions into a single one and
    # print the name followed by the tab-separated prediction.
    for idx, helix_part in enumerate(helix_parts):
        merged = single_prediction(
            helix_part, sheet_parts[idx], turns_parts[idx], coil_parts[idx]
        )
        print("Sequence Name: " + "\t" + names[idx])
        print("\t".join(merged))