annotate cpt_disruptin_table/Disruptin_hydrophobicity_helicity_table_package.py @ 0:f3fc78cc4c43 draft

Uploaded
author cpt
date Fri, 17 Jun 2022 12:33:22 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f3fc78cc4c43 Uploaded
cpt
parents:
diff changeset
1 """
f3fc78cc4c43 Uploaded
cpt
parents:
diff changeset
2 This program is intended to create the output table for the disruptin finder workflow
f3fc78cc4c43 Uploaded
cpt
parents:
diff changeset
3 """
f3fc78cc4c43 Uploaded
cpt
parents:
diff changeset
4 from Bio import SeqIO
f3fc78cc4c43 Uploaded
cpt
parents:
diff changeset
5 from Bio.SeqUtils.ProtParam import ProteinAnalysis
f3fc78cc4c43 Uploaded
cpt
parents:
diff changeset
6 from Bio.SeqUtils import ProtParamData
f3fc78cc4c43 Uploaded
cpt
parents:
diff changeset
7 import csv
f3fc78cc4c43 Uploaded
cpt
parents:
diff changeset
8 import argparse
f3fc78cc4c43 Uploaded
cpt
parents:
diff changeset
9 import sys
f3fc78cc4c43 Uploaded
cpt
parents:
diff changeset
10
f3fc78cc4c43 Uploaded
cpt
parents:
diff changeset
11
f3fc78cc4c43 Uploaded
cpt
parents:
diff changeset
def disruptin_table(garnier_file, fasta_file):
    """Build the per-residue rows of the disruptin finder output table.

    Args:
        garnier_file: Iterable of text lines (for example an open file handle)
            with the garnier secondary-structure predictions. Every line whose
            first character is ``H``, ``E``, ``T`` or ``C`` is kept, with its
            tab characters removed, as the structure string of the next
            sequence; all other lines are ignored.
        fasta_file: Handle (or path) passed to ``SeqIO.parse(..., "fasta")``.

    Returns:
        A 6-tuple ``(record, p, r, c, h, s)`` of parallel lists holding one
        entry per FASTA record, in file order:

        * ``record`` - the record id.
        * ``p`` - 1-based residue positions, as strings.
        * ``r`` - the residues, one character each.
        * ``c`` - one class symbol per residue: ``+`` for R/K, ``-`` for D/E,
          ``N`` for A/V/M/I/L/P/W/F/G, ``P`` for H/S/Y/T/C/Q/N and ``?`` for
          anything else.
        * ``h`` - Kyte-Doolittle hydrophobicity (window of 9), one value per
          residue; positions without a full window are 0.
        * ``s`` - the secondary-structure character for each residue.

    Raises:
        IndexError: If the garnier input has fewer structure lines than the
            FASTA file has records, or a structure line is shorter than its
            sequence.
    """
    # Placeholder for residues outside the four known classes (X, B, Z, U,
    # "*", ...). Without it the symbol of the previous residue would be
    # reused, or the name would be unbound on the first residue.
    unknown_symbol = "?"

    symbol_for = {}
    for letters, symbol in (
        ("RK", "+"),
        ("DE", "-"),
        ("AVMILPWFG", "N"),
        ("HSYTCQN", "P"),
    ):
        for letter in letters:
            symbol_for[letter] = symbol

    # One structure string per sequence, in the order they appear.
    # NOTE(review): the line terminator is kept at the end of each string, as
    # before; it is only ever read if a sequence is longer than its prediction.
    sec_struct = [
        "".join(row.split("\t"))
        for row in garnier_file
        if row.startswith(("H", "E", "T", "C"))
    ]

    record = []
    p = []
    r = []
    c = []
    h = []
    s = []

    for record_number, rec in enumerate(SeqIO.parse(fasta_file, "fasta")):
        sequence = str(rec.seq)

        position_vec = []
        residue_vec = []
        charge_sym_vec = []
        sec_struct_vec = []

        for index, aa in enumerate(sequence):
            position_vec.append(str(index + 1))
            residue_vec.append(str(aa))
            sec_struct_vec.append(str(sec_struct[record_number][index]))
            # Classification is case-insensitive so lower-case FASTA residues
            # are not reported as unknown.
            charge_sym_vec.append(symbol_for.get(aa.upper(), unknown_symbol))

        # Hydrophobicity on the Kyte and Doolittle scale with a window of 9.
        # Four zeros on each side centre every window value on its middle
        # residue. The slice keeps the row as long as the sequence when the
        # sequence is shorter than the padding itself (fewer than 8 residues).
        window_scores = ProteinAnalysis(sequence).protein_scale(ProtParamData.kd, 9)
        hydro = ([0] * 4 + window_scores + [0] * 4)[: len(sequence)]

        record.append(rec.id)
        p.append(position_vec)
        r.append(residue_vec)
        c.append(charge_sym_vec)
        h.append(hydro)
        s.append(sec_struct_vec)

    return record, p, r, c, h, s
f3fc78cc4c43 Uploaded
cpt
parents:
diff changeset
87
f3fc78cc4c43 Uploaded
cpt
parents:
diff changeset
88
f3fc78cc4c43 Uploaded
cpt
parents:
diff changeset
if __name__ == "__main__":
    # Command-line interface: two positional, readable input files.
    cli = argparse.ArgumentParser(description="Disruptin Table Output")
    cli.add_argument(
        "garnier_file", type=argparse.FileType("r"), help="csv file from garnier reader"
    )
    cli.add_argument(
        "fasta_file",
        type=argparse.FileType("r"),
        help="fasta file of disruptin candidates",
    )
    options = cli.parse_args()

    table = disruptin_table(
        garnier_file=options.garnier_file, fasta_file=options.fasta_file
    )

    # The table is written to standard output: for every protein its id on one
    # line, followed by five tab-separated rows.
    for ident, pos_row, res_row, chg_row, hyd_row, sec_row in zip(*table):
        labelled_rows = (
            ("Position \t ", pos_row),
            ("Residue \t", res_row),
            ("Charge \t", chg_row),
            ("Hydrophobicity \t", [format(value, ".3f") for value in hyd_row]),
            ("Secondary Structure \t", sec_row),
        )
        print(str(ident))
        for label, cells in labelled_rows:
            print(label + "\t".join(cells))