0
|
1 """
|
|
2 This program is intended to create the output table for the disruptin finder workflow
|
|
3 """
|
|
4 from Bio import SeqIO
|
|
5 from Bio.SeqUtils.ProtParam import ProteinAnalysis
|
|
6 from Bio.SeqUtils import ProtParamData
|
|
7 import csv
|
|
8 import argparse
|
|
9 import sys
|
|
10
|
|
11
|
|
12 def disruptin_table(garnier_file, fasta_file):
|
|
13 # Iterable variables
|
|
14 position = 1
|
|
15 net_charge = 0
|
|
16 charge_res = 0
|
|
17 record_number = 0
|
|
18
|
|
19 # loop structures
|
|
20 names = []
|
|
21 sec_struct = []
|
|
22
|
|
23 # reading the lines from the garnier csv file
|
|
24 # with open(garnier_file,'r') as csvfile:
|
|
25 # garnierreader = csv.reader(csvfile)
|
|
26 for row in garnier_file:
|
|
27 if row[0] == 'Sequence: ':
|
|
28 names += [row[1]]
|
|
29 elif row[0] in 'HETC':
|
|
30 row = row.split('\t')
|
|
31 sec_struct += [''.join(row)]
|
|
32
|
|
33 record = []
|
|
34 p = []
|
|
35 r = []
|
|
36 c = []
|
|
37 h = []
|
|
38 s = []
|
|
39
|
|
40 # Parse the .fasta file and get the sequence
|
|
41 for rec in SeqIO.parse(fasta_file, "fasta"):
|
|
42 sequence = str(rec.seq)
|
|
43
|
|
44 # Set up the information vectors: for position #, residue, hydrophobic/charge/polar/nonpolar, and secondary
|
|
45 # structure
|
|
46 record += [rec.id]
|
|
47 position_vec = []
|
|
48 residue_vec = []
|
|
49 charge_sym_vec = []
|
|
50 sec_struct_vec = []
|
|
51
|
|
52 for aa in sequence:
|
|
53 position_vec += [str(position)]
|
|
54 residue_vec += [str(aa)]
|
|
55 sec_struct_vec += [str(sec_struct[record_number][position - 1])]
|
|
56
|
|
57 # For R and K residues a positive charge is given
|
|
58 if aa in "RK":
|
|
59 symbol = "+"
|
|
60 # For D and E residues a negative charge is given
|
|
61 elif aa in "DE":
|
|
62 symbol = "-"
|
|
63 elif aa in "AVMILPWFG":
|
|
64 symbol = "N"
|
|
65 elif aa in "HSYTCQN":
|
|
66 symbol = "P"
|
|
67 charge_sym_vec += symbol
|
|
68 position += 1
|
|
69
|
|
70 # Calculating hyrophobicity based on Kyte and Doolittle scale. Using binning value of 9. Since the binning
|
|
71 # is 9, the first 4 residues and last 4 residues as set blank so as to center the values to their
|
|
72 # approximate position on the sequence.
|
|
73 prot_ana_seq = ProteinAnalysis(sequence)
|
|
74 hydro = [0] * 4 + prot_ana_seq.protein_scale(ProtParamData.kd, 9) + [0] * 4
|
|
75
|
|
76 record_number += 1
|
|
77 position = 1
|
|
78
|
|
79 p += [position_vec]
|
|
80 r += [residue_vec]
|
|
81 c += [charge_sym_vec]
|
|
82 h += [hydro]
|
|
83 s += [sec_struct_vec]
|
|
84
|
|
85 # returns values for name of the sequence
|
|
86 return record, p, r, c, h, s
|
|
87
|
|
88
|
|
89 if __name__ == "__main__":
|
|
90 # Grab all of the filters from our plugin loader
|
|
91 parser = argparse.ArgumentParser(description="Disruptin Table Output")
|
|
92 parser.add_argument(
|
|
93 "garnier_file", type=argparse.FileType("r"), help="csv file from garnier reader"
|
|
94 )
|
|
95 parser.add_argument(
|
|
96 "fasta_file",
|
|
97 type=argparse.FileType("r"),
|
|
98 help="fasta file of disruptin candidates",
|
|
99 )
|
|
100 args = parser.parse_args()
|
|
101
|
|
102 # Set up output location
|
|
103 # f = open(sys.stdout, 'w', newline='')
|
|
104 # writer1 = csv.writer(f)
|
|
105
|
|
106 iden, position, residue, charge, hydro, struct = disruptin_table(**vars(args))
|
|
107
|
|
108 for i in range(len(iden)):
|
|
109 # writer1.writerow(['Protein ID']+[iden[i]])
|
|
110 # writer1.writerow(['Position'] + [format(x, 's') for x in position[i]])
|
|
111 # writer1.writerow(['Residue'] + [format(x, 's') for x in residue[i]])
|
|
112 # writer1.writerow(['Charge'] + [format(x, 's') for x in charge[i]])
|
|
113 # writer1.writerow(['Hydrophobicity'] + [format(x, '.3f') for x in hydro[i]])
|
|
114 # writer1.writerow(['Secondary Structure'] + [format(x, 's') for x in struct[i]])
|
|
115 # writer1.writerow([''])
|
|
116
|
|
117 print(str(iden[i]))
|
|
118 print("Position \t " + "\t".join(position[i]))
|
|
119 print("Residue \t" + "\t".join(residue[i]))
|
|
120 print("Charge \t" + "\t".join(charge[i]))
|
|
121 print("Hydrophobicity \t" + "\t".join(format(x, ".3f") for x in hydro[i]))
|
|
122 print("Secondary Structure \t" + "\t".join(struct[i]))
|