# cpt_sar_finder/file_operations.py
# Recovered from a Mercurial annotate view @ 0:9f62910edcc9 (draft), commit message "Uploaded".
# Author: cpt; date: Fri, 17 Jun 2022 13:15:55 +0000

def fasta_from_SAR_dict(sar_dict, fa_file):
    """Write every candidate in the SAR dictionary to a multi-FASTA file.

    Args:
        sar_dict: Mapping whose values are record dicts; each record must
            provide a "description" (used as the FASTA header text) and a
            "sequence" entry.
        fa_file: Writable text file object. It is used as a context
            manager, so a regular file object is closed on return.
    """
    with fa_file as f:
        for data in sar_dict.values():
            # write() rather than writelines(): the argument is one string,
            # and writelines() would iterate it character by character.
            f.write(f">{data['description']}\n")
            f.write(f"{data['sequence']}\n")
def gff3_from_SAR_dict(sar_dict, gff3_file):
    """Write the biggest SAR hit of every candidate as GFF3-style rows.

    A tab-separated column-title line is always written first. For each
    candidate, a "##gff-version 3" / "##sequence-region <name>" pair is
    written, followed by three feature rows (N-terminal topological
    domain, SAR domain, C-terminal topological domain). When ``sar_dict``
    is empty only the two pragma lines are written after the title line.

    Args:
        sar_dict: Mapping of sequence name to record dict. Each record must
            contain "biggest_sar" and a matching "TMD_<biggest_sar>" list.
            Only the first match of that list is reported; in it, index 2
            is the N-terminal net charge, index 3 an iterable of
            (residue, percent) pairs, and indices 4, 5 and 6 the "a..b"
            coordinate strings of the N-terminal, SAR and C-terminal parts.
        gff3_file: Writable text file object. It is used as a context
            manager, so a regular file object is closed on return.

    NOTE(review): the title line precedes the "##gff-version 3" pragma and
    the pragma is repeated per record; the GFF3 specification expects the
    pragma once, as the first line. The layout is kept unchanged because
    downstream consumers may rely on it -- confirm before changing.
    """
    gff3_cols = ["Seqid", "Source", "Type", "Start", "End", "Score", "Strand", "Phase", "Attributes"]
    with gff3_file as f:
        f.write("\t".join(gff3_cols) + "\n")

        if not sar_dict:
            f.write("##gff-version 3\n")
            f.write("##sequence-region\n")
            return

        for name, data in sar_dict.items():
            f.write("##gff-version 3\n")
            f.write(f"##sequence-region {name}\n")

            # Only the first match recorded for the biggest SAR size is reported.
            best = data["TMD_" + str(data["biggest_sar"])][0]
            # Coordinates are stored 0-based; split_seq_string shifts them by one.
            n_start, n_end = split_seq_string(best[4])
            sar_start, sar_end = split_seq_string(best[5])
            c_start, c_end = split_seq_string(best[6])

            f.write(
                f"{name}\tSAR_finder\tTopological domain\t{n_start}\t{n_end}\t.\t.\t.\t"
                f"Note=N-terminal net charge is {best[2]}\n"
            )

            residue_percents = list(best[3])
            total_percent = round(sum(pct for _, pct in best[3]), 2)
            f.write(
                f"{name}\tSAR_finder\tSAR domain\t{sar_start}\t{sar_end}\t.\t.\t.\t"
                f"Note=residue % in SAR {residue_percents},Total % is {total_percent}\n"
            )

            f.write(
                f"{name}\tSAR_finder\tTopological domain\t{c_start}\t{c_end}\t.\t.\t.\t"
                "Note=C-terminus\n"
            )
def tab_from_SAR_dict(sar_dict, stat_file, hydrophillic_res, sar_min, sar_max):
    """Write one tab-separated row per SAR match, preceded by a title row.

    Candidate SAR sizes are visited from ``sar_max`` down to ``sar_min``
    (inclusive). For every size that has a "TMD_<size>" entry in a record,
    each match other than the placeholder ``[""]`` becomes one row. A match
    too short to index produces a row of ``ERROR`` cells instead of
    aborting the export.

    Args:
        sar_dict: Mapping of protein name to record dict holding
            "sequence", "size" and optional "TMD_<size>" match lists.
            Within a match, index 0 is the putative SAR sequence, 1 the
            N-terminal sequence, 2 the N-terminal net charge, 3 an iterable
            of (residue, percent) pairs, and 7 / 8 the SAR start / end
            (written with +1 added).
        stat_file: Writable text file object. It is used as a context
            manager, so a regular file object is closed on return.
        hydrophillic_res: Residues used for the percentage calculation;
            only used to label the percentage column ("<res>%").
        sar_min: Smallest SAR size to report.
        sar_max: Largest SAR size to report.
    """
    columns = [
        "Name",
        "Protein Sequence",
        "Protein Length",
        "SAR Length",
        "SAR Start",
        "Putative SAR Sequence",
        "SAR End",
        # Deliberately a nested list: it is rendered with its list repr,
        # e.g. "['K%', 'R%']", exactly as before.
        [f"{res}%" for res in hydrophillic_res],
        "% Total",
        "N-term Sequence",
        "N-term net Charge",
    ]
    with stat_file as f:
        f.write("\t".join(str(col) for col in columns) + "\n")

        if not sar_dict:
            return

        for name, data in sar_dict.items():
            for tmd_size in range(sar_max, sar_min - 1, -1):
                key = "TMD_" + str(tmd_size)
                if key not in data:
                    continue
                for each_match in data[key]:
                    if each_match == [""]:
                        # Placeholder for "no match at this size".
                        continue
                    try:
                        fields = [
                            name,
                            data["sequence"],
                            data["size"],
                            tmd_size,
                            int(each_match[7]) + 1,
                            each_match[0],
                            int(each_match[8]) + 1,
                            list(each_match[3]),
                            round(sum(pct for _, pct in each_match[3]), 2),
                            each_match[1],
                            each_match[2],
                        ]
                        row = "\t".join(str(value) for value in fields) + "\n"
                    except IndexError:
                        # Truncated match: keep the table rectangular.
                        row = "\t".join(["ERROR"] * len(columns)) + "\n"
                    f.write(row)
def stat_file_from_SAR_dict(sar_dict, stat_file, sar_min, sar_max):
    """Write a human-readable summary of the SAR finder results.

    After a banner line, each protein gets its description, sequence and
    length, then one section per SAR size (from ``sar_max`` down to
    ``sar_min``, inclusive) that has a "TMD_<size>" entry, listing every
    match other than the placeholder ``[""]``. A separator line closes each
    protein. If ``sar_dict`` is empty a "No candidate SAR Proteins found"
    message is written instead (without a trailing newline).

    Args:
        sar_dict: Mapping whose values are record dicts holding
            "description", "sequence", "size" and optional "TMD_<size>"
            match lists. Within a match, index 0 is the SAR domain
            sequence, 1 the N-terminal sequence, 2 the N-terminal net
            charge, 3 an iterable of (residue, percent) entries, 4 / 5 / 6
            the N-term / SAR / C-term coordinates and 7 the SAR start.
        stat_file: Writable text file object. It is used as a context
            manager, so a regular file object is closed on return.
        sar_min: Smallest SAR size to report.
        sar_max: Largest SAR size to report.
    """
    with stat_file as f:
        f.write("..........:::::: Candidate SAR Proteins ::::::..........\n\n")

        if not sar_dict:
            f.write("No candidate SAR Proteins found")
            return

        for data in sar_dict.values():
            f.write(f"Protein Description and Name: {data['description']}\n")
            f.write(f"Protein Sequence: {data['sequence']}\n")
            f.write(f"Protein Length: {data['size']}\n")
            f.write("SAR Criteria matching region(s)\n")

            for tmd_size in range(sar_max, sar_min - 1, -1):
                key = "TMD_" + str(tmd_size)
                if key not in data:
                    continue
                f.write(f"\nSAR length of {tmd_size}:\n")
                for each_match in data[key]:
                    if each_match == [""]:
                        # Placeholder for "no match at this size".
                        continue
                    f.write(f"\nPotential SAR domain sequence: {each_match[0]}\n")
                    f.write(f"N-term sequence: {each_match[1]}\n")
                    f.write(f"N-term net charge: {each_match[2]}\n")
                    for each_perc_calc in each_match[3]:
                        f.write(f"Percent {each_perc_calc[0]} content: {each_perc_calc[1]}%\n")
                    f.write(f"N-term coords: {each_match[4]}\n")
                    f.write(f"SAR coords: {each_match[5]}\n")
                    f.write(f"C-term coords: {each_match[6]}\n")
                    f.write(f"SAR start: {each_match[7]}\n")

            # One separator per protein.
            f.write("========================================================\n\n")
def split_seq_string(input_range, python_indexing=True):
    """Split a "start..end" range string into its start and end.

    Args:
        input_range: String of the form "<start>..<end>".
        python_indexing: When true (the default), both parts are converted
            to int and incremented by one. When false, the two parts are
            returned unchanged, as strings.

    Returns:
        A ``(start, end)`` tuple: ints if ``python_indexing`` is true,
        otherwise the raw substrings.

    Raises:
        IndexError: If ``input_range`` contains no ".." separator.
        ValueError: If ``python_indexing`` is true and a part is not an
            integer literal.
    """
    values = input_range.split("..")
    if python_indexing:
        return int(values[0]) + 1, int(values[1]) + 1
    return values[0], values[1]
if __name__ == "__main__":
    # Nothing is executed when this module is run directly.
    pass