1
|
1
|
|
2 #
|
|
3 # Author: Praveen Kumar
|
|
4 # University of Minnesota
|
|
5 #
|
|
6 # Get peptide's genomic coordinate from the protein's genomic mapping sqlite file (which is derived from the https://toolshed.g2.bx.psu.edu/view/galaxyp/translate_bed/038ecf54cbec)
|
|
7 #
|
|
8 # python peptideGenomicCoordinate.py <peptide_list> <mz_to_sqlite DB> <genomic mapping file DB> <output.bed>
|
|
9 #
|
|
10 #
|
|
11
|
|
12 def main():
|
|
13 import sys
|
|
14 import sqlite3
|
|
15 conn = sqlite3.connect(sys.argv[2])
|
|
16 c = conn.cursor()
|
|
17 c.execute("DROP table if exists novel")
|
|
18 conn.commit()
|
|
19 c.execute("CREATE TABLE novel(peptide text)")
|
|
20 pepfile = open(sys.argv[1],"r")
|
|
21
|
|
22 for seq in pepfile.readlines():
|
|
23 seq = seq.strip()
|
|
24 c.execute('INSERT INTO novel VALUES ("'+seq+'")')
|
|
25 conn.commit()
|
|
26
|
|
27 c.execute("SELECT distinct psm.sequence, ps.id, ps.sequence from db_sequence ps, psm_entries psm, novel n, proteins_by_peptide pbp where psm.sequence = n.peptide and pbp.peptide_ref = psm.id and pbp.id = ps.id")
|
|
28 rows = c.fetchall()
|
|
29
|
|
30 conn1 = sqlite3.connect(sys.argv[3])
|
|
31 c1 = conn1.cursor()
|
|
32
|
|
33 outfh = open(sys.argv[4], "w")
|
|
34
|
|
35 master_dict = {}
|
|
36 for each in rows:
|
|
37 peptide = each[0]
|
|
38 acc = each[1]
|
|
39 acc_seq = each[2]
|
|
40
|
|
41 c1.execute("SELECT chrom,start,end,name,strand,cds_start,cds_end FROM feature_cds_map map WHERE map.name = '"+acc+"'")
|
|
42 coordinates = c1.fetchall()
|
|
43
|
|
44 if len(coordinates) != 0:
|
|
45 pep_start = 0
|
|
46 pep_end = 0
|
|
47 flag = 0
|
|
48 splice_flag = 0
|
|
49 spliced_peptide = []
|
|
50 for each_entry in coordinates:
|
|
51 chromosome = each_entry[0]
|
|
52 start = int(each_entry[1])
|
|
53 end = int(each_entry[2])
|
|
54 strand = each_entry[4]
|
|
55 cds_start = int(each_entry[5])
|
|
56 cds_end = int(each_entry[6])
|
|
57 pep_pos_start = (acc_seq.find(peptide)*3)
|
|
58 pep_pos_end = pep_pos_start + (len(peptide)*3)
|
|
59 if (pep_pos_start >= cds_start) and (pep_pos_end <= cds_end):
|
|
60 if strand == "+":
|
|
61 pep_start = start + pep_pos_start - cds_start
|
|
62 pep_end = start + pep_pos_end - cds_start
|
|
63 pep_thick_start = 0
|
|
64 pep_thick_end = len(peptide)
|
|
65 flag == 1
|
|
66 else:
|
|
67 pep_end = end - pep_pos_start + cds_start
|
|
68 pep_start = end - pep_pos_end + cds_start
|
|
69 pep_thick_start = 0
|
|
70 pep_thick_end = len(peptide)
|
|
71 flag == 1
|
|
72 spliced_peptide = []
|
|
73 splice_flag = 0
|
|
74 else:
|
|
75 if flag == 0:
|
|
76 if strand == "+":
|
|
77 if (pep_pos_start >= cds_start) and (pep_pos_start <= cds_end) and (pep_pos_end > cds_end):
|
|
78 pep_start = start + pep_pos_start - cds_start
|
|
79 pep_end = end
|
|
80 pep_thick_start = 0
|
|
81 pep_thick_end = (pep_end-pep_start)
|
|
82 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
|
|
83 splice_flag = splice_flag + 1
|
|
84 if splice_flag == 2:
|
|
85 flag = 1
|
|
86 elif (pep_pos_end >= cds_start) and (pep_pos_end <= cds_end) and (pep_pos_start < cds_start):
|
|
87 pep_start = start
|
|
88 pep_end = start + pep_pos_end - cds_start
|
|
89 pep_thick_start = (len(peptide)*3)-(pep_end-pep_start)
|
|
90 pep_thick_end = (len(peptide)*3)
|
|
91 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
|
|
92 splice_flag = splice_flag + 1
|
|
93 if splice_flag == 2:
|
|
94 flag = 1
|
|
95 else:
|
|
96 pass
|
|
97 else:
|
|
98 if (pep_pos_start >= cds_start) and (pep_pos_start <= cds_end) and (pep_pos_end >= cds_end):
|
|
99 pep_start = start
|
|
100 pep_end = end - pep_pos_start - cds_start
|
|
101 pep_thick_start = 0
|
|
102 pep_thick_end = (pep_end-pep_start)
|
|
103 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
|
|
104 splice_flag = splice_flag + 1
|
|
105 if splice_flag == 2:
|
|
106 flag = 1
|
|
107 elif (pep_pos_end >= cds_start) and (pep_pos_end <= cds_end) and (pep_pos_start <= cds_start):
|
|
108 pep_start = end - pep_pos_end + cds_start
|
|
109 pep_end = end
|
|
110 pep_thick_start = (len(peptide)*3)-(pep_end-pep_start)
|
|
111 pep_thick_end = (len(peptide)*3)
|
|
112 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
|
|
113 splice_flag = splice_flag + 1
|
|
114 if splice_flag == 2:
|
|
115 flag = 1
|
|
116 else:
|
|
117 pass
|
|
118 if len(spliced_peptide) == 0:
|
|
119 if strand == "+":
|
2
|
120 bed_line = [str(chromosome), str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"]
|
1
|
121 else:
|
2
|
122 bed_line = [str(chromosome), str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"]
|
1
|
123 outfh.write("\t".join(bed_line)+"\n")
|
|
124 else:
|
|
125 if strand == "+":
|
|
126 pep_entry = spliced_peptide
|
|
127 pep_start = min([pep_entry[0][0], pep_entry[1][0]])
|
|
128 pep_end = max([pep_entry[0][1], pep_entry[1][1]])
|
|
129 blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))]
|
|
130 blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))]
|
2
|
131 bed_line = [str(chromosome), str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)]
|
1
|
132 outfh.write("\t".join(bed_line)+"\n")
|
|
133 else:
|
|
134 pep_entry = spliced_peptide
|
|
135 pep_start = min([pep_entry[0][0], pep_entry[1][0]])
|
|
136 pep_end = max([pep_entry[0][1], pep_entry[1][1]])
|
|
137 blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))]
|
|
138 blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))]
|
2
|
139 bed_line = [str(chromosome), str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)]
|
1
|
140 outfh.write("\t".join(bed_line)+"\n")
|
|
141 c.execute("DROP table novel")
|
|
142 conn.commit()
|
|
143 conn.close()
|
|
144 conn1.close()
|
|
145 outfh.close()
|
|
146 pepfile.close()
|
|
147
|
|
148 return None
|
|
149 if __name__ == "__main__":
|
|
150 main()
|