comparison peptideGenomicCoordinate.py @ 1:561b62936e2c draft

planemo upload
author pravs
date Mon, 09 Apr 2018 16:53:05 -0400
parents
children 1b61f7aafb61
comparison
equal deleted inserted replaced
0:58c4333afe1f 1:561b62936e2c
1
2 #
3 # Author: Praveen Kumar
4 # University of Minnesota
5 #
6 # Get peptide's genomic coordinate from the protein's genomic mapping sqlite file (which is derived from the https://toolshed.g2.bx.psu.edu/view/galaxyp/translate_bed/038ecf54cbec)
7 #
8 # python peptideGenomicCoordinate.py <peptide_list> <mz_to_sqlite DB> <genomic mapping file DB> <output.bed>
9 #
10 #
11
12 def main():
13 import sys
14 import sqlite3
15 conn = sqlite3.connect(sys.argv[2])
16 c = conn.cursor()
17 c.execute("DROP table if exists novel")
18 conn.commit()
19 c.execute("CREATE TABLE novel(peptide text)")
20 pepfile = open(sys.argv[1],"r")
21
22 for seq in pepfile.readlines():
23 seq = seq.strip()
24 c.execute('INSERT INTO novel VALUES ("'+seq+'")')
25 conn.commit()
26
27 c.execute("SELECT distinct psm.sequence, ps.id, ps.sequence from db_sequence ps, psm_entries psm, novel n, proteins_by_peptide pbp where psm.sequence = n.peptide and pbp.peptide_ref = psm.id and pbp.id = ps.id")
28 rows = c.fetchall()
29
30 conn1 = sqlite3.connect(sys.argv[3])
31 c1 = conn1.cursor()
32
33 outfh = open(sys.argv[4], "w")
34
35 master_dict = {}
36 for each in rows:
37 peptide = each[0]
38 acc = each[1]
39 acc_seq = each[2]
40
41 c1.execute("SELECT chrom,start,end,name,strand,cds_start,cds_end FROM feature_cds_map map WHERE map.name = '"+acc+"'")
42 coordinates = c1.fetchall()
43
44 if len(coordinates) != 0:
45 pep_start = 0
46 pep_end = 0
47 flag = 0
48 splice_flag = 0
49 spliced_peptide = []
50 for each_entry in coordinates:
51 chromosome = each_entry[0]
52 start = int(each_entry[1])
53 end = int(each_entry[2])
54 strand = each_entry[4]
55 cds_start = int(each_entry[5])
56 cds_end = int(each_entry[6])
57 pep_pos_start = (acc_seq.find(peptide)*3)
58 pep_pos_end = pep_pos_start + (len(peptide)*3)
59 if (pep_pos_start >= cds_start) and (pep_pos_end <= cds_end):
60 if strand == "+":
61 pep_start = start + pep_pos_start - cds_start
62 pep_end = start + pep_pos_end - cds_start
63 pep_thick_start = 0
64 pep_thick_end = len(peptide)
65 flag == 1
66 else:
67 pep_end = end - pep_pos_start + cds_start
68 pep_start = end - pep_pos_end + cds_start
69 pep_thick_start = 0
70 pep_thick_end = len(peptide)
71 flag == 1
72 spliced_peptide = []
73 splice_flag = 0
74 else:
75 if flag == 0:
76 if strand == "+":
77 if (pep_pos_start >= cds_start) and (pep_pos_start <= cds_end) and (pep_pos_end > cds_end):
78 pep_start = start + pep_pos_start - cds_start
79 pep_end = end
80 pep_thick_start = 0
81 pep_thick_end = (pep_end-pep_start)
82 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
83 splice_flag = splice_flag + 1
84 if splice_flag == 2:
85 flag = 1
86 elif (pep_pos_end >= cds_start) and (pep_pos_end <= cds_end) and (pep_pos_start < cds_start):
87 pep_start = start
88 pep_end = start + pep_pos_end - cds_start
89 pep_thick_start = (len(peptide)*3)-(pep_end-pep_start)
90 pep_thick_end = (len(peptide)*3)
91 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
92 splice_flag = splice_flag + 1
93 if splice_flag == 2:
94 flag = 1
95 else:
96 pass
97 else:
98 if (pep_pos_start >= cds_start) and (pep_pos_start <= cds_end) and (pep_pos_end >= cds_end):
99 pep_start = start
100 pep_end = end - pep_pos_start - cds_start
101 pep_thick_start = 0
102 pep_thick_end = (pep_end-pep_start)
103 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
104 splice_flag = splice_flag + 1
105 if splice_flag == 2:
106 flag = 1
107 elif (pep_pos_end >= cds_start) and (pep_pos_end <= cds_end) and (pep_pos_start <= cds_start):
108 pep_start = end - pep_pos_end + cds_start
109 pep_end = end
110 pep_thick_start = (len(peptide)*3)-(pep_end-pep_start)
111 pep_thick_end = (len(peptide)*3)
112 spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end])
113 splice_flag = splice_flag + 1
114 if splice_flag == 2:
115 flag = 1
116 else:
117 pass
118 if len(spliced_peptide) == 0:
119 if strand == "+":
120 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"]
121 else:
122 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"]
123 outfh.write("\t".join(bed_line)+"\n")
124 else:
125 if strand == "+":
126 pep_entry = spliced_peptide
127 pep_start = min([pep_entry[0][0], pep_entry[1][0]])
128 pep_end = max([pep_entry[0][1], pep_entry[1][1]])
129 blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))]
130 blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))]
131 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)]
132 outfh.write("\t".join(bed_line)+"\n")
133 else:
134 pep_entry = spliced_peptide
135 pep_start = min([pep_entry[0][0], pep_entry[1][0]])
136 pep_end = max([pep_entry[0][1], pep_entry[1][1]])
137 blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))]
138 blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))]
139 bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)]
140 outfh.write("\t".join(bed_line)+"\n")
141 c.execute("DROP table novel")
142 conn.commit()
143 conn.close()
144 conn1.close()
145 outfh.close()
146 pepfile.close()
147
148 return None
149 if __name__ == "__main__":
150 main()