annotate peptide_genomic_coordinate.py @ 1:cb0378d2d487 draft default tip

"planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
author galaxyp
date Sun, 14 Mar 2021 03:01:11 +0000
parents 5f49ffce52cb
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
5f49ffce52cb planemo upload commit be7e9677908b7864ef0b965a1e219a1840eeb2ec
galaxyp
parents:
diff changeset
1 #!/usr/bin/env python
1
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
2 #
0
5f49ffce52cb planemo upload commit be7e9677908b7864ef0b965a1e219a1840eeb2ec
galaxyp
parents:
diff changeset
3 # Author: Praveen Kumar
5f49ffce52cb planemo upload commit be7e9677908b7864ef0b965a1e219a1840eeb2ec
galaxyp
parents:
diff changeset
4 # University of Minnesota
5f49ffce52cb planemo upload commit be7e9677908b7864ef0b965a1e219a1840eeb2ec
galaxyp
parents:
diff changeset
5 #
1
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
6 # Get peptide's genomic coordinate from the protein's genomic mapping sqlite file
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
7 # (which is derived from the https://toolshed.g2.bx.psu.edu/view/galaxyp/translate_bed/038ecf54cbec)
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
8 #
0
5f49ffce52cb planemo upload commit be7e9677908b7864ef0b965a1e219a1840eeb2ec
galaxyp
parents:
diff changeset
9 # python peptide_genomic_coordinate.py [-a] [-d] <peptide_list> <mz_to_sqlite DB> <genomic mapping file DB> <output.bed>
1
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
10 #
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
11 import argparse
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
12 import sqlite3
0
5f49ffce52cb planemo upload commit be7e9677908b7864ef0b965a1e219a1840eeb2ec
galaxyp
parents:
diff changeset
13 import sys
1
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
14
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
15
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
16 pep_stmt = """\
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
17 SELECT dBSequence_ref, start, end, peptide_ref \
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
18 FROM peptide_evidence e JOIN peptides p on e.peptide_ref = p.id \
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
19 WHERE isDecoy = 'false' AND p.sequence = ?\
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
20 """
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
21
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
22 map_stmt = """
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
23 SELECT name, chrom, start, end, strand, cds_start, cds_end \
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
24 FROM feature_cds_map \
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
25 WHERE name = ? \
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
26 AND cds_end >= ? AND cds_start <= ? \
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
27 ORDER BY cds_start\
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
28 """
0
5f49ffce52cb planemo upload commit be7e9677908b7864ef0b965a1e219a1840eeb2ec
galaxyp
parents:
diff changeset
29
5f49ffce52cb planemo upload commit be7e9677908b7864ef0b965a1e219a1840eeb2ec
galaxyp
parents:
diff changeset
30
5f49ffce52cb planemo upload commit be7e9677908b7864ef0b965a1e219a1840eeb2ec
galaxyp
parents:
diff changeset
31 def main():
1
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
32 parser = argparse.ArgumentParser(description='BED file for peptides')
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
33 parser.add_argument('peptides_file',
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
34 metavar='peptides.tabular',
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
35 type=argparse.FileType('r'),
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
36 help='List of peptides, one per line')
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
37 parser.add_argument('mz_to_sqlite',
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
38 metavar='mz.sqlite',
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
39 help='mz_to_sqlite sqlite database')
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
40 parser.add_argument('genome_mapping',
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
41 metavar='genome_mapping.sqlite',
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
42 help='genome_mapping sqlite database')
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
43 parser.add_argument('bed_file',
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
44 metavar='peptides.bed',
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
45 type=argparse.FileType('w'),
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
46 help='BED file of peptide genomic locations')
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
47 parser.add_argument('-a', '--accession',
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
48 action='store_true',
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
49 help='Append the accession to the peptide for BED name')
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
50 parser.add_argument('-d', '--debug',
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
51 action='store_true',
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
52 help='Debug')
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
53 args = parser.parse_args()
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
54
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
55 pconn = sqlite3.connect(args.mz_to_sqlite)
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
56 pc = pconn.cursor()
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
57 mconn = sqlite3.connect(args.genome_mapping)
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
58 mc = mconn.cursor()
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
59 outfh = args.bed_file
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
60 pepfile = args.peptides_file
0
5f49ffce52cb planemo upload commit be7e9677908b7864ef0b965a1e219a1840eeb2ec
galaxyp
parents:
diff changeset
61 for seq in pepfile.readlines():
5f49ffce52cb planemo upload commit be7e9677908b7864ef0b965a1e219a1840eeb2ec
galaxyp
parents:
diff changeset
62 seq = seq.strip()
1
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
63 pc.execute(pep_stmt, (seq,))
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
64 pep_refs = pc.fetchall()
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
65 for pep_ref in pep_refs:
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
66 (acc, pep_start, pep_end, pep_seq) = pep_ref
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
67 cds_start = (pep_start - 1) * 3
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
68 cds_end = pep_end * 3
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
69 if args.debug:
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
70 print('%s\t%s\t%s\t%d\t%d' % (acc, pep_start, pep_end, cds_start, cds_end), file=sys.stdout)
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
71 mc.execute(map_stmt, (acc, cds_start, cds_end))
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
72 exons = mc.fetchall()
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
73 if args.debug:
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
74 print('\n'.join([str(e) for e in exons]), file=sys.stdout)
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
75 if exons:
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
76 chrom = exons[0][1]
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
77 strand = exons[0][4]
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
78 if strand == '+':
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
79 start = exons[0][2] + cds_start
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
80 end = exons[-1][2] + cds_end - exons[-1][5]
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
81 blk_start = []
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
82 blk_size = []
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
83 for exon in exons:
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
84 offset = cds_start if cds_start > exon[5] else 0
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
85 bstart = exon[2] + offset
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
86 bsize = min(cds_end, exon[6]) - max(cds_start, exon[5])
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
87 if args.debug:
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
88 print('bstart %d\tbsize %d\t %d' % (bstart, bsize, offset), file=sys.stdout)
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
89 blk_start.append(bstart - start)
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
90 blk_size.append(bsize)
0
5f49ffce52cb planemo upload commit be7e9677908b7864ef0b965a1e219a1840eeb2ec
galaxyp
parents:
diff changeset
91 else:
1
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
92 start = exons[-1][2] + exons[-1][6] - cds_end
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
93 end = exons[0][3] - cds_start + exons[0][5]
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
94 blk_start = []
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
95 blk_size = []
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
96 for exon in reversed(exons):
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
97 bstart = exon[2] + exon[6] - min(exon[6], cds_end)
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
98 bsize = min(cds_end, exon[6]) - max(cds_start, exon[5])
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
99 # bend = exon[3] - (exon[5] - max(exon[5], cds_start))
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
100 bend = exon[3] - min(cds_start - exon[5], cds_start)
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
101 bend = exon[3] - bsize
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
102 if args.debug:
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
103 print('bstart %d\tbsize %d\tbend %d' % (bstart, bsize, bend), file=sys.stdout)
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
104 blk_start.append(bstart - start)
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
105 blk_size.append(bsize)
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
106 bed_line = [str(chrom), str(start), str(end),
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
107 '_'.join([seq, acc]) if args.accession else seq,
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
108 '255', strand,
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
109 str(start), str(end),
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
110 '0,0,0',
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
111 str(len(blk_start)),
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
112 ','.join([str(b) for b in blk_size]),
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
113 ','.join([str(b) for b in blk_start])]
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
114 if args.debug:
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
115 print('\t'.join(bed_line), file=sys.stdout)
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
116 outfh.write('\t'.join(bed_line) + '\n')
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
117 pconn.close()
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
118 mconn.close()
0
5f49ffce52cb planemo upload commit be7e9677908b7864ef0b965a1e219a1840eeb2ec
galaxyp
parents:
diff changeset
119 outfh.close()
5f49ffce52cb planemo upload commit be7e9677908b7864ef0b965a1e219a1840eeb2ec
galaxyp
parents:
diff changeset
120 pepfile.close()
1
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
121
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
122
cb0378d2d487 "planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
galaxyp
parents: 0
diff changeset
123 if __name__ == '__main__':
0
5f49ffce52cb planemo upload commit be7e9677908b7864ef0b965a1e219a1840eeb2ec
galaxyp
parents:
diff changeset
124 main()